# Import necessary libraries

In [None]:
import json
import nltk
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
import glob
import seaborn as sns
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jakerothstein/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Step 1: Preparing the Text Data

In [2]:
def read_json_file(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def tokenize_text(text):
    #make all text lowercase, into a single line, and tokenize into sentences
    return [re.sub(r'\n', ' ', s).lower() for s in sent_tokenize(text)]

# Define keyword lists

In [3]:
keywords = {
    'attack': {'relevant': ['cyber-', 'cyber', 'networks', 'systems', 'products', 'services', 'datacenter', 'infrastructure'],
               'irrelevant': ['terror', 'war', 'contraband', 'bombs']},
    'threat': {'relevant': ['cyber-', 'cyber', 'networks', 'systems', 'products', 'services', 'datacenter', 'infrastructure'],
               'irrelevant': ['terror', 'simulator', 'disease', 'legal action', 'competitors']},
    'computer': {'relevant': ['malware', 'virus', 'viruses', 'intrusions'],
                 'irrelevant': ['fires', 'product sales', 'warranty claim']},
    'information': {'relevant': ['malware', 'virus', 'viruses', 'intrusions'],
                    'irrelevant': ['fires', 'product sales', 'warranty claim']},
    'system': {'relevant': ['malware', 'virus', 'viruses', 'intrusions'],
               'irrelevant': ['fires', 'product sales', 'warranty claim']},
    'malicious': {'relevant': ['software', 'programs', 'third parties', 'attacks'],
                  'irrelevant': []},
    'breaches': {'relevant': [],
                 'irrelevant': ['fiduciary duty', 'fiduciary duties', 'covenant', 'credit', 'agreement']},
    'hacker': {'relevant': [], 'irrelevant': ['fiduciary', 'warranty', 'regulations', 'contract']},
    'hacking': {'relevant': [], 'irrelevant': ['fiduciary', 'warranty', 'regulations', 'contract']},
    'social engineering': {'relevant': [], 'irrelevant': ['fiduciary', 'warranty', 'regulations', 'contract']},
    'denial of service': {'relevant': [], 'irrelevant': ['fiduciary', 'warranty', 'regulations', 'contract']},
    'cyberattack': {'relevant': [], 'irrelevant': ['fiduciary', 'warranty', 'regulations', 'contract']},
    'cybersecurity': {'relevant': [], 'irrelevant': ['fiduciary', 'warranty', 'regulations', 'contract']}
}

def get_industry(sic_code):
    if 1 <= sic_code <= 999:
        return 'Agriculture, Forestry and Fishing'
    elif 1000 <= sic_code <= 1499:
        return 'Mining'
    elif 1500 <= sic_code <= 1799:
        return 'Construction'
    elif 2000 <= sic_code <= 3999:
        return 'Manufacturing'
    elif 4000 <= sic_code <= 4999:
        return 'Transportation and other Utilities'
    elif 5000 <= sic_code <= 5199:
        return 'Wholesale Trade'
    elif 5200 <= sic_code <= 5999:
        return 'Retail Trade'
    elif 6000 <= sic_code <= 6799:
        return 'Finance, Insurance and Real Estate'
    elif 7000 <= sic_code <= 8999:
        return 'Services'
    elif 9000 <= sic_code <= 9999:
        return 'Public Administration'
    else:
        return 'Unknown'


def is_relevant_sentence(sentence):
    sentence = sentence
    for keyword, conditions in keywords.items():
        if keyword in sentence:
            if not conditions['relevant'] and not conditions['irrelevant']:
                return True
            for rel in conditions['relevant']:
                if rel in sentence:
                    return True
            for irr in conditions['irrelevant']:
                if irr in sentence:
                    return False
    return False

def extract_relevant_sentences(sentences):
    return [s for s in sentences if is_relevant_sentence(s)]

# Step 3: Constructing a Measure of Cybersecurity Risk

In [4]:
def calculate_cybersecurity_risk(relevant_sentences, total_sentences):
    # Option 1: Ratio of relevant sentences to total sentences
    return len(relevant_sentences) / len(total_sentences)

# Step 4: Descriptive Analysis


In [5]:
def plot_risk_distribution(df):
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='industry', y='risk_measure', data=df)
    plt.xticks(rotation=45, ha='right')
    plt.title('Distribution of Cybersecurity Risk Measure by Industry')
    plt.tight_layout()
    plt.savefig('risk_distribution_by_industry.png')
    plt.close()

def plot_risk_over_time(df):
    plt.figure(figsize=(12, 6))
    for industry in df['industry'].unique():
        industry_data = df[df['industry'] == industry]
        yearly_stats = industry_data.groupby('year')['risk_measure'].agg(['mean', 'std'])
        plt.errorbar(yearly_stats.index, yearly_stats['mean'], yerr=yearly_stats['std'], label=industry, capsize=5)
    
    plt.xlabel('Year')
    plt.ylabel('Cybersecurity Risk Measure')
    plt.title('Cybersecurity Risk Measure by Industry (2015-2023)')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig('cybersecurity_risk_over_time.png')
    plt.close()

def plot_heatmap(df):
    pivot_df = df.pivot_table(values='risk_measure', index='industry', columns='year', aggfunc='mean')
    plt.figure(figsize=(12, 8))
    sns.heatmap(pivot_df, annot=True, cmap='YlOrRd', fmt='.4f')
    plt.title('Heatmap of Cybersecurity Risk Measure by Industry and Year')
    plt.tight_layout()
    plt.savefig('risk_heatmap.png')
    plt.close()



# Step 5. Data Execution

In [6]:
def extraction(base_path='data/10K_item1a_PERMNO_2015_sic_tic'):
    all_data = []
    for file_path in glob.glob(os.path.join(base_path, '*.json')):
        data = read_json_file(file_path)
        sentences = tokenize_text(data['item_1A'])
        relevant_sentences = extract_relevant_sentences(sentences)
        risk_measure = calculate_cybersecurity_risk(relevant_sentences, sentences)
        industry = get_industry(data['SIC'])
        all_data.append({
            'year': data['filing_date'][:4],
            'company': data['company'],
            'industry': industry,
            'risk_measure': risk_measure
        })
        #write relevant sentences to file
        with open(file_path.replace('.json', '_relevant.txt'), 'w') as file:
            file.write('\n'.join(relevant_sentences))
    df = pd.DataFrame(all_data)
    df.to_csv('data/cybersecurity_risk.csv', index=False)

# Main Execution

In [7]:
import numpy as np

def compute_descriptive_stats(df):
    stats = df.groupby('industry')['risk_measure'].agg([
        'count', 'mean', 'std', 'min', 'max',
        lambda x: np.percentile(x, 1),
        lambda x: np.percentile(x, 5),
        lambda x: np.percentile(x, 25),
        lambda x: np.percentile(x, 50),
        lambda x: np.percentile(x, 75),
        lambda x: np.percentile(x, 95),
        lambda x: np.percentile(x, 99),
        lambda x: x.skew(),
        lambda x: x.kurtosis()
    ]).rename(columns={
        'count': 'N',
        '<lambda_0>': '1%',
        '<lambda_1>': '5%',
        '<lambda_2>': '25%',
        '<lambda_3>': '50%',
        '<lambda_4>': '75%',
        '<lambda_5>': '95%',
        '<lambda_6>': '99%',
        '<lambda_7>': 'skewness',
        '<lambda_8>': 'kurtosis'
    })
    return stats

def main():
    extraction()
    df = pd.read_csv('data/cybersecurity_risk.csv')
    
    # Compute descriptive statistics
    stats = compute_descriptive_stats(df)
    print("Descriptive Statistics:")
    print(stats)
    
    # Plot risk distribution
    plot_risk_distribution(df)
    
    # Plot risk measure over time
    plot_risk_over_time(df)
    
    # Plot heatmap
    plot_heatmap(df)
    
    print("Analysis complete. Check the generated PNG files for the plots.")

if __name__ == "__main__":
    main()
    # extraction()



Descriptive Statistics:
                                        N      mean       std  min       max  \
industry                                                                       
Agriculture, Forestry and Fishing      55  0.009914  0.007135  0.0  0.034286   
Construction                          394  0.007577  0.007033  0.0  0.033019   
Finance, Insurance and Real Estate   5543  0.014278  0.012746  0.0  0.142857   
Manufacturing                       12610  0.008971  0.009617  0.0  0.097345   
Mining                               1333  0.011078  0.010969  0.0  0.085714   
Public Administration                 311  0.006138  0.010666  0.0  0.067961   
Retail Trade                         1562  0.011689  0.010007  0.0  0.066667   
Services                             4840  0.012245  0.010453  0.0  0.076336   
Transportation and other Utilities   1901  0.014463  0.012439  0.0  0.104762   
Wholesale Trade                       761  0.012470  0.011593  0.0  0.069444   

               

Descriptive Statistics:
                                        N      mean       std  min       max  \
industry                                                                       
Agriculture, Forestry and Fishing      55  0.009914  0.007135  0.0  0.034286   
Construction                          394  0.007577  0.007033  0.0  0.033019   
Finance, Insurance and Real Estate   5543  0.014278  0.012746  0.0  0.142857   
Manufacturing                       12610  0.008971  0.009617  0.0  0.097345   
Mining                               1333  0.011078  0.010969  0.0  0.085714   
Public Administration                 311  0.006138  0.010666  0.0  0.067961   
Retail Trade                         1562  0.011689  0.010007  0.0  0.066667   
Services                             4840  0.012245  0.010453  0.0  0.076336   
Transportation and other Utilities   1901  0.014463  0.012439  0.0  0.104762   
Wholesale Trade                       761  0.012470  0.011593  0.0  0.069444   

                                     1%   5%       25%       50%       75%  \
industry                                                                     
Agriculture, Forestry and Fishing   0.0  0.0  0.006702  0.009585  0.012248   
Construction                        0.0  0.0  0.002488  0.006025  0.010974   
Finance, Insurance and Real Estate  0.0  0.0  0.004943  0.011278  0.021453   
Manufacturing                       0.0  0.0  0.002413  0.006150  0.012658   
Mining                              0.0  0.0  0.002762  0.009646  0.015773   
Public Administration               0.0  0.0  0.000000  0.002137  0.006218   
Retail Trade                        0.0  0.0  0.004571  0.009615  0.016393   
Services                            0.0  0.0  0.005263  0.010230  0.016434   
Transportation and other Utilities  0.0  0.0  0.005405  0.011940  0.020408   
Wholesale Trade                     0.0  0.0  0.004115  0.010101  0.017241   

                                         95%       99%  skewness   kurtosis  
industry                                                                     
Agriculture, Forestry and Fishing   0.025449  0.029982  1.137781   2.341089  
Construction                        0.021984  0.028164  1.106130   0.732433  
Finance, Insurance and Real Estate  0.037736  0.052766  1.727283   7.872448  
Manufacturing                       0.027972  0.044192  2.076123   6.821890  
Mining                              0.030543  0.050666  2.012681   7.078396  
Public Administration               0.027525  0.056685  3.134317  12.251157  
Retail Trade                        0.029846  0.048006  1.547352   3.707941  
Services                            0.032896  0.049476  1.622548   3.894056  
Transportation and other Utilities  0.036036  0.054237  1.928454   7.275814  
Wholesale Trade                     0.037313  0.047617  1.396948   2.301445