In [1]:
### CYBER RISK INDEX CALCULATION

In [2]:
### INITIALIZATION

import pandas as pd
import numpy as np
import os
import gc
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats ### To Annotate Fliers
import seaborn as sns
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import zipfile
import io
import re

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
### VERSION CONTROL

from platform import python_version
print('python version: ', python_version())
print('numpy version: ', np.__version__)
print('pandas version: ', pd.__version__)

python version:  3.11.5
numpy version:  1.26.3
pandas version:  2.1.4


In [4]:
### PARAMETERS

### MultiIndex level Slice Constant:
no_filter = slice(None)
### NA for MS Excel Files:
list_na_excel_values = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NULL', 'NaN', 'n/a', 'nan', 'null', '#N/A Requesting Data...', '#N/A Invalid Security', '#N/A Field Not Applicable']
### Request Header:
dict_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
### Path to Selenium:
str_chromedriver_path = 'D:/Distribs/Python toolboxes/Selenium/chromedriver.exe'
### Data Loading: HACK ETF Holdings:
str_htf_link = 'https://amplifyetfs.com/hack-holdings/'
### Data Loading: Convertation from SIC Classificator to Fama-French Industries:
str_ff_converter_link = 'https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes12.zip'
str_ff_txt_path = 'Data_Files/Source_Files/Siccodes12.txt'
### Data Loading: MITTRE ATT&CK Tactics:
str_mitre_tactics_link = 'https://attack.mitre.org/tactics/enterprise/'
str_mitre_tech_link = 'https://attack.mitre.org/tactics/'
### Data Loading:
str_sec_10_k_index_link = 'https://www.sec.gov/Archives/edgar/full-index/company.zip'
str_10_k_index_txt_path = 'Data_Files/Source_Files/company.idx'
str_sec_archive_prefix = 'https://sec.gov/Archives/'
### Data saving:
str_dataset_key = 'dataset'
str_htf_holdings_path = 'Data_Files/Source_Files/HTF_Holdings.h5'
str_sic_to_ff_path = 'Data_Files/Source_Files/CIS_FF_Converter.h5'
str_mitre_sub_path = 'Data_Files/Source_Files/MITRE_Sub_Techniques.h5'
str_10_k_index_path = 'Data_Files/Source_Files/SEC_10_k_Index.h5'
str_10_k_par_path = 'Data_Files/Source_Files/SEC_10_k_Paragraphs.h5'

In [None]:
### HTF HOLDING LIST PARSING

### Web-Driver Preparation:
obj_service = Service(str_chromedriver_path)
driver = webdriver.Chrome(executable_path=str_chromedriver_path)
driver.maximize_window()
### Get Response from website:
driver.get(str_htf_link)
obj_htf_soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
### Parse Holdings Table Header:
list_htf_header = []
for tag_i in obj_htf_soup.find('thead').find('tr').find_all('th'):
    list_htf_header.append(tag_i.text)
### Parse Holdings Table Body:
list_htf_body = []
for tag_i in obj_htf_soup.find('tbody').find_all('tr'):
    list_htf_body.append([tag_j.text for tag_j in tag_i.find_all('td')])

In [None]:
### HTF HOLDING LIST PREPROCESSING

### Data Processing:
df_htf_holdings = pd.DataFrame(list_htf_body, columns=(list_htf_header[: -1] + ['%'])).set_index('Name')
df_htf_holdings['Shares'] = df_htf_holdings['Shares'].str.replace(',', '').astype(int)
df_htf_holdings['Market_Value'] = df_htf_holdings['Market Value'].str.replace(',', '').str.replace('$', '').astype(int)
df_htf_holdings.drop('Market Value', axis=1, inplace=True)
df_htf_holdings['%'] = df_htf_holdings['%'].str.replace('%', '').astype('float32') / 100
### Control Output:
print('HDF Holding DataFrame Head:\n{}\n'.format(df_htf_holdings.head()))
print('HDF Holding DataFrame Types:\n{}\n'.format(df_htf_holdings.dtypes))
print('HDF Holding DataFrame Check Sum:\n{:.2f}\n'.format(df_htf_holdings['%'].sum()))
### Results Saving:
df_htf_holdings.to_hdf(str_htf_holdings_path, key=str_dataset_key, mode='w')

In [7]:
### CIS TO FAMA-FRENCH CONVERTER LOADING

### Function to Expand Range to List:
def expand_range(str_range):
    int_start = int(str_range.split('-')[0])
    int_end = int(str_range.split('-')[1])
    list_expanded = list(map(lambda int_i: str(int_i).zfill(4), range(int_start, int_end + 1)))
    return list_expanded
### Archive Loading & UnPacking:
obj_response = requests.get(str_ff_converter_link, stream=True)
file_zipped = zipfile.ZipFile(io.BytesIO(obj_response.content))
file_zipped.extractall(str_ff_txt_path.rpartition('/')[0])
### File Reading:
with open(str_ff_txt_path) as file_ff_txt:
    list_ff_lines = [line.rstrip().lstrip() for line in file_ff_txt if (len(line.rstrip().lstrip()) > 0)]
### Line Data Convertation:
dict_names = {}
dict_codes_ranged = {}
for str_line_i in list_ff_lines:
    list_splitted = str_line_i.split(' ')
    if (len(list_splitted) > 1):
        str_ind_num = list_splitted[0]
        str_ind_name = list_splitted[1]
        dict_names[str_ind_num] = str_ind_name       
        dict_codes_ranged[str_ind_num] = []
    else:
        dict_codes_ranged[str_ind_num].append(str_line_i)
### SIC Ranges to SIC lists:
dict_series_expanded = {}
for str_code_i in dict_codes_ranged:
    list_collection = []
    for str_range_i in dict_codes_ranged[str_code_i]:
        list_collection.extend(expand_range(str_range_i))
    dict_series_expanded[str_code_i] = pd.Series(list_collection, name='SIC_Code')
### Data Aggregation:
df_sic_ff_converter = pd.concat(dict_series_expanded, axis=0, names=['FF_Code']).droplevel(1).to_frame().join(pd.Series(dict_names, name='FF_Name')).sort_index()
df_sic_ff_converter = df_sic_ff_converter.reset_index().set_index('SIC_Code').sort_index()
### Results Saving:
df_sic_ff_converter.to_hdf(str_sic_to_ff_path, key=str_dataset_key, mode='w')

In [8]:
### MITRE TACTICS PARSING

### Tactics Table Loading:
obj_mitre_soup_tactics = BeautifulSoup(requests.get(str_mitre_tactics_link).text, 'html.parser')
### Head Extrtacting:
list_tactics_head = []
for tag_i in obj_mitre_soup_tactics.find('table').find('thead').find_all('th'):
    list_tactics_head.append(tag_i.text)
### Body Extrtacting:   
list_tactics_body = []
for tag_i in obj_mitre_soup_tactics.find('table').find('tbody').find_all('tr'):
#    print(tag_i)
    list_i_tactics = []
    for tag_j in tag_i.find_all('td'):
        list_i_tactics.append(tag_j.text.strip(' ').strip('\n').strip(' '))
#        print(tag_j)        
    list_tactics_body.append(list_i_tactics)
### Tactics table aggregation:
df_mitre_tactics = pd.DataFrame(list_tactics_body, columns=list_tactics_head).set_index('ID').sort_index()

In [29]:
### MITRE SUB-TECHNIQUES PARSING

### Function to Drop Technique Description (if Technique has more than 1 sub):
def check_main_only(df_group):
    df_group = df_group.droplevel('Tech_ID')
    if (len(df_group) > 1):
        return df_group.drop('000', axis=0)
    else:
        return df_group
### Head Preparation:
list_tech_head = ['Tech_ID', 'Sub_ID', 'Tech_Name', 'Sub_Name', 'Description']
### Body Preparation:   
list_tech_body = []
### Techniques Tables Loading:
for id_i in df_mitre_tactics.index:
    obj_mitre_soup_tech = BeautifulSoup(requests.get(str_mitre_tech_link + id_i + '/').text, 'html.parser')
    ### Sun-Technique Tables Extraction:
    for tag_i in obj_mitre_soup_tech.find('table').find('tbody').find_all('tr'):
#        print(tag_i)
        list_i_tech = []
        for tag_j in tag_i.find_all('td'):
#            print(tag_j)
            list_i_tech.append(tag_j.text.strip(' ').strip('\n').strip(' '))
        ### Table Modification to Unificate Structure:
        if (len(list_i_tech) == 3):
            str_tech_id = list_i_tech[0]
            str_tech_name = list_i_tech[1]
            list_i_tech.insert(1, '000')
            list_i_tech.insert(3, 'Root')
        else:
            list_i_tech[0] = str_tech_id
            list_i_tech[1] = list_i_tech[1][1 :]
            list_i_tech.insert(2, str_tech_name)
        ### Row Collecting:
        list_tech_body.append(list_i_tech)    
#    break
### Data Aggregation:
df_mitre_subs = pd.DataFrame(list_tech_body, columns=list_tech_head).set_index(['Tech_ID', 'Sub_ID']).sort_index()
### Killing Technique Description (if Technique has more than 1 sub)
df_mitre_subs = df_mitre_subs.groupby(['Tech_ID'], observed=True).apply(check_main_only)
### Results Saving:
df_mitre_subs.to_hdf(str_mitre_sub_path, key=str_dataset_key, mode='w')

In [17]:
### SEC 10-K LIST LOADING

### Archive Loading & UnPacking:
obj_response = requests.get(str_sec_10_k_index_link, headers=dict_header, stream=True)
file_zipped = zipfile.ZipFile(io.BytesIO(obj_response.content))
file_zipped.extractall(str_10_k_index_txt_path.rpartition('/')[0])
### File Reading:
list_10_k_lines = []
str_head_pattern = r'\-{10,}'
str_body_pattern = r'(?P<Name>.+?)\s{2,}(?P<Form>.+?)\s{2,}(?P<CIK>.+?)\s{2,}(?P<Date>.+?)\s{2,}(?P<URL>.+?)\s{2,}'
with open(str_10_k_index_txt_path) as file_10_k_txt:
    bool_body_started = False
    for str_line_i in file_10_k_txt:
        if bool_body_started:
            re_match = re.match(str_body_pattern, str_line_i)
            if (re_match.group('Form') == '10-K'):
                list_10_k_lines.append([re_match.group('Name'), re_match.group('CIK'), re_match.group('Date'), re_match.group('URL')])    
            else:
                continue
#                break
        elif (re.match(str_head_pattern, str_line_i) is None):
            continue
        else:
            bool_body_started = True
### Results saving:
df_10_k_index = pd.DataFrame(list_10_k_lines, columns = ['Name', 'CIK', 'Date', 'URL'])
df_10_k_index['Date'] = pd.to_datetime(df_10_k_index['Date'])
print(df_10_k_index.dtypes)
df_10_k_index = df_10_k_index#.set_index(['CIK', 'Date']).sort_index()
df_10_k_index.to_hdf(str_10_k_index_path, key=str_dataset_key, mode='w')

Name            object
CIK             object
Date    datetime64[ns]
URL             object
dtype: object


In [None]:
### SEC FORM 10-K PARSING

### Definition of Security Rsik Pragraph extractor:
def sec_par_extract(str_10_k_url):
    gc.collect()
    ### Loading 10-K Form:
    print(str_sec_archive_prefix  + str_10_k_url)
    try:
        obj_form_i = BeautifulSoup(requests.get(str_sec_archive_prefix  + str_10_k_url, headers=dict_header).text, 'html.parser')
    except:
        print('URL parse error')
        return None
    else:
    ### Getting attributes:
        list_10_k_head = obj_form_i.find('acceptance-datetime').text.replace('\t', '').split('\n')
        str_comp_name = [str_i.replace('COMPANY CONFORMED NAME:', '') for str_i in list_10_k_head if str_i.startswith('COMPANY CONFORMED NAME:')][0]
        str_comp_cik = [str_i.replace('CENTRAL INDEX KEY:', '') for str_i in list_10_k_head if str_i.startswith('CENTRAL INDEX KEY:')][0]
        str_comp_date = [str_i.replace('CONFORMED PERIOD OF REPORT:', '') for str_i in list_10_k_head if str_i.startswith('CONFORMED PERIOD OF REPORT:')][0]
#        print('Mark 10')
        ### Cyber Risk Paragraphs parsing:
        list_pars = []
#        tag_item_1c = obj_form_i.find(name=['span', 'p'], string=re.compile(r'.*1c\.\s*cybersecurity.*', re.IGNORECASE))
        tag_item_1c = obj_form_i.find(name='a', string=re.compile(r'.*ybersecurity.*', re.IGNORECASE)).next\
                                .find_next(string=re.compile(r'.*1c\.\s*cybersecurity.*', re.IGNORECASE))  
        tag_item_1c_par = tag_item_1c.parent
#        print(tag_item_1c_par)
#        print('Mark 20')        
        while not (tag_item_1c_par.name in ['p', 'div']):
            tag_item_1c_par = tag_item_1c_par.parent
#        tag_item_2 = obj_form_i.find(name=['span', 'p'], string=re.compile(r'.*item\s*2\.\s*Propert.*', re.IGNORECASE))
        tag_item_2 = obj_form_i.find(name='a', string=re.compile(r'.*Propert*', re.IGNORECASE)).next\
                               .find_next(string=re.compile(r'.*item\s*2\.\s*Propert.*', re.IGNORECASE))        
        tag_item_2_par = tag_item_2.parent
#        print(tag_item_2_par)
#        print('Mark 30')              
        while not (tag_item_2_par.name in ['p', 'div']):
            tag_item_2_par = tag_item_2_par.parent
        tag_next = tag_item_1c_par
#        print('Mark 40')            
        while (tag_next.next != tag_item_2_par):
            tag_next = tag_next.next
            if (len(tag_next.text.split()) > 10):
                if (tag_next.text.find(' means ') == -1):
#                    print(tag_next.text)
                    list_pars.append(tag_next.text)
#        print('Mark 50')                    
        ### Paragraphs aggregation:
        ser_i_pars = pd.Series(list_pars, index=[[str_comp_name] * len(list_pars), [str_comp_cik] * len(list_pars), [str_comp_date] * len(list_pars)])
        ser_i_pars.name = 'Text'
        ser_i_pars.index.names = ['Name', 'CIK', 'Date']
        ser_i_pars = ser_i_pars.drop_duplicates()
        ### Results output:         
        print('URL parsed successfully')
        return ser_i_pars
### SEC 10-K Forms Loading:
df_10_k_index = pd.read_hdf(str_10_k_index_path)
### Paragraphs Extraction:
ser_10_k_pars = pd.concat(map(sec_par_extract, df_10_k_index['URL'][: 11]), axis=0)
### Data saving:
ser_10_k_pars.index.set_levels(pd.to_datetime(ser_10_k_pars.index.levels[2]), level=2)
ser_10_k_pars.to_hdf(str_10_k_par_path, key=str_dataset_key, mode='w')

In [5]:
### PREPARE TRAINING SET & MODEL TRAINING

### Model creation:
model_vect = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS.union(['cybersecurity'])), lowercase=True,  ngram_range=(1, 4), min_df=0.05, max_df=0.50, max_features=40)
### Train data loading:
df_mitre_subs = pd.read_hdf(str_mitre_sub_path)
### Model fitting:
model_vect.fit(df_mitre_subs['Description'])
matrix_mitre_subs = model_vect.transform(df_mitre_subs['Description'])

In [6]:
### TRANSFORM TESTING SET

ser_10_k_pars = pd.read_hdf(str_10_k_par_path)
matrix_10_k_pars = model_vect.transform(ser_10_k_pars)

In [35]:
### RESULTS ANALYZING

df_10_k_cosines = pd.DataFrame(cosine_similarity(matrix_10_k_pars, matrix_mitre_subs))

ser_10_k_cosines = df_10_k_cosines.apply(lambda ser_par: ser_par.max(), axis=1)
ser_10_k_cosines.name = 'Cosine'
ser_10_k_cosines.index = ser_10_k_pars.index

ser_10_k_cosines.groupby(['Name', 'CIK', 'Date'], observed=True).apply(lambda ser_ticker: ser_ticker.nlargest(5).mean()).sort_values()

Name                                              CIK         Date      
99 Acquisition Group Inc.                         0001950429  2023-12-31    0.000000
4Front Ventures Corp.                             0001783875  2023-12-31    0.274309
1606 CORP.                                        0001877461  2023-12-31    0.619548
60 DEGREES PHARMACEUTICALS, INC.                  0001946563  2023-12-31    0.695081
ADIAL PHARMACEUTICALS, INC.                       0001513525  2023-12-31    0.762586
1847 Holdings LLC                                 0001599407  2023-12-31    0.810525
AB Commercial Real Estate Private Debt Fund, LLC  0001876255  2023-12-31    0.888397
8X8 INC /DE/                                      0001023731  2024-03-31    0.913674
ABERCROMBIE & FITCH CO /DE/                       0001018840  2024-02-03    0.933996
23andMe Holding Co.                               0001804591  2024-03-31    0.935648
Name: Cosine, dtype: float64

In [50]:
### VISUAL CONTROL

df_10_k_index = pd.read_hdf(str_10_k_index_path)
display(df_10_k_index[ : 11])

str_sec_archive_prefix + df_10_k_index.loc[0]['URL']

Unnamed: 0,Name,CIK,Date,URL
0,1606 CORP.,1877461,2024-04-17,edgar/data/1877461/0001477932-24-002182.txt
1,1847 Holdings LLC,1599407,2024-04-25,edgar/data/1599407/0001213900-24-036197.txt
2,1st FRANKLIN FINANCIAL CORP,38723,2024-04-01,edgar/data/38723/0000038723-24-000047.txt
3,23andMe Holding Co.,1804591,2024-05-30,edgar/data/1804591/0001804591-24-000038.txt
4,4Front Ventures Corp.,1783875,2024-04-15,edgar/data/1783875/0001628280-24-016208.txt
5,"60 DEGREES PHARMACEUTICALS, INC.",1946563,2024-04-01,edgar/data/1946563/0001213900-24-028577.txt
6,8X8 INC /DE/,1023731,2024-05-21,edgar/data/1023731/0001023731-24-000042.txt
7,99 Acquisition Group Inc.,1950429,2024-04-05,edgar/data/1950429/0001213900-24-030594.txt
8,"AB Commercial Real Estate Private Debt Fund, LLC",1876255,2024-04-01,edgar/data/1876255/0001193125-24-083290.txt
9,ABERCROMBIE & FITCH CO /DE/,1018840,2024-04-01,edgar/data/1018840/0001018840-24-000019.txt


'https://sec.gov/Archives/edgar/data/1877461/0001477932-24-002182.txt'

In [None]:
### TEMP

