In [15]:
#!pip install spacy

In [16]:
#!pip install scattertext

In [1]:
import pandas as pd
import spacy
import scattertext as st
import numpy as np
import sys
import codecs
sys.stdout = codecs.getwriter('utf8')(sys.stdout)

In [2]:
df = pd.read_csv('KUKSAT_Coded_Baseline.csv', encoding='ISO-8859-1')
df

Unnamed: 0,Type,Code,KUKSAT
0,KU,CSF,"Threats and Adversaries (threat actors, malwar..."
1,KU,CSF,Vulnerabilities and Risk management (include b...
2,KU,CSF,Common Attacks
3,KU,CSF,Basic Risk Assessment
4,KU,CSF,Security Life-Cycle
...,...,...,...
2258,KSAT,T1003,"Work with security managers (i.e., system owne..."
2259,KSAT,T1004,Use continuous monitoring tools to assess risk...
2260,KSAT,T1005,Use the continuous monitoring data to make inf...
2261,KSAT,T1006,Respond to issues flagged during continuous mo...


In [3]:
#!python -m spacy download en_core_web_sm

In [4]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
df['KUKSAT'] = df['KUKSAT'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gibso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df

Unnamed: 0,Type,Code,KUKSAT
0,KU,CSF,"Threats Adversaries (threat actors, malware, n..."
1,KU,CSF,Vulnerabilities Risk management (include backu...
2,KU,CSF,Common Attacks
3,KU,CSF,Basic Risk Assessment
4,KU,CSF,Security Life-Cycle
...,...,...,...
2258,KSAT,T1003,"Work security managers (i.e., system owners, i..."
2259,KSAT,T1004,Use continuous monitoring tools assess risk on...
2260,KSAT,T1005,Use continuous monitoring data make informatio...
2261,KSAT,T1006,"Respond issues flagged continuous monitoring, ..."


In [6]:
nlp = spacy.load('en_core_web_sm')
corpus = st.CorpusFromPandas(df,
                            category_col='Type',
                            text_col='KUKSAT',
                            nlp=nlp).build().get_unigram_corpus().remove_infrequent_words(minimum_term_count=6)


In [7]:
dispersion = st.Dispersion(corpus)

dispersion_df = dispersion.get_df()
np.seterr(invalid='ignore')
dispersion_df.head(6)

  self.p = X.multiply(csc_matrix(1. / X.sum(axis=1)))
  vfs = vf.multiply(1. / self.s)


Unnamed: 0,Frequency,Range,SD,VC,Juilland's D,Rosengren's S,DP,DP norm,KL-divergence
threats,18,18,0.08883,11.16791,0.742248,0.007547,0.991908,0.991908,7.145019
threat,44,43,0.141241,7.264271,0.828201,0.018655,0.979832,0.979832,5.849059
malware,21,17,0.124016,13.36425,0.707316,0.005588,0.993651,0.993651,7.637239
vulnerabilities,43,43,0.136529,7.185256,0.829483,0.022219,0.976035,0.976035,5.592705
risk,103,89,0.238118,5.23165,0.87736,0.049738,0.947152,0.947152,4.422927
management,162,145,0.317694,4.437907,0.894734,0.075382,0.917585,0.917585,3.870084


In [8]:
dispersion_df = dispersion_df.assign(
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df["Rosengren's S"],
    Ypos=lambda df: st.Scalers.scale(df.Y),
)

In [9]:
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    metadata=corpus.get_df()['Type'] + ' (' + corpus.get_df()['Code'].str.upper() + ')',
    ignore_categories=True,
    x_label='Log Frequency',
    y_label="Rosengren's S",
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
)


In [10]:
from sklearn.neighbors import KNeighborsRegressor

dispersion_df = dispersion_df.assign(
    Expected=lambda df: KNeighborsRegressor(n_neighbors=10).fit(
        df.X.values.reshape(-1, 1), df.Y
    ).predict(df.X.values.reshape(-1, 1)),
    Residual=lambda df: df.Y - df.Expected,
    ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual)
)  

In [11]:
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    metadata=corpus.get_df()['Type'] + ' (' + corpus.get_df()['Code'].str.upper() + ')',
    ignore_categories=True,
    x_label='Log Frequency',
    y_label="Rosengren's S",
    y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
    color_score_column='ColorScore',
    header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'},
    left_list_column='Residual',
    background_color='#e5e5e3'
)

In [12]:
# Saves the html file to be opened in web browser

open('./demo_compact.html', 'w', encoding='UTF-8').write(html)

997167