In [1]:
# dependencies
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from collections import Counter

import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import seaborn as sns 

pd.set_option("display.max_colwidth", 200) 
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%matplotlib inline
import multiprocessing
import time

import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
# !pip install emoji
# !pip install bs4
# from bs4 import BeautifulSoup

from wordcloud import WordCloud 

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from pprint import pprint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [2]:
df = pd.read_csv('cve_cleaned.csv', header=0, index_col=0)

In [3]:
df.drop(['cvss', 'cwe_code' ,'mod_date', 'pub_date', 'access_authentication', 'access_complexity', 'access_vector', 'impact_availability', 'impact_confidentiality', 'impact_integrity'], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,cwe_name,summary
CVE-2019-2211,Improper Neutralization of Special Elements used in an SQL Command ('SQL Injection'),"In createProjectionMapForQuery of TvProvider.java, there is possible SQL injection. This could lead to local information disclosure with no additional execution privileges needed. User interaction..."
CVE-2019-2212,Information Exposure,"In poisson_distribution of random, there is an out of bounds read. This could lead to local information disclosure with no additional execution privileges needed. User interaction is not needed fo..."
CVE-2019-2213,Use After Free,"In binder_free_transaction of binder.c, there is a possible use-after-free due to a race condition. This could lead to local escalation of privilege with no additional execution privileges needed...."
CVE-2019-2214,Improper Privilege Management,"In binder_transaction of binder.c, there is a possible out of bounds write due to a missing bounds check. This could lead to local escalation of privilege with no additional execution privileges n..."
CVE-2019-18793,Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting'),"Parallels Plesk Panel 9.5 allows XSS in target/locales/tr-TR/help/index.htm? via the ""fileName"" parameter."


In [5]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [6]:
# Convert to list
data = df.summary.values.tolist()

In [7]:
# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

In [8]:
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

In [9]:
# Remove single quotes
data = [re.sub("\'", "", sent) for sent in data]

***
**Tokenize words and Clean-up text**
***


In [10]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

***
**Creating Bigram and Trigram Models**
***

Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.

Gensim’s Phrases model can build and implement the bigrams, trigrams, quadgrams and more. The two important arguments to Phrases are min_count and threshold. The higher the values of these param, the harder it is for words to be combined to bigrams.

In [11]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

In [12]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

***
**Remove Stopwords, Make Bigrams and Lemmatize**
***

In [13]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [15]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [16]:
# Initialize spacy 
nlp = spacy.load("en_core_web_sm")

In [17]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

***
**Create the Dictionary and Corpus needed for Topic Modeling**
***

The main inputs to the LDA topic model are the dictionary and the corpus. 

In [18]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

In [19]:
# Create Corpus
texts = data_lemmatized

In [20]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

***
**Building the Topic Model**
***

For the LDA Model: 
- alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior.
- chunksize is the number of documents to be used in each training chunk. 
- update_every determines how often the model parameters should be updated and passes is the total number of training passes.

In [21]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [22]:
import pickle
pickle.dump(lda_model, open("lda_model.pkl", "wb"))

In [23]:
import pickle
lda_model = pickle.load(open("lda_model.pkl", "rb"))

***
**View the topics in LDA model**
***

The LDA model is built with 20 different topics where each topic is a combination of keywords and each keyword contributes a certain weight (importance) to the topic.

In [24]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.439*"arbitrary" + 0.309*"execute" + 0.189*"code" + 0.018*"attribute" + '
  '0.003*"custom" + 0.003*"modification" + 0.000*"vector" + 0.000*"buffer" + '
  '0.000*"apple" + 0.000*"dot"'),
 (1,
  '0.231*"window" + 0.071*"new" + 0.058*"firewall" + 0.053*"switch" + '
  '0.043*"valid" + 0.039*"desktop" + 0.035*"null" + 0.034*"random" + '
  '0.033*"frame" + 0.031*"virtual"'),
 (2,
  '0.290*"service" + 0.248*"cause" + 0.236*"denial" + 0.123*"crash" + '
  '0.026*"method" + 0.023*"consumption" + 0.003*"python" + 0.001*"partial" + '
  '0.001*"usage" + 0.000*"buffer"'),
 (3,
  '0.124*"security" + 0.105*"android" + 0.104*"hijack" + 0.086*"check" + '
  '0.076*"root" + 0.059*"build" + 0.058*"domain" + 0.054*"write" + '
  '0.048*"state" + 0.038*"free"'),
 (4,
  '0.226*"allow" + 0.199*"remote" + 0.190*"attacker" + 0.114*"vulnerability" + '
  '0.042*"server" + 0.037*"craft" + 0.025*"application" + 0.012*"attack" + '
  '0.012*"component" + 0.010*"long"'),
 (5,
  '0.280*"index" + 0.073*"feature"

This means for example that the top 10 keywords that contribute to topic 19 are: ‘system’, ‘study’, ‘google’.. and so on and the weight of ‘system’ on topic 0 is 0.082.

The weights reflect how important a keyword is to that topic.

***
**Compute Model Perplexity and Coherence Score**
***

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. 

***
**Optimal number of topics for LDA**
***

One approach to finding the optimal number of topics is to build many LDA models with different values of number of topics (k) and pick the one that gives the highest coherence value.


Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics.

If we see the same keywords being repeated in multiple topics, it’s probably a sign that the ‘k’ is too large.

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.

In [25]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -14.103290192093239


In [26]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3008924849246012


***
**Visualize the topics-keywords**
***

Now that the LDA model is built, the next step is to examine the produced topics and the associated keywords. There is no better tool than pyLDAvis package’s interactive chart and is designed to work well with jupyter notebooks.

In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

***
**Finding the dominant topic in each sentence**
***

One of the practical application of topic modeling is to determine what topic a given document is about. To find that, we find the topic number that has the highest percentage contribution in that document.

In [28]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [30]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

In [31]:
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

***
**Find the most representative document for each topic**
***

Sometimes just the topic keywords may not be enough to make sense of what a topic is about. So, to help with understanding the topic, you can find the documents a given topic has contributed to the most and infer the topic by reading that document. 

In [32]:
# Group top sentences under each topic
sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                            grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)


In [33]:
# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

sent_topics_sorteddf.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.2634,"arbitrary, execute, code, attribute, custom, modification, vector, buffer, apple, dot","Arbitrary code execution (via backdoor code) was discovered in bootstrap-sass 3.2.0.3, when downloaded from rubygems.org. An unauthenticated attacker can craft the ___cfduid cookie value with base..."
1,1.0,0.2656,"window, new, firewall, switch, valid, desktop, null, random, frame, virtual","Microsoft Windows Vista SP2; Windows Server 2008 SP2 and R2; Windows 7 SP1; Windows 8.1, Windows Server 2012 Gold and R2, Windows RT 8.1; Windows 10 Gold, 1511, and 1607; and Windows Server 2016 l..."
2,2.0,0.258,"service, cause, denial, crash, method, consumption, python, partial, usage, buffer","BACnet Protocol Stack through 0.8.6 has a segmentation fault leading to denial of service in BACnet APDU Layer because a malformed DCC in AtomicWriteFile, AtomicReadFile and DeviceCommunicationCon..."
3,3.0,0.4851,"security, android, hijack, check, root, build, domain, write, state, free","In avrc_proc_vendor_command of avrc_api.cc, there is a possible out of bounds write due to a missing bounds check. This could lead to remote code execution with no additional execution privileges ..."
4,4.0,0.6289,"allow, remote, attacker, vulnerability, server, craft, application, attack, component, long","systemd 239 through 243 accepts any certificate signed by a trusted certificate authority for DNS Over TLS. Server Name Indication (SNI) is not sent, and there is no hostname validation with the G..."


***
**Topic distribution across documents**
***

Understand the volume and distribution of topics in order to judge how widely it was discussed.

In [34]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

df_dominant_topics.head()

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,3.0,"security, android, hijack, check, root, build, domain, write, state, free",8.0,0.0001
1.0,3.0,"security, android, hijack, check, root, build, domain, write, state, free",10.0,0.0001
2.0,3.0,"security, android, hijack, check, root, build, domain, write, state, free",31.0,0.0004
3.0,3.0,"security, android, hijack, check, root, build, domain, write, state, free",783.0,0.0089
4.0,4.0,"allow, remote, attacker, vulnerability, server, craft, application, attack, component, long",72867.0,0.8317


***
**Term-Topic Matrix**
***

In [35]:
topics = [[(term, round(wt, 3)) for term, wt in lda_model.show_topic(n, topn=20)] for n in range(0, lda_model.num_topics)]

In [36]:
topics_df = pd.DataFrame([[term for term, wt in topic] for topic in topics], columns = ['Term'+str(i) for i in range(1, 21)], index=['Topic '+str(t) for t in range(1, lda_model.num_topics+1)]).T
topics_df.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
Term1,arbitrary,window,service,security,allow,index,information,issue,user,php,script,early,possibly,multiple,file,memory,relate,directory,cve,request
Term2,execute,new,cause,android,remote,feature,html,error,note,parameter,web,read,privilege,overflow,unspecified,kernel,d,field,client,module
Term3,code,firewall,denial,hijack,attacker,operation,obtain,bug,local,sql,site,version,gain,function,action,linux,java,search,connection,bypass
Term4,attribute,switch,crash,check,vulnerability,filename,sensitive,enable,access,command,cross,impact,default,base,temporary,add,email,include,validate,authentication
Term5,custom,valid,method,root,server,administrative,admin,delete,use,injection,xss,permission,call,packet,beta,platform,lead,image,query,traversal


***
**t-SNE Clustering Chart**
***

In [37]:
# Get topic weights and dominant topics 
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

# Get topic weights
topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 9486 samples in 0.001s...
[t-SNE] Computed neighbors for 9486 samples in 3.521s...
[t-SNE] Computed conditional probabilities for sample 1000 / 9486
[t-SNE] Computed conditional probabilities for sample 2000 / 9486
[t-SNE] Computed conditional probabilities for sample 3000 / 9486
[t-SNE] Computed conditional probabilities for sample 4000 / 9486
[t-SNE] Computed conditional probabilities for sample 5000 / 9486
[t-SNE] Computed conditional probabilities for sample 6000 / 9486
[t-SNE] Computed conditional probabilities for sample 7000 / 9486
[t-SNE] Computed conditional probabilities for sample 8000 / 9486
[t-SNE] Computed conditional probabilities for sample 9000 / 9486
[t-SNE] Computed conditional probabilities for sample 9486 / 9486
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.507507
[t-SNE] KL divergence after 1000 iterations: 1.304362


In [39]:
import seaborn as sns
import matplotlib.colors as mcolors
mycolors = np.array([color for name, color in mcolors.XKCD_COLORS.items()])

In [40]:
# Plot the Topic Clusters using Bokeh
output_notebook()

plot = figure(title="t-SNE Clustering", 
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)