In [50]:
import numpy as np
import pandas as pd 
import altair as alt
import scipy.optimize as opt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import timedelta
import scipy.stats as sps
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [51]:
# Read in data and format date
cve = pd.read_csv('cve.csv', header=0, index_col=0)
products = pd.read_csv('products.csv', header=0, index_col=0)
vendors = pd.read_csv('vendors.csv', header=0, index_col=0)

cve.pub_date = pd.to_datetime(cve.pub_date)

## **Purpose of the Project and Review of Previous Datasets**

Our goal is to develop a classifier that predicts threat severity in the coming years based on historical vulnerability data. The dynamic figure below shows historical trends in threats based on data CSV data from 1999 to 2019. This data shows that threats have always been on an upward trend, but the percentage of threats leveled off in recent years. With zero day threats at their highest levels this year, we intend to use machine learning to evaluate the rate and identify trends. **In our review of previous reports on CVE datasets we have identified informative text-based data which we intend to parse and engineer features with.**

In [52]:
X = cve.pub_date.dt.to_period('Q').sort_index().value_counts()
X.index = X.index.to_timestamp()
X = X.sort_index()
X[X.index] = np.cumsum(X.values)

rolling = cve.pub_date.dt.to_period('M').sort_index().value_counts()
rolling.index = rolling.index.to_timestamp()
rolling = rolling.sort_index()
rolling[rolling.index] = np.cumsum(rolling.values)
rolling = rolling.rolling(12, axis=0).sum().pct_change(axis=0)

# 2 Plots
# Left: cumulative, right rolling avg. growth rate
fig = make_subplots(rows=1,cols=2)

# Trace 1: Grey bars
fig.add_trace(
    go.Bar(
        x=X.index[:-20],
        y=X.values[:-20],
        marker_color="#bbbbbb",
    ),
    row=1,
    col=1
)

# Trace 2: Red (emph) bars
fig.add_trace(
    go.Bar(
        x=X.index[-20:],
        y=X.values[-20:],
        marker_color="#ff0000"
    ), 
    row=1,
    col=1
)

# Trace 3: Growth rate
fig.add_trace(
    go.Scatter(
        x=rolling.index,
        y=rolling.values,
        marker_color="#bbbbbb",
        mode="lines"
    ),
    row=1,
    col=2
)

fig.update_layout(
    title=dict(
        text="Threat Proliferation",
        xref="paper",
        x=0., y=1.
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False
    ),
    yaxis=dict(
        showgrid=False
    ),
    annotations = [
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.2,
            showarrow=False,
            text ='The number of known threats (left) continues to grow, but ' + \
            'growth (right), measured as a perent change <br>on a 12-month rolling average, has leveled.',
            valign='top',
            align='left'
        ),
        dict(
            ax=-80,
            ay=-100,
            x=X.index[-20],
            y=40000,
            text='A majority of the threats occur after 2015'
        )
    ],
    showlegend=False,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=0
)

fig.show()

# **Overview of Data**
This dataset is derived from three sources (1) cve.csv, (2) products.csv, and (3) vendors.csv


1.   **The CVE dataset contains two major categories**
  - common vulnerability scoring system information (CVSS) - a score of severity along with the factors influencing the calculation from a group called FIRST (more details below)
  - CWE information - threat category name, code, and information
2.   **The Products dataset contains information about the products affected by the CVE**
3. **The Vendors dataset contains information about the vendors affected by the CVE**




---
All three data sources are related to Common Vulnerabilities and Exposures (CVE)s which are a list of computer security threats in software or hardware components such as firmware maintained by the US Department of Homeland Security and is maintained by the MITRE corporation.
- A **vulnerability** is a weakness which could be exploited to negatively affect the core security principles (Confidentiality, Ingegrity, or Availability).
- An **exposure** is a mistake in the software that allows access to information or capabilities that can be used by a hacker as a stepping-stone into a system or network. The following are characteristics of an exposure: 
  - A mistake that doesn't directly allow compromise
  - A violation of a reasonable security policy
  - Could be an important component of a successful attack


### **CVE File dataset characteristics**

---

Here we show the basic characteristics of the cve.csv. This includes the time period covered in the data, which is from January 1, 1999 to November 21, 2019.

In [53]:
# Description of cve dataset
cve.describe()

Unnamed: 0,cvss,cwe_code
count,89660.0,89660.0
mean,6.021429,199.690854
std,1.994757,176.177244
min,0.0,1.0
25%,4.3,79.0
50%,5.8,119.0
75%,7.5,284.0
max,10.0,1188.0


In [54]:
# Basic info for each column of the dataset
cve.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89660 entries, CVE-2019-16548 to CVE-2007-3004
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   mod_date                89660 non-null  object        
 1   pub_date                89660 non-null  datetime64[ns]
 2   cvss                    89660 non-null  float64       
 3   cwe_code                89660 non-null  int64         
 4   cwe_name                89660 non-null  object        
 5   summary                 89660 non-null  object        
 6   access_authentication   88776 non-null  object        
 7   access_complexity       88776 non-null  object        
 8   access_vector           88776 non-null  object        
 9   impact_availability     88776 non-null  object        
 10  impact_confidentiality  88776 non-null  object        
 11  impact_integrity        88776 non-null  object        
dtypes: datetime64[ns](1), float64(

In [55]:
# earliest date
min_date = min(cve.pub_date)
min_date


Timestamp('1999-01-01 05:00:00')

In [56]:
# latest date
max_date = max(cve.pub_date)
max_date


Timestamp('2019-11-21 15:15:00')

### **Products File dataset characteristics**
Here we show the basic characteristics of the products.csv file.

In [8]:
products.describe()

Unnamed: 0,vulnerable_product
count,180543
unique,40553
top,debian_linux
freq,2972


In [9]:
products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 180585 entries, CVE-2019-16548 to CVE-2007-3004
Data columns (total 1 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   vulnerable_product  180543 non-null  object
dtypes: object(1)
memory usage: 2.8+ MB


### **Vendors File dataset characteristics**
Here we show the basic characteristics of the products.csv file.

In [10]:
vendors.describe()

Unnamed: 0,vendor
count,101616
unique,16175
top,microsoft
freq,5037


In [11]:
vendors.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101658 entries, CVE-2019-16548 to CVE-2007-3004
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   vendor  101616 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


### **Common Vulnerability Scoring System (CVSS)**
This is a measure of how severe the threat is.
- The CVSS is an open framework that describes the characteristics and severity of computer exploits. The scores are based on how easy the vulnerabilities are to exploit and also how quickly they can be exploited (temporal) along with environmental factors (Confidentiality, Integrity, Availability, and Base methods)
- The figure below shows a histogram of the CVSS over time 

In [12]:
fig = go.Figure()
X = cve.cvss.sort_values().astype('int').value_counts().sort_index()[1:]

# Three traces
fig.add_trace(
    go.Bar(
        x=X.index.map(lambda x: "{}-{}".format(x-1,x)),
        y=X.values/np.sum(X.values)*100,
        marker_color=['#bbbbbb', '#bbbbbb', '#bbbbbb', '#dc322f', '#dc322f', '#dc322f', '#dc322f', '#bbbbbb', '#bbbbbb', '#bbbbbb'],
        text=np.vectorize(lambda x: str(x) + "%")(np.round((X.values/np.sum(X.values) * 100),1)),
        textposition='outside'
))

fig.update_layout(
    title=dict(
        text="Threat Severity Histogram Distribution",
        xref="paper",
        x=0., y=1.
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False,
    ),
    yaxis=dict(
        showgrid=False,
        showticklabels=False
    ),
    annotations=[
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.2,
            showarrow=False,
            text ="CVSS scores reflect a threat's severity. Over 75 percent of scores fall in FIRSTs Medium (4.0-6.9) threat category<br>" +
            "Scores range from 0 to 10, shown on X axis.<br>"+
             "The Percentage is shown on the Y axis. <br>"+
             "Place your mouse over the bins to see the average score for each bin.",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=0
)

fig.show()

## **Threats Evolving Over Time**
The figure below shows trends in the data regarding the 10 most common threats as measured by the CVSS scoring system. In our project we will evaluate if this trend is continuing.

In [13]:
fig = go.Figure()
X = pd.get_dummies(
        cve.cwe_name[cve.cwe_name.isin(cve.cwe_name.value_counts()[:10].index)]
    ).join(
        cve.pub_date
    ).groupby(
        cve.pub_date.dt.to_period("D")
).sum()

colors = ['#ababab', '#cb4b16', '#268bd2', '#ebebeb', '#2aa198', '#dc322f', '#bbbbbb', '#9b9b9b', '#cbcbcb', '#dbdbdb']
X.index = X.index.to_timestamp()
X = X.divide(X.sum(axis=1), axis=0)

# One trace for each column
for ndx in range(X.shape[1]):
    data = X.iloc[:,ndx].rolling(365,axis=0).mean() 
    fig.add_trace(go.Scatter(
        x=data.index[365:],
        y=data.values[365:], 
        name=X.columns[ndx] if "'" not in X.columns[ndx] else X.columns[ndx].split("'")[1],
        marker_color=colors[ndx]
    ))

fig.update_layout(
    title=dict(
        text="How threats have changed over time",
        xref="paper",
        x=0., y=1.
    ),
    height=1100,
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False,
    ),
    yaxis=dict(
        showgrid=False,
        showticklabels=False
    ),
    annotations=[
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.075,
            showarrow=False,
            text ="These threats are the 10 most common, but their relative prominence is shifting," +
            "injection (both code and SQL)<br> is becoming less common while cross-site scripting and input validation are on the rise. " +
            "Values are shown as <br> a 365-entry rolling average of relative frequencies.",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    legend=dict(x=0., y=1.)
)

fig.show()

## **The Vulnerability of Products Over Time**
In this previous review, operating systems and web browsers were overwhelmingly the most affected by vulnerabilities. We will see if this trend continues.

In [14]:
X = products.vulnerable_product.value_counts()[25:0:-1]

fig = go.Figure()
fig.add_trace(go.Bar(
    y=np.vectorize(lambda x: " ".join(map(lambda x: x.title() if len(x) > 2 else x.upper(), x.split("_"))))(X.index),
    x=X.values,
    orientation='h',
    marker_color= ["#268bd2"] * 3 + ["#bbbbbb"] + ["#268bd2"] * 5 + ["#859900"] + ["#268bd2"] + ["#859900"] + ["#268bd2"] * 2 + ["#bbbbbb"] * 2 + ["#268bd2"] * 2 + ["#859900"] * 2 + ["#268bd2"] * 5
))

fig.update_layout(
    height=800,
    title=dict(
        xref='paper',
        text="Affected Products",
        x=0, y=.965
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False
    ),
    yaxis=dict(
        showgrid=False,
        tickmode="linear"
    ),
    annotations=[
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.075,
            showarrow=False,
            text="Most of the top 25 affected products are operating systems (blue) or web browsers (green)",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=.2
)

fig.show()

## **The Vulnerability of Vendors Over Time**
This previous EDA showed that a large number of the total vulnerabilities were found in the top 25 vendors. We will see if this trend continues.

In [15]:
X = vendors.vendor.value_counts()[25:0:-1]

fig = go.Figure()
fig.add_trace(go.Bar(
    y=np.vectorize(lambda x: " ".join(map(lambda x: x.title() if len(x) > 3 else x.upper(), x.split("_"))))(X.index),
    x=X.values,
    orientation='h',
    marker_color= "#bbbbbb"
))

fig.update_layout(
    height=800,
    title=dict(
        xref='paper',
        text="Affected Vendors",
        x=0, y=.965
    ),
    font=dict(
        family="Arial",
        size=14,
        color="#586e75"
    ),
    xaxis=dict(
        showgrid=False
    ),
    yaxis=dict(
        showgrid=False,
        tickmode="linear"
    ),
    annotations=[
        dict(
            xref='paper',
            yref='paper',
            x=0., y=1.075,
            showarrow=False,
            text="40% of the products affected by any vulnerability are distributed by these top 25 vendors",
            valign='top',
            align='left'
        ),
    ],
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    bargap=.2
)

fig.show()

### **Altair based Visualization**
We plan to utilize the Altair visualization library to demonstrate advanced visualizations of the data in our final result. The graph below displays the correlation between the cvss score and the cwe code, which is pretty basic, but as we perform further feature engineering this visualization will become more informative and robust.

In [26]:
cor_data = (cve
              .corr().stack()
              .reset_index()
              .rename(columns={0: 'correlation', 'level_0': 'variable', 'level_1': 'variable2'}))
cor_data['correlation_label'] = cor_data['correlation'].map('{:.2f}'.format)
cor_data.head(10)

Unnamed: 0,variable,variable2,correlation,correlation_label
0,cvss,cvss,1.0,1.0
1,cvss,cwe_code,0.008877,0.01
2,cwe_code,cvss,0.008877,0.01
3,cwe_code,cwe_code,1.0,1.0


In [48]:
base = alt.Chart(cor_data).encode(
    x='variable2:O',
    y='variable:O',
)
cor_plot = base.mark_rect().encode(
    color='correlation:Q',
    tooltip='variable:O'
).properties(
    width=500,
    height=500,
    title="Correlation of features"
)
cor_plot

In [28]:
# This should be taken out before turning in our baseline but will be used in our final result as we create more expressive features.
def compute_2d_histogram(var1, var2, df, density=True):
  H, xedges, yedges = np.histogram2d(df[var1], df[var2], density=density)
  H[H == 0] = np.nan

  xedges = pd.Series(['{0:.4g}'.format(num) for num in xedges])
  xedges = pd.DataFrame({"a": xedges.shift(), "b": xedges}).dropna().agg(' - '.join, axis=1)
  yedges = pd.Series(['{0:.4g}'.format(num) for num in yedges])
  yedges = pd.DataFrame({"a": yedges.shift(), "b": yedges}).dropna().agg(' - '.join, axis=1)

  res = pd.DataFrame(H,
                     index=yedges,
                     columns=xedges).reset_index().melt(
                         id_vars='index'
                     ).rename(columns={'index': 'value2',
                                       'value': 'count',
                                       'variable': 'value'})
  res['raw_left_value'] = res['value'].str.split(' - ').map(lambda x: x[0]).astype(float)
  res['raw_left_value2'] = res['value2'].str.split(' - ').map(lambda x: x[0]).astype(float)
  res['variable'] = var1
  res['variable'] = var2
  return res.dropa()

In [47]:
#value_columns = (cve.drop(columns=['access_vector', 'access_complexity', 'access_vector', 'impact_availability', 'impact_confidentiality', 'impact_integrity'])) - these columns cause a value error because we cannot mask with non-bollean array containing NA/NaN values
#cve_data = cve.columns.drop(['access_vector', 'access_complexity', 'access_vector', 'impact_availability', 'impact_confidentiality', 'impact_integrity'])
cve_data_2dbinned = pd.concat([compute_2d_histogram(var1, var2, cor_data) for var1 in cve for var2 in value_columns])
#cve_data_2dbinned = np.array(cve_data_2dbinned, dtype==None)
cve_data_2dbinned.head(10)

ValueError: ignored

## **Targeted Data for Cleaning and Feature Engineering**
The CWE_Name and Summary columns in the CVE dataset is rich in text which can be parsed and engineered into features. We belive these features will be more descriptive and therefore will allow us to train a classifier which will be a better predictor of vulnerability severity. The table below shows a subset of the text included in these columns of the dataset.

In [62]:
data_to_be_engineered = cve[["cwe_name", "summary"]]
# data_to_be_engineered = cve["summary"]
data_to_be_engineered.head()

Unnamed: 0,cwe_name,summary
CVE-2019-16548,Cross-Site Request Forgery (CSRF),A cross-site request forgery vulnerability in ...
CVE-2019-16547,Incorrect Permission Assignment for Critical ...,Missing permission checks in various API endpo...
CVE-2019-16546,Authorization Bypass Through User-Controlled Key,Jenkins Google Compute Engine Plugin 4.1.1 and...
CVE-2013-2092,Improper Neutralization of Input During Web P...,Cross-site Scripting (XSS) in Dolibarr ERP/CRM...
CVE-2013-2091,Improper Neutralization of Special Elements u...,SQL injection vulnerability in Dolibarr ERP/CR...
