In [2]:
# Code Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import sqlite3
from sqlite3 import Error
#from sklearn.linear_model import LogisticRegression     # Logistic Regression
from sklearn.neighbors import KNeighborsClassifier      # k-Nearest Neighbours
from sklearn.preprocessing import LabelEncoder          # encooding variables
from sklearn.preprocessing import StandardScaler        # encooding variables
from sklearn.model_selection import train_test_split    # testing our models
#from sklearn.preprocessing import OneHotEncoder         # nominal variable
#from sklearn.metrics import confusion_matrix            # scoring
#from sklearn.tree import DecisionTreeClassifier         # decision trees
#from sklearn.tree import DecisionTreeRegressor          # decision trees
#from sklearn import tree                                # decision trees
from sklearn.decomposition import PCA                   # PCA 
from sklearn.cluster import KMeans                      # KMeans Clustering

In [1]:
# Columns excluded from the analysis below.
unneeded_columns = [
    'cve_id', 'assigner', 'description', 'cwe_ids',
    'refs', 'ref_names', 'ref_sources', 'ref_tags',
    'v3_baseScore', 'v3_exploitabilityScore', 'v3_impactScore', 'v3_baseSeverity',
]

# Read the CVSS export and discard the excluded columns in a single step.
cvss_data = pd.read_csv('./data/cvss/CVSS_data_complete.csv').drop(columns=unneeded_columns)

# Report how many missing values each remaining column contains.
print(cvss_data.isnull().sum())

# List the distinct values of every remaining column so that anything
# unusual can be spotted by eye.
for column_name, column_values in cvss_data.items():
    print(column_name, ":", column_values.unique())

NameError: name 'pd' is not defined

In [None]:
# Number of K-Means clusters (hyperparameter).
N_CLUSTERS = 6

# Encode every column as integer codes so that PCA and K-Means receive
# numeric input. LabelEncoder numbers the distinct values of a column in
# sorted order, so the codes are arbitrary identifiers, not a ranking.
le = LabelEncoder()
for col in cvss_data.columns:
    cvss_data[col] = le.fit_transform(cvss_data[col])

# Project the encoded data onto its first two principal components so that
# the clusters can be drawn on a plane.
pca = PCA(n_components=2)
cvss_reduced = pca.fit_transform(cvss_data)

# Cluster in the reduced space; the fixed seed keeps the labels reproducible.
kmc = KMeans(n_clusters=N_CLUSTERS, random_state=0)
kmc_model = kmc.fit(cvss_reduced)

# Plot one scatter series per cluster plus the cluster centres.
colors = ["red", "blue", "green", "purple", "orange", "brown", "black"]
plt.figure(figsize=(12, 8))
for i in range(np.max(kmc_model.labels_) + 1):
    # Compute the membership mask once and reuse it for both coordinates.
    in_cluster = kmc_model.labels_ == i
    plt.scatter(
        cvss_reduced[in_cluster, 0],
        cvss_reduced[in_cluster, 1],
        label=i,
        # Wrap around the palette so a larger N_CLUSTERS cannot raise IndexError.
        c=colors[i % len(colors)],
        alpha=0.5,
    )
plt.scatter(
    kmc_model.cluster_centers_[:, 0],
    kmc_model.cluster_centers_[:, 1],
    label='Cluster Centers',
    c="black",
    s=200,
)
plt.title("K-Means Clustering of CVSS Data", size=20)
plt.xlabel("Principal Component 1", size=16)
plt.ylabel("Principal Component 2", size=16)
plt.legend()
plt.show()

I started by using a label encoder to assign numbers to all the columns, since all the remaining columns were described by words. I then reduced the data down to 2 dimensions so that I could look at it from the most informative angle and easily decide the number of clusters needed.
I then used K-Means clustering to cluster the data. I first tried using 5 clusters as my hyperparameter, but found that this leaves the bottom-left data in the graph divided between the clusters on its right and left, which are quite far away from it, so I chose 6 clusters. Using 7 clusters unnecessarily divided up the area on the bottom right of the graph into 3 clusters, which I did not think was accurate.

In [None]:
# Reload the full dataset so that the score columns are available for the
# per-cluster summary. kmc_model.labels_ is used as a positional row mask,
# which assumes the file has the same rows, in the same order, as the frame
# that was clustered.
cvss_data = pd.read_csv('./data/cvss/CVSS_data_complete.csv')

# Number of clusters found by the fitted model.
k = np.max(kmc_model.labels_) + 1

# print the mean base score for each cluster (clusters are reported 1-based)
for i in range(k):
    mean = np.mean(cvss_data[kmc_model.labels_ == i]['v3_baseScore'])
    print("cluster number", i + 1, "has mean Base Score", np.asarray(mean), "\n")

# Encode the severity category as integers so that mean / std dev can be taken.
# NOTE(review): LabelEncoder assigns codes in sorted order of the category
# names, not in order of severity, so "Mean Base Severity" below is not on an
# ordinal scale. Consider an explicit severity-ordered mapping -- confirm the
# category values present in the data first.
le = LabelEncoder()
cvss_data['v3_baseSeverity'] = le.fit_transform(cvss_data['v3_baseSeverity'])

# One dataframe per cluster, holding that cluster's rows.
df_clusters = [cvss_data[kmc_model.labels_ == i] for i in range(k)]

# Summary table: one row per cluster, values rounded to 2 decimal places.
stat_dict = {
    'Cluster':                   list(range(1, k + 1)),
    'Size':                      [len(cluster) for cluster in df_clusters],
    'Mean Exploitability Score': [round(cluster['v3_exploitabilityScore'].mean(), 2) for cluster in df_clusters],
    'Mean Impact Score':         [round(cluster['v3_impactScore'].mean(), 2) for cluster in df_clusters],
    'Mean Base Severity':        [round(cluster['v3_baseSeverity'].mean(), 2) for cluster in df_clusters],
    'Std Dev Base Severity':     [round(cluster['v3_baseSeverity'].std(), 2) for cluster in df_clusters],
    'Mean Base Score':           [round(cluster['v3_baseScore'].mean(), 2) for cluster in df_clusters],
    'Std Dev Base Score':        [round(cluster['v3_baseScore'].std(), 2) for cluster in df_clusters],
}
df_cluster_stats = pd.DataFrame(stat_dict)
df_cluster_stats

Comparing the mean base score for the clusters shows that they are roughly divided into 2 categories, with clusters 1, 3 and 5 in one category, all having a low mean base score, and clusters 2, 4 and 6 in the other with high values. This trend is further shown in the value for mean base severity, where clusters 1, 3 and 5 have high values, while clusters 2, 4 and 6 have low ones. The largest differences between clusters 1, 3 and 5, however, lie in mean impact score, with all 3 having quite different values. Clusters 2, 4 and 6 also differ widely on impact score. Cluster 1 is also notable for having quite anomalous values for some columns, such as mean exploitability score, mean impact score and base severity standard deviation.

In [None]:
# Box plot of the base score distribution inside each K-Means cluster.
fig = plt.figure(figsize=(11, 7))
ax = fig.add_subplot(111)

# Derive the cluster count from the fitted model so that the data series and
# the tick labels below can never disagree.
n_clusters = np.max(kmc_model.labels_) + 1

# One series of base scores per cluster, in cluster order.
to_plot = [cvss_data[kmc_model.labels_ == i]['v3_baseScore'] for i in range(n_clusters)]

# boxplot (boxes are drawn at x = 1..n_clusters by default)
ax.boxplot(to_plot)

# category labels, overall labels and title
positions = list(range(1, n_clusters + 1))
ax.set_xticks(positions)
ax.set_xticklabels([f"Cluster {position}" for position in positions])
ax.set_xlabel("Cluster")
ax.set_ylabel("Base Score")
ax.set_title("Distribution of Base Score throughout the clusters")

plt.show()

Clearly cluster 1 has very little variation, which is supported by its std dev of 0.57, which is much lower than that of all the other clusters.

As said before, the base score is clearly divided into two groups, so one would assume one group is the "threat" one, containing clusters 2,4 and 6 while the other is the less threatening group, containing clusters 1,3 and 5. The difference between these two groups seems significant and noteworthy.

BITCOINHEIST

In [None]:
# Read the BitcoinHeist sample and discard the two columns that are not used.
bitcoin = pd.read_csv('./data/BitcoinHeistDataSample.csv').drop(columns=['address', 'day'])

# Report how many missing values each remaining column contains.
missing_per_column = bitcoin.isnull().sum()
print(missing_per_column)

# Columns whose values are inspected by eye for entries that make no sense.
wrong_value_check = ['year', 'income']

# Show the sorted distinct values of each inspected column.
for column_name in wrong_value_check:
    distinct_values = np.sort(bitcoin[column_name].unique())
    print(column_name , "column has unique values" , distinct_values)

In [None]:
# Grouped bar chart: number of ransom vs. white records for every year present
# in the data.
bit_years = np.sort(bitcoin['year'].unique())

# Every label other than "white" is counted as ransom.
is_white = bitcoin['label'] == 'white'
r_counts = [int((~is_white & (bitcoin['year'] == year)).sum()) for year in bit_years]
w_counts = [int((is_white & (bitcoin['year'] == year)).sum()) for year in bit_years]

fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)

colors = cm.plasma(np.arange(2) / 2.)

# Layout: one group per year, `group_spacing` apart; the white bar sits
# `white_offset` to the right of the ransom bar.
bar_width = 0.35
group_spacing = 3
white_offset = 0.4

# Positions are derived from the years actually present, so the chart adapts
# to however many years the data contains.
group_starts = np.arange(len(bit_years)) * group_spacing
ax.bar(group_starts, r_counts, bar_width, color=colors[0], label='ransom')
ax.bar(group_starts + white_offset, w_counts, bar_width, color=colors[1], label='white')

# add descriptions for each bar in the graph, as well as indexes and a title
ax.set_xticks(group_starts + white_offset / 2)  # centre the tick between the two bars
ax.set_xticklabels([str(year) for year in bit_years])
ax.set_xlabel("Year", fontsize=15)
ax.set_ylabel("Volume", fontsize=15)
ax.set_title("Comparison of the Volume of White and Ransom records throughout the years", fontsize=20)
ax.legend(fontsize=13)

plt.show()

In [None]:
# scale and split data with 64% for training, 20% for testing and 16% for validation
def data_scale_split(data, test_size=0.2, val_size=0.2, random_state=2420):
    """Split `data` into train / validation / test sets and standardise the features.

    The test set is split off first, then the validation set is taken from the
    remaining rows. With the defaults this yields 64% train, 16% validation
    and 20% test.

    Parameters
    ----------
    data : DataFrame containing the feature columns 'year', 'length',
        'weight', 'count', 'looped', 'neighbors', 'income' and the target
        column 'label'.
    test_size : fraction of all rows held out for testing.
    val_size : fraction of the remaining (non-test) rows held out for validation.
    random_state : seed used for both splits so that results are reproducible.

    Returns
    -------
    train_x_scaled, train_y, test_x_scaled, test_y, val_x_scaled, val_y
        Scaled feature arrays and the matching (unscaled) labels.
    """
    X = data[['year', 'length', 'weight', 'count', 'looped', 'neighbors', 'income']]
    y = data['label']

    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Seed the second split as well; otherwise the validation rows (and hence
    # the selected hyperparameter) change on every run.
    train_x, val_x, train_y, val_y = train_test_split(
        train_x, train_y, test_size=val_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, so that no statistics from the
    # validation or test rows leak into the scaling.
    ss_model = StandardScaler().fit(train_x)
    train_x_scaled = ss_model.transform(train_x)
    val_x_scaled = ss_model.transform(val_x)
    test_x_scaled = ss_model.transform(test_x)

    return train_x_scaled, train_y, test_x_scaled, test_y, val_x_scaled, val_y

train_x_scaled, train_y, test_x_scaled, test_y, val_x_scaled, val_y = data_scale_split(bitcoin)

# Search for the number of neighbours that gives the best validation accuracy.
best_k = -1
best_score = -1
best_model = None
for k in [3, 5, 7, 9, 11, 13, 15]:
    knn = KNeighborsClassifier(n_neighbors=k)    # only n_neighbors varies between candidates

    knn_model = knn.fit(train_x_scaled, train_y) # scaled X, un-scaled y

    train_score = knn_model.score(train_x_scaled, train_y)
    val_score = knn_model.score(val_x_scaled, val_y)

    print(k, "Training Score:", train_score, "Validation Score: ", val_score)

    # Keep the best candidate so far; `<=` means a tie goes to the larger k.
    if best_score <= val_score:
        best_score = val_score
        best_model = knn_model
        best_k = k

# Alias for the previously misspelled variable name, kept only in case other
# cells still refer to it.
besk_k = best_k

print(f'The best k is {best_k} and the best val score is {best_score:.4f}')


# evaluate the best model using the test set
print("Best Model Test Score: {:.4f}".format(best_model.score(test_x_scaled, test_y)))

I prepared the data by splitting it into training, test and validation sets. I then tried a wide range of values for k and used the validation set to decide which value works best as the parameter.

hyperparameters:
k=9 as justified above
training = 64%, test = 20%, val = 16%