### Modules

In [28]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

import pickle
import requests, os, sys

from sklearn import model_selection
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix

import xgboost as xgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_seq_items', 1000)
%matplotlib inline

# Data

### Data

In [29]:
# calling bitcoin heist data
df = pd.read_csv('../../../data/external/BitcoinHeistData.csv').rename(columns = {'label':'ransomware'})

### Labelling

In [30]:
# find list of ransomwares
ransomwares = df.ransomware.unique().tolist()

ransomwares

['princetonCerber',
 'princetonLocky',
 'montrealCryptoLocker',
 'montrealCryptXXX',
 'paduaCryptoWall',
 'montrealWannaCry',
 'montrealDMALockerv3',
 'montrealCryptoTorLocker2015',
 'montrealSamSam',
 'montrealFlyper',
 'montrealNoobCrypt',
 'montrealDMALocker',
 'montrealGlobe',
 'montrealEDA2',
 'paduaKeRanger',
 'montrealVenusLocker',
 'montrealXTPLocker',
 'paduaJigsaw',
 'montrealGlobev3',
 'montrealJigSaw',
 'montrealXLockerv5.0',
 'montrealXLocker',
 'montrealRazy',
 'montrealCryptConsole',
 'montrealGlobeImposter',
 'montrealSam',
 'montrealComradeCircle',
 'montrealAPT',
 'white']

In [31]:
## CHOOSE LABELS HERE
labels = [
             'montrealCryptoLocker',
             'paduaCryptoWall'
            ]

In [32]:
# column for adding label
def add_label(row):
    if row['ransomware'] in labels:
        return 1
    else:
        return 0

In [33]:
# adding label
df['label'] = df.apply(add_label, axis=1)

In [34]:
# reordering columns
df = df[['label', 'ransomware'] + [col for col in df.columns if col not in ['label', 'ransomware']]]

In [35]:
# see the counts of each label
df.label.value_counts()

0    2894992
1      21705
Name: label, dtype: int64

In [36]:
# see the percentage of each label
df.label.value_counts()/df.label.count()*100

0    99.255836
1     0.744164
Name: label, dtype: float64

In [37]:
df.head()

Unnamed: 0,label,ransomware,address,year,day,length,weight,count,looped,neighbors,income
0,0,princetonCerber,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,100050000.0
1,0,princetonLocky,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,100000000.0
2,0,princetonCerber,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.0,1,0,2,200000000.0
3,0,princetonCerber,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,71200000.0
4,0,princetonLocky,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,200000000.0


### Splitting data

In [38]:
# selecting dates appropriate for cerber
df = df[(df['year'] >= 2013) & (df['year'] <= 2015)]

In [39]:
# sorting values by time
df = df.sort_values(by=['year', 'day']).reset_index().drop(columns=['index'])

In [40]:
df.head()

Unnamed: 0,label,ransomware,address,year,day,length,weight,count,looped,neighbors,income
0,1,montrealCryptoLocker,16cVG72goMe4sNqZhnpmnqfCMZ1uSFbUit,2013,1,0,0.5,1,0,2,65500000.0
1,1,montrealCryptoLocker,1BCRuRA9mgVuhSkLBgvVrgSP6MZ3ri9Xrt,2013,1,78,4.768372e-07,1,0,1,47569450.0
2,1,montrealCryptoLocker,1BzhCdy3TtFJoYqd6fFdYFkPLz3phzxK6Y,2013,1,38,0.0009765625,1,0,1,34000000.0
3,1,montrealCryptoLocker,1NGtvWJUuFEq2wma5YEi9n3TD7EDUj1LjU,2013,1,6,0.009844322,4,0,1,50311924.0
4,0,white,1onFsTV86TP3PqyMfRHb4idcy1AskVo6m,2013,1,90,0.03125,1217,0,2,73835034.0


In [41]:
# SET TEST SIZE HERE
test_size = 0.2
train_size = int(df.shape[0]*(1-test_size))

raw_train_df = df[df.index <= train_size]
raw_test_df = df[df.index > train_size]

print(f'Raw training data size: {raw_train_df.shape[0]}, Raw testing data size: {raw_test_df.shape[0]}')

Raw training data size: 893212, Raw testing data size: 223302


### Balancing training data

In [42]:
# see the percentage of each label 
raw_train_df.label.value_counts()/raw_train_df.label.count()*100

0    97.98133
1     2.01867
Name: label, dtype: float64

In [43]:
# isolate labelled data
label_train_df = raw_train_df[raw_train_df.label == 1]
label_train_size = label_train_df.label.count()

# isolate unlabelled data
unlabel_train_df = raw_train_df[raw_train_df.label == 0]

# sample unlabelled data to equal the size of the labelled data
reduced_unlabel_train_df = unlabel_train_df.sample(n=label_train_size, random_state=11)

# combine to make final training data
train_df = (pd.concat([label_train_df, reduced_unlabel_train_df])
                        .sort_values(by=['year', 'day'])
                        .reset_index()
                        .drop(columns=['index']))

In [44]:
# see the percentage of each label 
train_df.label.value_counts()/train_df.label.count()*100

1    50.0
0    50.0
Name: label, dtype: float64

In [45]:
train_df.shape

(36062, 11)

### Balancing test data

In [46]:
# see the percentage of each label 
raw_test_df.label.value_counts()/raw_test_df.label.count()*100

0    98.703549
1     1.296451
Name: label, dtype: float64

In [47]:
# set the percent of unlabelled data required in test data (taken from complete data set)
unlabel_test_percent = 0.9858014

In [48]:
# find the number of labelled datapoints in the test data
label_test_size = raw_test_df[raw_test_df.label == 1].shape[0]

In [49]:
# compute the amount of unlabelled data required to simulate the entire dataset
unlabelled_test_size = int((unlabel_test_percent*label_test_size)/(1-unlabel_test_percent))


In [50]:
# isolate labelled data
label_test_df = raw_test_df[raw_test_df.label == 1]

# isolate unlabelled data
unlabel_test_df = raw_test_df[raw_test_df.label == 0]

# sample unlabelled data of the correct size
reduced_unlabel_test_df = unlabel_test_df.sample(n=unlabelled_test_size, random_state=11)

# combine to make final training data
test_df = (pd.concat([label_test_df, reduced_unlabel_test_df])
                        .sort_values(by=['year', 'day'])
                        .reset_index()
                        .drop(columns=['index']))

In [51]:
# see the percentage of each label 
test_df.label.value_counts()/test_df.label.count()*100

0    98.580138
1     1.419862
Name: label, dtype: float64

In [52]:
test_df.shape

(203893, 11)

### Scale

In [53]:
X = np.array(train_df.loc[:, 'day':])
y = np.array(train_df.loc[:, 'label'])
X_test = np.array(test_df.loc[:, 'day':])
y_test = np.array(test_df.loc[:, 'label'])

In [54]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [55]:
X.shape, X_test.shape

((36062, 7), (203893, 7))

In [56]:
# finding the percent of the data used to train model
X.shape[0]*100/df.shape[0]

3.229874412680898

### Add polynomial features

In [57]:
# # adding polynomial features
# poly = PolynomialFeatures(degree=2) 


# # add degree two polynomials to budget
# X_budg = numpy.delete(poly.fit_transform(np.array(X_df[['budget']])), obj=0, axis=1) 
# X_budg_test = numpy.delete(poly.transform(np.array(X_test_df[['budget']])), obj=0, axis=1) 

# Modelling

## Logistic regression

### Cross validate a working logistic regression model

In [2]:
kf = KFold(n_splits=3, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []
f1_scores = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    lr_model = linear_model.LogisticRegression(solver="lbfgs", C=100)
    lr_model.fit(X_train, y_train)
    y_pred = (lr_model.predict_proba(X_val)[:, 1] >= 0.49)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')
print(f'F1 score: {np.mean(f1_scores):.3f} +- {np.std(f1_scores):.3f}')

NameError: name 'KFold' is not defined

### Threshold tuning

In [None]:
## ACCURACY PEAKS AT 0.49 BUT 0.4 MIGHT BE BETTER FOR THE SAKE OF RECALL
for i in np.linspace(0,1,101):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        lr_model = linear_model.LogisticRegression(solver="lbfgs", C=100)
        lr_model.fit(X_train, y_train)
        y_pred = (lr_model.predict_proba(X_val)[:, 1] >= i)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Threshold: {i:.2f}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Tuning C-value

In [None]:
## ACURACY AND RECALL IMPROVES WITH A C-VALUE OF 100 BUT RECALL IS BEST WITH A VERY LOW C-VALUE
c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for i in c_values:
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        lr_model = linear_model.LogisticRegression(solver="lbfgs", C=i)
        lr_model.fit(X_train, y_train)
        y_pred = (lr_model.predict_proba(X_val)[:, 1] >= 0.49)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'C-value: {i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Tuning solver

In [None]:
## UNINTERESTING AS USUAL
solver_choices = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

for i in solver_choices:
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        lr_model = linear_model.LogisticRegression(solver=i, C=100)
        lr_model.fit(X_train, y_train)
        y_pred = (lr_model.predict_proba(X_val)[:, 1] >= 0.49)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Solver: {i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Train and test final logistic regression model

In [None]:
lr_model = linear_model.LogisticRegression(solver="lbfgs", C=100)
lr_model.fit(X, y)
y_pred = (lr_model.predict_proba(X_test)[:, 1] >= 0.47)

In [None]:
# SHAME
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'ROC AUC: {roc_auc:.3f}')
print(f'F1 score: {f1:.3f}')

### Analyze features of final logistic regression model

In [None]:
# MOST IMPORTANT FEATURES PREDICTING REMAIN ARE PAY, MIGRANT SHARE, SCOTLAND
importances = dict(set(zip(train_df.loc[:, 'year':].columns, lr_model.coef_[0])))

importances

## Naive Bayes

### Cross validate a Gaussian naive Bayes model

In [3]:
## GAUSSIAN HAS BETTER RECALL
kf = KFold(n_splits=3, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []
f1_scores = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    nb_model = naive_bayes.GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred = (nb_model.predict_proba(X_val)[:, 1] >= 0.99)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')
print(f'F1 score: {np.mean(f1_scores):.3f} +- {np.std(f1_scores):.3f}')

NameError: name 'KFold' is not defined

### Cross validate a Bernoulli naive Bayes model

In [None]:
## BERNOULLI HAS BETTER ACCURACY
kf = KFold(n_splits=3, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []
f1_scores = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    nb_model = naive_bayes.BernoulliNB()
    nb_model.fit(X_train, y_train)
    y_pred = (nb_model.predict_proba(X_val)[:, 1] >= 0.46)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')
print(f'F1 score: {np.mean(f1_scores):.3f} +- {np.std(f1_scores):.3f}')

### Threshold tuning

In [None]:
## GAUSSIAN ACCURACY IS RUBBISH UNTIL ABOVE 0.8 BUT RECALL IS ALWAYS GOOD
## BERNOULLI ACCURACY PEAKS AT 0.46 BUT RECALL IS BETTER AROUND 0.3
for i in np.linspace(0,1,101):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        nb_model = naive_bayes.BernoulliNB()
        nb_model.fit(X_train, y_train)
        y_pred = (nb_model.predict_proba(X_val)[:, 1] >= i)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Threshold: {i:.2f}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

## Random forests

### Cross validate a working random forest model

In [58]:
kf = KFold(n_splits=10, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []
f1_scores = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    rf_model = ensemble.RandomForestClassifier(n_estimators=11, 
                                               random_state=11, 
                                               criterion='entropy', 
                                               max_depth=18, 
                                               min_samples_split=15, 
                                               max_features=1,
                                               max_leaf_nodes=1000,
                                               min_samples_leaf=11,
                                               max_samples=0.45)
    
    rf_model.fit(X_train, y_train)
    y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.44)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')
print(f'F1 score: {np.mean(f1_scores):.3f} +- {np.std(f1_scores):.3f}')

Accuracy: 0.736 +- 0.008
Precision: 0.692 +- 0.009
Recall: 0.849 +- 0.010
ROC AUC: 0.736 +- 0.008
F1 score: 0.763 +- 0.008


### Threshold tuning

In [None]:
## CLEAR WINNER IS 0.41 WITH BEST ACCURACY AND STILL HIGH RECALL
## WITH 25 ESTIMATORS ACCURACY IMPROVES NEARER TO 0.5 BUT RECALL DECREASES THERE, 0.45 IS A COMPROMISE
## WITH 25 ESTIMATORS AND ENTROPY CRITERION BEST IS ACTUALLY 0.49
for i in np.linspace(0.3,0.6,31):
    kf = KFold(n_splits=3, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, 
                                               random_state=11, 
                                               criterion='entropy')
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= i)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Threshold: {i:.2f}, Accuracy: {mean_accuracy:.3f}, Recall: {mean_recall:.3f}')

### Number of estimators tuning

In [None]:
## BEST SEEMS TO BE 25
for i in range(1,30):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=i, random_state=11, criterion='entropy')
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Number of estimators: {i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Entropy criterion

In [None]:
## ENTROPY CRITERION SLIGHTLY IMPROVES ACCURACY AND RECALL
kf = KFold(n_splits=5, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []
f1_scores = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    rf_model = ensemble.RandomForestClassifier(n_estimators=25, random_state=11, criterion='entropy')
    rf_model.fit(X_train, y_train)
    y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')
print(f'F1 score: {np.mean(f1_scores):.3f} +- {np.std(f1_scores):.3f}')

### Max depth tuning

In [None]:
## BEST SEEMS TO BE 27
for i in range(2,25):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, random_state=11, criterion='entropy', max_depth=i)
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Max depth: {i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Min samples split tuning

In [None]:
## BEST SEEMS TO BE 5
for i in range(2,20):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, 
                                                   random_state=11, 
                                                   criterion='entropy', 
                                                   max_depth=27,
                                                   min_samples_split=i)
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Min samples split: {i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Max leaf nodes tuning

In [None]:
## ACCURACY CANT BE IMPROVED WITH MAX LEAF NODES BUT RECALL IS AWESOME AT MAX LEAF NODES OF 2
for i in range(1,20):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, 
                                                   random_state=11, 
                                                   criterion='entropy', 
                                                   max_depth=27,
                                                   min_samples_split=5, 
                                                   max_leaf_nodes=2**i)
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Max leaf nodes: {2**i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Min samples leaf tuning

In [None]:
## DOESN'T SEEM TO HELP ACCURACY, BUT IMPROVES RECALL SLIGHTLY AT LOW NUMBERS
for i in range(1,20):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, 
                                                   random_state=11, 
                                                   criterion='entropy', 
                                                   max_depth=27,
                                                   min_samples_split=5, 
                                                   min_samples_leaf=i)
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Min samples leaf: {i}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Max samples tuning

In [None]:
## DOESN'T SEEM TO BE HELPFUL
for i in np.linspace(0.01,0.99,30):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, 
                                                   random_state=11, 
                                                   criterion='entropy', 
                                                   max_depth=27,
                                                   min_samples_split=5, 
                                                   max_samples=i)
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Max samples (pct of total): {i*100:.1f}, Accuracy: {mean_accuracy:.3f}, Recall: {mean_recall:.3f}')

### Max features tuning

In [None]:
## ACCURACY PEAKS AT 6, RECALL IS BEST AT LOWER FEATURES
for i in range(1,9):
    kf = KFold(n_splits=5, shuffle=True, random_state = 11)
    accuracies = [] 
    recalls = [] 
    
    for train_ind, val_ind in kf.split(X,y):
        
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind] 
        
        rf_model = ensemble.RandomForestClassifier(n_estimators=25, 
                                                   random_state=11, 
                                                   criterion='entropy', 
                                                   max_depth=27,
                                                   min_samples_split=5, 
                                                   max_features=i)
        rf_model.fit(X_train, y_train)
        y_pred = (rf_model.predict_proba(X_val)[:, 1] >= 0.45)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
    
    mean_accuracy = np.mean(accuracies)
    mean_recall = np.mean(recalls)
    
    print(f'Max features: {i:.0f}, Accuracy: {mean_accuracy:.5f}, Recall: {mean_recall:.5f}')

### Train and test final random forest model

In [71]:
# train best random forest model
best_rf_model = ensemble.RandomForestClassifier(n_estimators=11, 
                                               random_state=11, 
                                               criterion='entropy', 
                                               max_depth=18, 
                                               min_samples_split=15, 
                                               max_features=1,
                                               max_leaf_nodes=1000,
                                               min_samples_leaf=11,
                                               max_samples=0.45)
best_rf_model.fit(X_train, y_train)
y_pred = (best_rf_model.predict_proba(X_test)[:, 1] >= 0.5)

In [72]:
# see what percent were predicted to be ransomware
np.sum(y_pred)/y_pred.shape[0]

0.2852182272074078

In [73]:
# create a confusion matrix
confusion_matrix(y_test, y_pred)

array([[144563,  56435],
       [  1176,   1719]])

In [74]:
# test the final model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'ROC AUC: {roc_auc:.3f}')
print(f'F1 score: {f1:.3f}')

Accuracy: 0.717
Precision: 0.030
Recall: 0.594
ROC AUC: 0.657
F1 score: 0.056


### Analyze features of final random forest model

In [None]:
# MOST IMPORTANT FEATURES PREDICTING REMAIN ARE PAY, MIGRANT SHARE, SCOTLAND
importances = dict(set(zip(train_df.loc[:, 'day':].columns, best_rf_model.feature_importances_)))

importances

## Gradient boosted trees

In [None]:
kf = KFold(n_splits=10, shuffle=True, random_state = 11)
accuracies = [] 
precisions = [] 
recalls = [] 
roc_aucs = []
f1_scores = []

for train_ind, val_ind in kf.split(X,y):
    
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind] 
    
    gb_model = ensemble.GradientBoostingClassifier()
    
    gb_model.fit(X_train, y_train)
    y_pred = (gb_model.predict_proba(X_val)[:, 1] >= 0.5)
    
    accuracies.append(accuracy_score(y_val, y_pred))
    precisions.append(precision_score(y_val, y_pred))
    recalls.append(recall_score(y_val, y_pred))
    roc_aucs.append(roc_auc_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

print(f'Accuracy: {np.mean(accuracies):.3f} +- {np.std(accuracies):.3f}')
print(f'Precision: {np.mean(precisions):.3f} +- {np.std(precisions):.3f}')
print(f'Recall: {np.mean(recalls):.3f} +- {np.std(recalls):.3f}')
print(f'ROC AUC: {np.mean(roc_aucs):.3f} +- {np.std(roc_aucs):.3f}')
print(f'F1 score: {np.mean(f1_scores):.3f} +- {np.std(f1_scores):.3f}')