# Classifier - Unbalanced

With the features selected, we can now train a model to classify the samples. We'll start with the unbalanced dataset.

In [1]:
from IPython.display import display
import pandas as pd
import numpy as np
import math
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from scipy import interp

# Input file locations, all relative to the data folder.
data_folder = '../data/'
final_dataset_file = ''.join((data_folder, 'dataset_v1.csv.gz'))
selected_imports_file = ''.join((data_folder, 'selected_imports.csv.gz'))

# Settings handed to CountVectorizer further down:
# the stop-word list, and a token pattern matching any run of characters other than ';'.
vec_stop_words = ['*invalid*']
cv_token_pattern = u'[^;]+'
# Remove imports' extension
def token_preprocessor(s):
    """Return the part of ``s`` that precedes its first '.'.

    The previous implementation split on the regex ``'\\..{0,3}'`` (written as a
    non-raw string, which makes ``\\.`` an invalid escape sequence) and kept
    element 0 of the result. Element 0 of that split is always the text before
    the first '.', so ``str.partition`` yields the same value without the regex.

    Parameters
    ----------
    s : str
        Text to strip, e.g. ``'kernel32.dll'``.

    Returns
    -------
    str
        ``s`` up to (not including) the first '.'; ``s`` unchanged when it
        contains no '.'.

    NOTE(review): scikit-learn's CountVectorizer applies a custom
    ``preprocessor`` to the whole document before tokenizing. If a document
    holds several ';'-separated imports, only the text before the first '.'
    would survive -- confirm this matches the intended per-import behaviour.
    """
    return s.partition('.')[0]

In [2]:
# Read the dataset and use its 'link' column as the row index
dataset = pd.read_csv(final_dataset_file).set_index('link')

# Read the selected features: the values stored in column '0' of the file
features = pd.read_csv(selected_imports_file)['0'].values

## Create CountVectorizer

In [13]:
# Build the vectorizer with the vocabulary fixed to the selected features,
# then fit it on the 'dlls' column; the fitted estimator is the cell's output.
cv = CountVectorizer(
    vocabulary=features,
    preprocessor=token_preprocessor,
    stop_words=vec_stop_words,
    token_pattern=cv_token_pattern,
)
cv.fit(dataset.dlls)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function token_preprocessor at 0x7f5038f04bf8>,
        stop_words=['*invalid*'], strip_accents=None,
        token_pattern='[^;]+', tokenizer=None,
        vocabulary=array(['steam_api', 'pshed', ..., 'vcl60', 'version'], dtype=object))

In [20]:
# %%time
# classifier = svm.SVC(probability=True, class_weight='balanced', cache_size=1000)
# probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
# Compute ROC curve and area under the curve
# fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
# display(fpr, tpr, thresholds)

In [22]:
# %%time
# Cross-validated ROC analysis: fit the classifier on every stratified fold,
# draw each fold's ROC curve, then overlay the mean curve and a +/- 1 std band.
skf = StratifiedKFold(n_splits=10)
# Alternative classifiers that were tried:
# classifier =  LogisticRegression()
# classifier =  SGDClassifier(max_iter=1000, n_jobs=-1, loss='modified_huber', class_weight='balanced')
classifier = svm.SVC(probability=True, class_weight='balanced', cache_size=2000)

tprs = []
aucs = []
# Common FPR grid onto which every fold's TPR is interpolated so folds can be averaged
mean_fpr = np.linspace(0, 1, 100)
X = cv.transform(dataset.dlls)
y = dataset.malware
fig, ax = plt.subplots(figsize=(10, 4), dpi=100)

for i, (train, test) in enumerate(skf.split(X, y)):
    # `train`/`test` are positional indices while `y` is indexed by 'link',
    # so rows are selected by position with .iloc rather than by label.
    probas_ = classifier.fit(X[train], y.iloc[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    # np.interp replaces scipy's `interp`, which is a deprecated alias of it
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # force the interpolated curve to start at TPR = 0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    ax.plot(fpr, tpr, lw=1, alpha=0.3,
            label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Luck', alpha=.8)

# Mean ROC over the folds
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the mean curve to end at TPR = 1
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Band of one standard deviation around the mean TPR, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
fig.savefig('roc.png')
plt.show()

NameError: name 'class_weight' is not defined