Introducing SMOTE based on
https://www.kaggle.com/qianchao/smote-with-imbalance-data/notebook

In [1]:
import data_helpers
import os
import pandas as pd

# Toggle: True keeps only files whose name contains the category name
# (stored as dataframes[category][split]); False keeps the remaining files
# (stored as dataframes[file-name prefix][split]).
load_main = True

main_cats = ['ActionName', 'Capability']
dataframes = {cat: {} for cat in main_cats} if load_main else {}

# Presumably leftover index columns from earlier to_csv round-trips - verify.
# They are dropped only when present, so a CSV without them no longer raises
# KeyError.
index_artifacts = ["Unnamed: 0", "Unnamed: 0.1"]

for cat in main_cats:
    path = os.path.join(os.getcwd(), "Dataframes", cat, "Processed")
    # Sorted so the load order (and the key order of `dataframes`) is
    # deterministic instead of depending on the filesystem.
    ls = sorted(os.listdir(path))
    for file in ls:
        is_main_file = cat in file
        if is_main_file != load_main:
            continue

        df = pd.read_csv(os.path.join(path, file), encoding='utf-8')
        df = df.drop(index_artifacts, axis=1, errors="ignore")

        # File names are expected to look like "<prefix>-<split>-...".
        name_parts = file.split('-')
        df_type = name_parts[1]
        if load_main:
            dataframes[cat][df_type] = df
        else:
            sub_cat = name_parts[0]
            dataframes.setdefault(sub_cat, {})[df_type] = df

if not load_main:
    main_cats = list(dataframes.keys())

print("\tLoaded data!\n")
if load_main:
    print(list(dataframes['ActionName']['Train'].columns))
else:
    print(list(dataframes['File']['Train'].columns))


	Loaded data!

['label', 'text-rel', 'label_num', 'tokens', 'text-neigh', 'token', 'text-rel-processed', 'text-neigh-processed', 'text-rel-tokens', 'text-neigh-tokens']


In [2]:
import numpy as np

def load_embeddings(path):
    """Read a text embeddings file into a ``{word: vector}`` dictionary.

    Each useful line has the form ``word v1 v2 ... vn`` with single spaces
    between fields. Blank lines and lines with two or fewer fields (for
    example a "count dimension" header) are skipped.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each word to a float ``numpy`` array built from the rest of
        its line. A word that appears more than once keeps its last vector.
    """
    word_to_vector = {}

    with open(path, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(" ")
            # An empty line splits into a single empty field, so this one
            # check covers both blank lines and header-like short lines.
            if len(fields) > 2:
                # build the dictionary word -> vector
                word_to_vector[fields[0]] = np.array(fields[1:], dtype=float)
    return word_to_vector

# Load the word -> vector table once; the embedding cells further down read it
# through the global `mapping`.
mapping = load_embeddings('Embedding-Models/super-model.txt') 
print("\t Loaded embeddings model!")

	 Loaded embeddings model!


In [51]:
'''
Calculate a simple average embedding for each given sentence,
based on the loaded embeddings file (for example, vectors of length 50).

Words that don't exist in the mapping are skipped!
Therefore some possibly meaningful words (in a security context) are ignored.
'''

from sklearn.svm import SVC
from nltk import word_tokenize

from data_helpers import clean_sentence

import re
word_pattern = re.compile(r'[^a-zA-Z0-9-]')

from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def documents_to_ave_embeddings(docs, embeddings):
    """Represent every document as the mean of its known word vectors.

    Each document is passed through ``clean_sentence``, characters matched by
    ``word_pattern`` are replaced with spaces, and the text is tokenized.
    Tokens of 2 to 25 characters are lower-cased and looked up in
    ``embeddings``; tokens that are not in the table are skipped.

    Parameters
    ----------
    docs : iterable of str
        Raw documents / sentences.
    embeddings : dict
        Maps a word to its vector. All vectors are assumed to share one
        shape - TODO confirm for the embeddings file in use.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order. A document with no known
        word gets an all-zero vector.

    Raises
    ------
    ValueError
        If a document has no known word and ``embeddings`` is empty, so the
        size of the zero vector cannot be determined.
    """
    # Built lazily from the embeddings table. The previous implementation
    # copied the shape of result[0], which raised IndexError whenever the
    # very first document contained no known word.
    zero_vector = None

    result = []
    for doc in docs:
        new_doc = clean_sentence(doc)
        new_doc = re.sub(word_pattern, " ", new_doc)

        # Length filter is applied before lower-casing, as before.
        words = [
            word.lower()
            for word in word_tokenize(new_doc)
            if 1 < len(word) <= 25
        ]
        vectors = [embeddings[word] for word in words if word in embeddings]

        if vectors:
            result.append(np.average(vectors, axis=0))
            continue

        if zero_vector is None:
            if not embeddings:
                raise ValueError(
                    "cannot build a zero vector: the embeddings table is empty"
                )
            zero_vector = np.zeros(shape=np.shape(next(iter(embeddings.values()))))
        # Copy so callers can modify one row without touching the others.
        result.append(zero_vector.copy())

    return result

In [52]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc, roc_auc_score, roc_curve

from sklearn.model_selection import GridSearchCV

In [53]:
from imblearn.over_sampling import SMOTE

# Not referenced again in the visible cells; kept in case other cells use them.
tokens_count = 0
missing_tokens = 0

cat = 'Capability'
column = 'text-rel'

print(">>> {}\n".format(cat))

train = dataframes[cat]['Train']
test = dataframes[cat]['Test']

y_train, y_test = train['label_num'], test['label_num']

# Replace every run of non-ASCII characters with a single space before
# tokenizing. The substitution is idempotent, so re-running the cell is safe.
non_ascii = re.compile(r'[^\x00-\x7F]+')
train[column] = train[column].apply(lambda x: non_ascii.sub(' ', x))
test[column] = test[column].apply(lambda x: non_ascii.sub(' ', x))

train_transformed = documents_to_ave_embeddings(train[column], mapping)
test_transformed = documents_to_ave_embeddings(test[column], mapping)

print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {}".format((y_train == 0).sum()))

# Only the training split is oversampled; the test split keeps its original
# class balance.
sm = SMOTE(random_state=2)
# imbalanced-learn 0.4 renamed fit_sample to fit_resample and newer releases
# no longer ship the old name, so use whichever this installation provides.
resample = sm.fit_resample if hasattr(sm, "fit_resample") else sm.fit_sample
# .values gives the plain label array (Series.ravel() is deprecated in recent pandas).
train_res, y_train_res = resample(train_transformed, y_train.values)

print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

>>> Capability

Before OverSampling, counts of label '1': 2817
Before OverSampling, counts of label '0': 531
After OverSampling, counts of label '1': 2817
After OverSampling, counts of label '0': 2817


In [54]:
'''
Grid search because it's fancy.
'''

# Candidate values for C: ten evenly spaced points from 2 to 20.
parameters = dict(C=np.linspace(2, 20, 10))

# cv is not passed, so GridSearchCV uses its default number of folds.
# NOTE(review): the search is fitted on the SMOTE-resampled data, so synthetic
# samples may land in the validation folds and inflate the scores - confirm.
clf = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=parameters,
    scoring='f1',
    n_jobs=2,
    verbose=3,
)
clf.fit(train_res, y_train_res.ravel())

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=2.0 ...........................................................
[CV] C=2.0 ...........................................................
[CV] .................. C=2.0, score=0.7928531546621999, total=   1.0s
[CV] C=2.0 ...........................................................
[CV] .................. C=2.0, score=0.8434285714285713, total=   1.2s
[CV] C=4.0 ...........................................................
[CV] .................. C=2.0, score=0.8225712623618383, total=   1.0s
[CV] C=4.0 ...........................................................
[CV] ................... C=4.0, score=0.797532249018508, total=   1.3s
[CV] C=4.0 ...........................................................
[CV] .................... C=4.0, score=0.84505431675243, total=   1.5s
[CV] C=6.0 ...........................................................
[CV] .................. C=4.0, score=0.8288707799767172, total=   1.4s
[CV] C=6.0 .....

[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:   29.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'C': array([ 2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=3)

In [55]:
# Display the C value that won the grid search; the evaluation cell below
# hard-codes the value shown here.
clf.best_params_

{'C': 16.0}

In [56]:
def _fit_and_report(model, X_fit, y_fit, X_eval, y_eval, title):
    """Fit ``model``, print train/test F1 plus a classification report.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refitted in place on ``X_fit`` / ``y_fit``.
    X_fit, y_fit : training features and labels.
    X_eval, y_eval : held-out features and labels.
    title : str
        Heading printed above the scores.

    Returns
    -------
    tuple
        ``(train_f1, test_f1, y_pred)`` where ``y_pred`` are the predictions
        for ``X_eval``.
    """
    model.fit(X_fit, y_fit)

    # f1_score expects (y_true, y_pred); the earlier code passed them swapped.
    # Binary F1 is symmetric, so the printed numbers are unchanged.
    fit_f1 = f1_score(y_fit, model.predict(X_fit))
    predictions = model.predict(X_eval)
    eval_f1 = f1_score(y_eval, predictions)

    print("\t{}:".format(title))
    print("Train F1: {}".format(fit_f1))
    print("Test F1: {}".format(eval_f1))
    print(classification_report(y_eval, predictions))
    return fit_f1, eval_f1, predictions


# C = 16.0 is the value the grid search above reported as best.
# (class_weight='balanced' was tried as an alternative and is left disabled.)
clf = LogisticRegression(C=16.0)

# Baseline: vectorized original (imbalanced) training data.
train_f1, test_f1, y_pred = _fit_and_report(
    clf,
    train_transformed, train['label_num'],
    test_transformed, test['label_num'],
    "Before OverSampling",
)

# Same classifier refitted on the SMOTE-resampled training data; the test set
# is identical in both runs.
train_f1, test_f1, y_pred = _fit_and_report(
    clf,
    train_res, y_train_res.ravel(),
    test_transformed, test['label_num'],
    "After OverSampling",
)

	Before OverSampling:
Train F1: 0.9488823427482238
Test F1: 0.8972431077694237
             precision    recall  f1-score   support

          0       0.61      0.36      0.45        47
          1       0.86      0.94      0.90       190

avg / total       0.81      0.83      0.81       237

	After OverSampling:
Train F1: 0.9024168635289841
Test F1: 0.8670212765957447
             precision    recall  f1-score   support

          0       0.47      0.51      0.49        47
          1       0.88      0.86      0.87       190

avg / total       0.80      0.79      0.79       237



As we can see, oversampling did help to recognize the negative class for Capability (recall 0.36 -> 0.51, f1-score 0.45 -> 0.49).
However, this is not necessarily what we want, because the f1-score for the positive class decreased (0.90 -> 0.87) and its recall dropped (0.94 -> 0.86),
so more positive examples will be missed - which means that no Capability category is assigned to those sentences.