In [1]:
import pandas as pd
import numpy as np
import re
import dill
import tqdm
import gc
import gensim

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
from gensim.models import Word2Vec

model = Word2Vec.load('./basic_w2v_sg10ep.bin')
print(model)

2019-02-05 05:58:03,252 : INFO : loading Word2Vec object from ./basic_w2v_sg10ep.bin
2019-02-05 05:58:03,549 : INFO : loading wv recursively from ./basic_w2v_sg10ep.bin.wv.* with mmap=None
2019-02-05 05:58:03,550 : INFO : loading vectors from ./basic_w2v_sg10ep.bin.wv.vectors.npy with mmap=None
2019-02-05 05:58:03,620 : INFO : setting ignored attribute vectors_norm to None
2019-02-05 05:58:03,621 : INFO : loading vocabulary recursively from ./basic_w2v_sg10ep.bin.vocabulary.* with mmap=None
2019-02-05 05:58:03,622 : INFO : loading trainables recursively from ./basic_w2v_sg10ep.bin.trainables.* with mmap=None
2019-02-05 05:58:03,622 : INFO : loading syn1neg from ./basic_w2v_sg10ep.bin.trainables.syn1neg.npy with mmap=None
2019-02-05 05:58:03,694 : INFO : setting ignored attribute cum_table to None
2019-02-05 05:58:03,694 : INFO : loaded ./basic_w2v_sg10ep.bin


Word2Vec(vocab=148741, size=300, alpha=0.025)


In [6]:
with open('Xtrain_norm.pkl', 'rb') as f:
    X_train = []
    while True:
        try:
            X_train.extend(dill.load(f))
        except:
            print('EOF reached')
            break
            
with open('Xtest_norm.pkl', 'rb') as f:
    X_test = []
    while True:
        try:
            X_test.extend(dill.load(f))
        except:
            print('EOF reached')
            break
            
with open('ytrain_labels.pkl', 'rb') as f:
    y_train = dill.load(f)
    
with open('ytest_labels.pkl', 'rb') as f:
    y_test = dill.load(f)
    
len(X_train), len(X_test), len(y_train), len(y_test)

EOF reached
EOF reached


(481390, 120348, 481390, 120348)

In [7]:
import nltk
# tokenize sentences in corpus
wpt = nltk.WordPunctTokenizer()
tokenized_train = [wpt.tokenize(document) for document in X_train]
tokenized_test = [wpt.tokenize(document) for document in X_test]

In [8]:
model.vector_size

300

In [11]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

 
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [12]:
# get document level embeddings
train_features = averaged_word_vectorizer(corpus=tokenized_train, model=model,
                                             num_features=model.vector_size)

test_features = averaged_word_vectorizer(corpus=tokenized_test, model=model,
                                             num_features=model.vector_size)

train_features.shape, test_features.shape

((481390, 300), (120348, 300))

In [13]:
with open('./Xtrain_embeddings_w2v.pkl', 'wb') as f:
    dill.dump(train_features, f)
    
with open('./Xtest_embeddings_w2v.pkl', 'wb') as f:
    dill.dump(test_features, f)

In [46]:
train_positives = []
y_train_positives = []
for idx, label in enumerate(y_train):
    if label != 0:
        train_positives.append(train_features[idx])
        y_train_positives.append(label)

test_positives = []
y_test_positives = []
for idx, label in enumerate(y_test):
    if label != 0:
        test_positives.append(test_features[idx])
        y_test_positives.append(label)
        
train_positives = np.array(train_positives)
y_train_positives = np.array(y_train_positives)
test_positives = np.array(test_positives)
y_test_positives = np.array(y_test_positives)
train_positives.shape, y_train_positives.shape, test_positives.shape, y_test_positives.shape

((67400, 300), (67400,), (16851, 300), (16851,))

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

lr = GradientBoostingClassifier(n_estimators=50, verbose=2)
lr.fit(train_positives, y_train_positives)

      Iter       Train Loss   Remaining Time 
         1           0.0816            1.66m
         2           0.0785            1.67m
         3           0.0775            1.67m
         4           0.0757            1.65m
         5           0.0748            1.62m
         6           0.0735            1.59m
         7           0.0727            1.56m
         8           0.0717            1.53m
         9           0.0710            1.50m
        10           0.0705            1.47m
        11           0.0698            1.44m
        12           0.0688            1.42m
        13           0.0686            1.39m
        14           0.0680            1.36m
        15           0.0675            1.32m
        16           0.0671            1.28m
        17           0.0667            1.24m
        18           0.0660            1.20m
        19           0.0655            1.17m
        20           0.0650            1.13m
        21           0.0646            1.09m
        2

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=2,
              warm_start=False)

In [33]:
import model_evaluation_utils as meu

y_pred = lr.predict(test_positives)
meu.display_model_performance_metrics(true_labels=y_test_positives, predicted_labels=y_pred, 
                                      classes=list(set(y_test_positives)))

Model Performance metrics:
------------------------------
Accuracy: 0.993
Precision: 0.9886
Recall: 0.993
F1 Score: 0.9905

Model Classification report:
------------------------------
             precision    recall  f1-score   support

          1       0.99      1.00      1.00     16744
          2       0.18      0.03      0.05       107

avg / total       0.99      0.99      0.99     16851


Prediction Confusion Matrix:
------------------------------
          Predicted:    
                   1   2
Actual: 1      16730  14
        2        104   3


In [36]:
from sklearn.svm import OneClassSVM

osvm = OneClassSVM()
osvm.fit(train_positives)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.5, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [42]:
preds = osvm.predict(test_positives)
preds = [1 if item == -1 else 2 for item in preds]

In [43]:
from collections import Counter
Counter(preds)

Counter({1: 10566, 2: 6285})

In [44]:
meu.display_model_performance_metrics(true_labels=y_test_positives, predicted_labels=preds, 
                                      classes=list(set(y_test_positives)))

Model Performance metrics:
------------------------------
Accuracy: 0.6273
Precision: 0.9889
Recall: 0.6273
F1 Score: 0.7653

Model Classification report:
------------------------------
             precision    recall  f1-score   support

          1       1.00      0.63      0.77     16744
          2       0.01      0.52      0.02       107

avg / total       0.99      0.63      0.77     16851


Prediction Confusion Matrix:
------------------------------
          Predicted:      
                   1     2
Actual: 1      10515  6229
        2         51    56


In [47]:
import xgboost as xgb

In [49]:
xgc = xgb.XGBClassifier(max_depth=5, n_jobs=4, verbose=1)
xgc.fit(train_positives, y_train_positives)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1, verbose=2)

In [50]:
y_pred = lr.predict(test_positives)
meu.display_model_performance_metrics(true_labels=y_test_positives, predicted_labels=y_pred, 
                                      classes=list(set(y_test_positives)))

Model Performance metrics:
------------------------------
Accuracy: 0.993
Precision: 0.9886
Recall: 0.993
F1 Score: 0.9905

Model Classification report:
------------------------------
             precision    recall  f1-score   support

          1       0.99      1.00      1.00     16744
          2       0.18      0.03      0.05       107

avg / total       0.99      0.99      0.99     16851


Prediction Confusion Matrix:
------------------------------
          Predicted:    
                   1   2
Actual: 1      16730  14
        2        104   3
