In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import os


import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
!pip install gensim==3.8.1



In [3]:
!pip install xgboost



In [2]:
import xgboost as xgb

In [3]:
DATA_DIRECTORY = 'datasets/'

# Load up the Large BOW data

In [4]:
BOW_DUMP = "bow_lg.p"
bow_df = pickle.load(open(os.path.join(DATA_DIRECTORY, BOW_DUMP), "rb"))

In [5]:
bow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 10375 entries, class_x to dadaji
dtypes: int64(10374), object(1)
memory usage: 3.4+ GB


In [6]:
bow_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,masla,kennedi,vyapam,bred,obc,bechari,marina,gana,dhani,dadaji
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
bow_df.loc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
bechari                                                     0
marina                                                      0
gana                                                        0
dhani                                                       0
dadaji                                                      0
Name: 201, Length: 10375, dtype: object

## Split the dataset

In [7]:
X_train, X_test, y_train, y_test = train_test_split(bow_df.iloc[:, 3:], bow_df.class_x, test_size=0.2, 
                                                    stratify=bow_df.class_x, random_state=42)

In [8]:
len(X_train), len(y_train)

(35698, 35698)

In [9]:
len(X_test), len(y_test)

(8925, 8925)

## First lets try logisticregression

In [10]:
lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', class_weight='balanced', solver='saga', max_iter=1000).fit(X_train, y_train)

In [11]:
y_pred = lr_clf.predict(X_test)

In [12]:
lr_clf.score(X_test, y_test)

0.871484593837535

In [13]:
lr_clf.score(X_train, y_train)

0.9374474760490784

In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.75      0.68      1384
           1       0.95      0.88      0.92      3838
           2       0.90      0.91      0.91      3703

    accuracy                           0.87      8925
   macro avg       0.83      0.85      0.83      8925
weighted avg       0.88      0.87      0.88      8925



### Parameter tuning

In [8]:
clf = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000)

In [9]:
params = {'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
          'class_weight': [None, 'balanced']
         }

In [15]:
gs_clf = GridSearchCV(clf, params, n_jobs=1).fit(X_train, y_train)

In [16]:
gs_clf.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
y_pred = gs_clf.predict(X_test)

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.61      0.66      1384
           1       0.93      0.92      0.92      3838
           2       0.88      0.94      0.91      3703

    accuracy                           0.88      8925
   macro avg       0.84      0.82      0.83      8925
weighted avg       0.88      0.88      0.88      8925



## Next try random forests 

In [16]:
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=1500, max_depth=6).fit(X_train, y_train)

In [17]:
rf_clf.score(X_test, y_test)

0.7859943977591036

#### Run some test runs with the grid

In [18]:
params = {"n_estimators": [500],
          "max_depth": [2, 4, 6, 8], 
          "criterion": ["gini", "entropy"],
          "max_features": ["sqrt", "log2"],
          "class_weight": ["balanced", "balanced_subsample", None]}

In [19]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, )
rf_tuned = GridSearchCV(rf, params).fit(X_train, y_train)


In [20]:
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=8, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [21]:
rf_tuned.score(X_test, y_test)

0.8343977591036414

In [22]:
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.67      0.62      1384
           1       0.92      0.89      0.90      3838
           2       0.87      0.84      0.85      3703

    accuracy                           0.83      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.83      0.84      8925



## XGBoost

In [13]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [14]:
params = {
    'max_depth': 6,
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 3
}

In [15]:
xgb_clf = xgb.train(params, dtrain)

In [16]:
xgb_pred = xgb_clf.predict(dtest)

In [17]:
xgb_pred

array([2., 2., 1., ..., 2., 1., 1.], dtype=float32)

In [18]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.80      0.42      0.55      1384
           1       0.93      0.89      0.91      3838
           2       0.80      0.98      0.88      3703

    accuracy                           0.85      8925
   macro avg       0.84      0.76      0.78      8925
weighted avg       0.86      0.85      0.84      8925



### See if it can be tuned to better

In [20]:
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20 ] ,
     "max_depth"        : [ 5, 6, 8],
     "gamma"            : np.linspace(.01, 1, 10, endpoint=True),
     "eval_metric"      : ['merror', 'mlogloss']
     }
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3)

In [21]:
grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3).fit(X_train, y_train)

exception calling callback for <Future at 0x7fc91425c710 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 340, in __call__
    self.parallel.dispatch_next()
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 769, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 835, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 754, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py", line 551, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "/usr/local/lib/

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

In [None]:
grid.best_estimator_

In [None]:
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

# Load up large tfidf

In [15]:
TFIDF_DUMP = "tfidf_lg.p"
tfidf_df = pickle.load(open(os.path.join(DATA_DIRECTORY, TFIDF_DUMP), "rb"))

In [16]:
tfidf_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,masla,kennedi,vyapam,bred,obc,bechari,marina,gana,dhani,dadaji
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0.426186,0.486502,0.480812,0.411279,0.329805,0.269405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.250212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 10375 entries, class_x to dadaji
dtypes: float64(10372), int64(2), object(1)
memory usage: 3.4+ GB


In [21]:
tfidf_df.describe()

Unnamed: 0,class_x,id_x,woman,complain,clean,hous,man,trash,boi,dat,...,aurato,minist,ghotala,banaya,bhool,shahe,mana,dhyan,jeeton,aulaad
count,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,...,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0
mean,1.259844,61213.342917,0.001604,0.000735,0.00081,0.002046,0.005984,0.015889,0.003718,0.002905,...,8e-05,8.5e-05,7.9e-05,0.000113,0.000105,6.9e-05,7.4e-05,7.8e-05,6.5e-05,9.9e-05
std,0.708834,55165.070266,0.02547,0.018437,0.019804,0.028997,0.046652,0.083508,0.039312,0.034992,...,0.005429,0.005411,0.006761,0.005463,0.006472,0.004745,0.005178,0.005388,0.006162,0.006178
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,11442.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,22782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,119062.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,143870.0,0.89,0.91,0.95,0.93,1.0,1.0,0.92,0.93,...,0.51,0.51,0.79,0.38,0.61,0.46,0.51,0.49,0.6,0.47


## Split the dataset

In [18]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df.iloc[:, 3:], tfidf_df.class_x, test_size=0.2, 
                                                    stratify=tfidf_df.class_x, random_state=42)

In [19]:
len(X_train), len(y_train)

(35698, 35698)

In [20]:
len(X_test), len(y_test)

(8925, 8925)

## Start with same logisticregression

In [15]:
lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', class_weight='balanced', 
                            solver='saga', max_iter=1000).fit(X_train, y_train)

In [16]:
y_pred = lr_clf.predict(X_test)

In [17]:
lr_clf.score(X_test, y_test)

0.8646498599439776

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.74      0.67      1384
           1       0.95      0.87      0.91      3838
           2       0.89      0.90      0.90      3703

    accuracy                           0.86      8925
   macro avg       0.82      0.84      0.83      8925
weighted avg       0.87      0.86      0.87      8925



In [16]:
clf = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000)

In [17]:
params = {'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']}

In [19]:
gs_clf = GridSearchCV(clf, params, n_jobs=1).fit(X_train, y_train)

In [20]:
gs_clf.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
gs_clf.score(X_test, y_test)

0.8711484593837535

In [22]:
y_pred = lr_clf.predict(X_test)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.51      0.61      1384
           1       0.93      0.91      0.92      3838
           2       0.85      0.96      0.90      3703

    accuracy                           0.87      8925
   macro avg       0.84      0.80      0.81      8925
weighted avg       0.87      0.87      0.86      8925



## Random Forests

In [21]:
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500, max_depth=10,criterion='entropy',
                                class_weight="balanced", max_features="sqrt").fit(X_train, y_train)

In [22]:
rf_clf.score(X_test, y_test)

0.8364145658263306

In [23]:
y_pred = rf_clf.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.68      0.62      1384
           1       0.92      0.89      0.90      3838
           2       0.87      0.84      0.85      3703

    accuracy                           0.84      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.84      0.84      8925



In [25]:
params = {"max_depth": [2, 4, 6, 8], 
          "criterion": ["gini", "entropy"],
          "max_features": ["sqrt", "log2"],
          "n_estimators": [500, 1000], 
          "class_weight": ["balanced", "balanced_subsample", None]}

In [26]:
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_tuned = GridSearchCV(rf, params).fit(X_train, y_train)


In [27]:
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=8, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [28]:
rf_tuned.score(X_test, y_test)

0.8339495798319327

In [29]:
y_pred = rf_tuned.predict(X_test)

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.67      0.62      1384
           1       0.91      0.89      0.90      3838
           2       0.87      0.84      0.85      3703

    accuracy                           0.83      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.83      0.84      8925



### XGBoost

In [30]:
counts = y_train.value_counts()

In [39]:
weights = y_train.apply(lambda x: 1 / (counts[x] / counts[0]))

In [40]:
dtrain = xgb.DMatrix(data=X_train, label=y_train, weight=weights)
dtest = xgb.DMatrix(data=X_test)

In [41]:
params = {
    'max_depth': 6,
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 3
}

In [42]:
xgb_clf = xgb.train(params, dtrain)

In [43]:
xgb_pred = xgb_clf.predict(dtest)

In [44]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.64      0.61      0.62      1384
           1       0.95      0.85      0.90      3838
           2       0.82      0.92      0.87      3703

    accuracy                           0.84      8925
   macro avg       0.80      0.79      0.80      8925
weighted avg       0.85      0.84      0.84      8925

