In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import os


import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
!pip install gensim==3.8.1



In [3]:
!pip install xgboost



In [4]:
import xgboost as xgb

In [5]:
# Relative directory from which the pickled feature frames are loaded.
DATA_DIRECTORY = 'datasets/'

# Load up the Small BOW data

In [6]:
# Load the pickled small bag-of-words DataFrame.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load dumps from a trusted source.
BOW_DUMP = "bow_sm.p"
with open(os.path.join(DATA_DIRECTORY, BOW_DUMP), "rb") as bow_file:
    bow_df = pickle.load(bow_file)

In [7]:
# Summarise the frame: number of rows and columns, dtypes and memory usage.
bow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 4094 entries, class_x to aulaad
dtypes: int64(4093), object(1)
memory usage: 1.4+ GB


In [8]:
# Preview the first rows: class_x, tweet_x, id_x, followed by the token columns.
bow_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,aurato,minist,ghotala,banaya,bhool,shahe,mana,dhyan,jeeton,aulaad
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Spot-check a single row (index label 201) across all columns.
bow_df.loc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
shahe                                                       0
mana                                                        0
dhyan                                                       0
jeeton                                                      0
aulaad                                                      0
Name: 201, Length: 4094, dtype: object

## Split the dataset

In [7]:
# Features are every column from position 3 onward, i.e. everything after
# class_x, tweet_x and id_x; the target is class_x.
# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(bow_df.iloc[:, 3:], bow_df.class_x, test_size=0.2, 
                                                    stratify=bow_df.class_x, random_state=42)

In [8]:
# Sanity check: training features and labels must have the same number of rows.
len(X_train), len(y_train)

(35698, 35698)

In [9]:
# Sanity check: test features and labels must have the same number of rows.
len(X_test), len(y_test)

(8925, 8925)

## First, let's try logistic regression

In [13]:
# Baseline: multinomial logistic regression fitted with the 'saga' solver,
# seeded for repeatability and allowed up to 1000 iterations.
lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', solver='saga', max_iter=1000).fit(X_train, y_train)

In [14]:
# Predicted class labels for the held-out test set.
y_pred = lr_clf.predict(X_test)

In [15]:
# Score of the logistic regression on the held-out test set.
lr_clf.score(X_test, y_test)

0.8746218487394958

In [16]:
# Score on the training set, to compare against the test score above it.
lr_clf.score(X_train, y_train)

0.9154854613703849

In [17]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
# classification_report is already imported in the notebook's first cell,
# so the redundant mid-notebook import has been dropped.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.59      0.64      1384
           1       0.92      0.92      0.92      3838
           2       0.88      0.94      0.91      3703

    accuracy                           0.87      8925
   macro avg       0.83      0.81      0.82      8925
weighted avg       0.87      0.87      0.87      8925



## Next try random forests 

In [18]:
# Random forest with 1500 trees capped at depth 6; n_jobs=-1 requests all available cores.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=1500, max_depth=6).fit(X_train, y_train)

In [19]:
# Score of the untuned random forest on the held-out test set.
rf_clf.score(X_test, y_test)

0.8019047619047619

#### Run some test runs with the grid

In [20]:
# Search space for the random-forest grid search: the number of trees is held
# fixed while depth, split criterion, feature sub-sampling and class weighting vary.
params = dict(
    n_estimators=[500],
    max_depth=[2, 4, 6, 8],
    criterion=["gini", "entropy"],
    max_features=["sqrt", "log2"],
    class_weight=["balanced", "balanced_subsample", None],
)

In [21]:
# Exhaustive search over `params` with a seeded base forest.
# No cv or scoring argument is passed, so GridSearchCV's defaults apply.
rf = RandomForestClassifier(random_state=42, n_jobs=-1, )
rf_tuned = GridSearchCV(rf, params).fit(X_train, y_train)


In [22]:
# Show the estimator with the best hyper-parameters found by the search.
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=4, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [23]:
# Score of the tuned random forest on the held-out test set.
rf_tuned.score(X_test, y_test)

0.836078431372549

In [24]:
# Per-class precision / recall / F1 for the tuned random forest.
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.65      0.61      1384
           1       0.93      0.89      0.91      3838
           2       0.85      0.85      0.85      3703

    accuracy                           0.84      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.84      0.84      8925



## XGBoost

In [25]:
# Wrap the splits in XGBoost DMatrix objects; only the training matrix is given labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [28]:
# Booster settings for the native XGBoost training call: max_depth of 6 and the
# 'multi:softmax' training objective over 3 classes. This is the objective, not
# an evaluation metric.
params = dict(
    max_depth=6,
    objective='multi:softmax',
    num_class=3,
)

In [27]:
# Train a booster with the settings in `params`; the number of boosting rounds
# is not specified, so the library default is used.
xgb_clf = xgb.train(params, dtrain)

n_gpus: 
	Deprecated. Single process multi-GPU training is no longer supported.
	Please switch to distributed training with one process per GPU.
	This can be done using Dask or Spark.  See documentation for details.


In [29]:
# Predict on the held-out test matrix.
xgb_pred = xgb_clf.predict(dtest)

In [30]:
# Inspect the raw predictions (class ids come back as a float32 array).
xgb_pred

array([2., 2., 1., ..., 2., 1., 1.], dtype=float32)

In [31]:
# Per-class precision / recall / F1 for the untuned XGBoost booster.
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.80      0.42      0.55      1384
           1       0.93      0.89      0.91      3838
           2       0.80      0.99      0.89      3703

    accuracy                           0.86      8925
   macro avg       0.85      0.76      0.78      8925
weighted avg       0.86      0.86      0.84      8925



### See if it can be tuned to do better

In [10]:
# Search space for the scikit-learn style XGBoost classifier.
# NOTE(review): this is 4 * 5 * 10 * 2 = 400 candidates. eval_metric presumably
# does not change the fitted model when no eval_set / early stopping is used --
# consider dropping it from the grid to halve the search (TODO confirm).
parameters = {
    "eta": [0.05, 0.10, 0.15, 0.20],
    "max_depth": [3, 4, 5, 6, 8],
    "gamma": np.linspace(.01, 1, 10, endpoint=True),
    "eval_metric": ['merror', 'mlogloss'],
}
# `n_jobs` is the wrapper's thread-count argument. The previous `n_thread` is not
# a recognised parameter name, so the intended limit of 8 threads was never applied.
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3, n_jobs=8)

In [11]:
# 3-fold grid search over `parameters`, scored by negative log-loss, using 4 parallel workers.
# NOTE: the recorded run of this cell ended in KeyboardInterrupt, so no fitted
# `grid` exists in the saved session.
grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3).fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Show the best XGBoost estimator; requires the grid search to have run to completion.
grid.best_estimator_

In [None]:
# Evaluate the tuned XGBoost model produced by the grid search.
# The previous version called rf_tuned.predict, which re-reported the random
# forest in the XGBoost tuning section.
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

# Load up the small TF-IDF data

In [6]:
# Load the pickled small TF-IDF DataFrame.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load dumps from a trusted source.
TFIDF_DUMP = "tfidf_sm.p"
with open(os.path.join(DATA_DIRECTORY, TFIDF_DUMP), "rb") as tfidf_file:
    tfidf_df = pickle.load(tfidf_file)

In [7]:
# Preview the first rows: class_x, tweet_x, id_x, followed by the token weight columns.
tfidf_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,aurato,minist,ghotala,banaya,bhool,shahe,mana,dhyan,jeeton,aulaad
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0.43,0.49,0.48,0.41,0.33,0.27,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Summarise the frame: number of rows and columns, dtypes and memory usage.
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 4094 entries, class_x to aulaad
dtypes: float64(4091), int64(2), object(1)
memory usage: 1.4+ GB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
tfidf_df.describe()

Unnamed: 0,class_x,id_x,woman,complain,clean,hous,man,trash,boi,dat,...,aurato,minist,ghotala,banaya,bhool,shahe,mana,dhyan,jeeton,aulaad
count,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,...,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0
mean,1.259844,61213.342917,0.001604,0.000735,0.00081,0.002046,0.005984,0.015889,0.003718,0.002905,...,8e-05,8.5e-05,7.9e-05,0.000113,0.000105,6.9e-05,7.4e-05,7.8e-05,6.5e-05,9.9e-05
std,0.708834,55165.070266,0.02547,0.018437,0.019804,0.028997,0.046652,0.083508,0.039312,0.034992,...,0.005429,0.005411,0.006761,0.005463,0.006472,0.004745,0.005178,0.005388,0.006162,0.006178
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,11442.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,22782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,119062.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,143870.0,0.89,0.91,0.95,0.93,1.0,1.0,0.92,0.93,...,0.51,0.51,0.79,0.38,0.61,0.46,0.51,0.49,0.6,0.47


## Split the dataset

In [10]:
# Features are every column from position 3 onward, i.e. everything after
# class_x, tweet_x and id_x; the target is class_x.
# 80/20 split, stratified on the target, with a fixed seed for repeatability.
# Note that this rebinds X_train / X_test / y_train / y_test from the BOW section.
X_train, X_test, y_train, y_test = train_test_split(tfidf_df.iloc[:, 3:], tfidf_df.class_x, test_size=0.2, 
                                                    stratify=tfidf_df.class_x, random_state=42)

In [11]:
# Sanity check: training features and labels must have the same number of rows.
len(X_train), len(y_train)

(35698, 35698)

In [12]:
# Sanity check: test features and labels must have the same number of rows.
len(X_test), len(y_test)

(8925, 8925)

## Start with the same logistic regression

In [13]:
# Multinomial logistic regression on the TF-IDF features.
# NOTE(review): unlike the BOW run, no solver is passed here, so scikit-learn's
# default solver is used instead of 'saga' -- confirm this difference is intended.
lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000).fit(X_train, y_train)

In [14]:
# Predicted class labels for the held-out test set.
y_pred = lr_clf.predict(X_test)

In [15]:
# Score of the logistic regression on the held-out test set.
lr_clf.score(X_test, y_test)

0.8696918767507003

In [22]:
# Unfitted base model whose solver is varied by the grid search.
clf = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000)

In [23]:
# Candidate optimisers to compare for the logistic regression.
params = dict(solver=['newton-cg', 'sag', 'saga', 'lbfgs'])

In [24]:
# Grid search over the solvers using 2 parallel workers; cv and scoring are left at their defaults.
gs_clf = GridSearchCV(clf, params, n_jobs=2).fit(X_train, y_train)

In [25]:
# Show the logistic regression with the best solver found by the search.
gs_clf.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Score of the grid-searched logistic regression on the held-out test set.
gs_clf.score(X_test, y_test)

0.8696918767507003

In [27]:
# Predict with the grid-searched model so the report that follows describes the
# tuned classifier. The previous version reused the untuned lr_clf here.
y_pred = gs_clf.predict(X_test)

In [28]:
# Per-class precision / recall / F1 for the current logistic-regression predictions in y_pred.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.50      0.60      1384
           1       0.92      0.91      0.92      3838
           2       0.85      0.96      0.90      3703

    accuracy                           0.87      8925
   macro avg       0.84      0.79      0.81      8925
weighted avg       0.86      0.87      0.86      8925



## Random Forests

In [29]:
# Hand-picked random forest: 500 trees capped at depth 10, entropy criterion,
# balanced class weights and sqrt feature sub-sampling.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500, max_depth=10,criterion='entropy',
                                class_weight="balanced", max_features="sqrt").fit(X_train, y_train)

In [30]:
# Score of the hand-picked random forest on the held-out test set.
rf_clf.score(X_test, y_test)

0.8383193277310924

In [31]:
# Predicted class labels from the hand-picked random forest.
y_pred = rf_clf.predict(X_test)

In [32]:
# Per-class precision / recall / F1 for the hand-picked random forest.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.67      0.63      1384
           1       0.93      0.88      0.90      3838
           2       0.85      0.86      0.86      3703

    accuracy                           0.84      8925
   macro avg       0.79      0.80      0.80      8925
weighted avg       0.85      0.84      0.84      8925



In [33]:
# Wider random-forest search space for the TF-IDF features: depth, split
# criterion, feature sub-sampling, forest size and class weighting all vary.
params = dict(
    max_depth=[2, 4, 6, 8, 10],
    criterion=["gini", "entropy"],
    max_features=["sqrt", "log2"],
    n_estimators=[500, 1000, 1500],
    class_weight=["balanced", "balanced_subsample", None],
)

In [34]:
# Exhaustive search over `params` (5 * 2 * 2 * 3 * 3 = 180 candidates) with a seeded base forest.
# No cv or scoring argument is passed, so GridSearchCV's defaults apply.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_tuned = GridSearchCV(rf, params).fit(X_train, y_train)


In [35]:
# Show the estimator with the best hyper-parameters found by the search.
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='entropy',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [36]:
# Score of the tuned random forest on the held-out test set.
rf_tuned.score(X_test, y_test)

0.8382072829131653

In [37]:
# Predicted class labels from the tuned random forest.
y_pred = rf_tuned.predict(X_test)

In [38]:
# Per-class precision / recall / F1 for the tuned random forest.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.66      0.62      1384
           1       0.93      0.89      0.91      3838
           2       0.86      0.86      0.86      3703

    accuracy                           0.84      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.84      0.84      8925



### XGBoost

In [39]:
# Wrap the TF-IDF splits in XGBoost DMatrix objects; only the training matrix is given labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)