In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

import os


import matplotlib.pyplot as plt

%matplotlib inline

In [10]:
!pip install gensim==3.8.1

Collecting gensim==3.8.1
  Downloading gensim-3.8.1-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 24.8 MB/s eta 0:00:01
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.2
    Uninstalling gensim-3.8.2:
      Successfully uninstalled gensim-3.8.2
Successfully installed gensim-3.8.1


In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.0.2-py3-none-manylinux1_x86_64.whl (109.7 MB)
[K     |████████████████████████████████| 109.7 MB 19 kB/s s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.0.2


In [2]:
import xgboost as xgb

In [3]:
DATA_DIRECTORY = 'datasets/'

# Load up the BOW data

In [4]:
BOW_DUMP = "bow.p"
bow_df = pickle.load(open(os.path.join(DATA_DIRECTORY, BOW_DUMP), "rb"))

In [5]:
bow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 6898 entries, class_x to pandito
dtypes: int64(6897), object(1)
memory usage: 2.3+ GB


In [7]:
bow_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
bow_df.loc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
_ia__ii                                                     0
vow                                                         0
haalat                                                      0
garib                                                       0
pandito                                                     0
Name: 201, Length: 6898, dtype: object

## Split the dataset

In [7]:
X_train, X_test, y_train, y_test = train_test_split(bow_df.iloc[:, 3:], bow_df.class_x, test_size=0.2, 
                                                    stratify=bow_df.class_x, random_state=42)

In [8]:
X_train.shape

(35698, 6895)

In [9]:
len(X_train), len(y_train)

(35698, 35698)

In [10]:
len(X_test), len(y_test)

(8925, 8925)

## First lets try logisticregression

In [19]:
lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', class_weight='balanced', solver='saga', max_iter=1000).fit(X_train, y_train)

In [20]:
y_pred = lr_clf.predict(X_test)

In [21]:
lr_clf.score(X_test, y_test)

0.8676750700280113

In [22]:
lr_clf.score(X_train, y_train)

0.9271387752815283

In [23]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.74      0.67      1384
           1       0.95      0.88      0.91      3838
           2       0.90      0.90      0.90      3703

    accuracy                           0.87      8925
   macro avg       0.82      0.84      0.83      8925
weighted avg       0.88      0.87      0.87      8925



### Parameter tuning

In [13]:
clf = LogisticRegression(random_state=42, multi_class='multinomial', class_weight='balanced', solver='saga', max_iter=1000)

In [14]:
params = {'C': [0.1, 0.5, 0.74, 1.0, 5.0]}

In [15]:
gs_clf = GridSearchCV(clf, params, n_jobs=3).fit(X_train, y_train)

In [16]:
gs_clf.best_estimator_

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=42, solver='saga', tol=0.0001,
                   verbose=0, warm_start=False)

In [17]:
y_pred = gs_clf.predict(X_test)

In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.74      0.67      1384
           1       0.95      0.88      0.91      3838
           2       0.90      0.90      0.90      3703

    accuracy                           0.87      8925
   macro avg       0.82      0.84      0.83      8925
weighted avg       0.88      0.87      0.87      8925



## Next try random forests 

In [12]:
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=1500, max_depth=6).fit(X_train, y_train)

In [13]:
rf_clf.score(X_test, y_test)

0.7992156862745098

#### Run some test runs with the grid

In [10]:
params = {"n_estimators": [500],
          "max_depth": [2, 4, 6, 8], 
          "criterion": ["gini", "entropy"],
          "max_features": ["sqrt", "log2"],
          "class_weight": ["balanced", "balanced_subsample", None]}

In [11]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1, )
rf_tuned = GridSearchCV(rf, params).fit(X_train, y_train)


In [12]:
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='entropy',
                       max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [13]:
rf_tuned.score(X_test, y_test)

0.8349579831932773

In [14]:
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.67      0.62      1384
           1       0.92      0.89      0.90      3838
           2       0.86      0.84      0.85      3703

    accuracy                           0.83      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.83      0.84      8925



## XGBoost

In [4]:
counts = y_train.value_counts()

NameError: name 'y_train' is not defined

In [64]:
weights = y_train.apply(lambda x: 1 / (counts[x] / counts[0]))

In [66]:
dtrain = xgb.DMatrix(data=X_train, label=y_train, weight=weights)
dtest = xgb.DMatrix(data=X_test)

In [90]:
params = {
    'max_depth': 6,
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 3,
    'booster': 'dart',
    'eval_metric': 'merror',
    'gamma': 0.12,
    'random_state': 42,
    'num_boost_round': 100
}

In [91]:
xgb_clf = xgb.train(params, dtrain)

Parameters: { num_boost_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [92]:
xgb_pred = xgb_clf.predict(dtest)

In [93]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.55      0.75      0.63      1383
           1       0.90      0.85      0.87      3837
           2       0.87      0.80      0.83      3689

    accuracy                           0.81      8909
   macro avg       0.77      0.80      0.78      8909
weighted avg       0.83      0.81      0.82      8909



In [83]:
xgb_clf

<xgboost.core.Booster at 0x7f768bd8d470>

### See if it can be tuned to better

In [94]:
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20 ] ,
     #"max_depth"        : [ 3, 4, 5, 6, 8],
     #"gamma"            : np.linspace(.01, 1, 10, endpoint=True),
     #"eval_metric"      : ['merror', 'mlogloss']
     #"booster": ["gbtree", "gblinear", "dart"]
     }
# This is our best
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3, max_depth=6, gamma=0.12, booster='dart',
                        eval_metric='merror', random_state=42)

In [95]:
estimator = clf.fit(X_train, y_train)

In [96]:
y_pred = estimator.predict(X_test)

In [101]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.69      0.73      1383
           1       0.93      0.91      0.92      3837
           2       0.88      0.94      0.91      3689

    accuracy                           0.89      8909
   macro avg       0.86      0.85      0.86      8909
weighted avg       0.89      0.89      0.89      8909



### The best we have found lets train on the whole dataset and drop to disk

In [11]:
clf_final = xgb.XGBClassifier(objective='multi:softmax', num_class=3, max_depth=6, gamma=0.12, booster='dart',
                        eval_metric='merror', random_state=42).fit(bow_df.iloc[:, 3:], bow_df.class_x)

In [105]:
joblib.dump(clf_final, os.path.join(DATA_DIRECTORY, "bow_xgb.joblib"))

['datasets/bow_xgb.joblib']

In [98]:
grid = GridSearchCV(clf,
                    parameters, n_jobs=1,
                    scoring="neg_log_loss",
                    cv=3).fit(X_train, y_train)

In [99]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.05,
              eval_metric='merror', gamma=0.12, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_class=3, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [100]:
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.69      0.73      1383
           1       0.93      0.91      0.92      3837
           2       0.88      0.94      0.91      3689

    accuracy                           0.89      8909
   macro avg       0.86      0.85      0.86      8909
weighted avg       0.89      0.89      0.89      8909



# Load up tfidf

In [4]:
TFIDF_DUMP = "tfidf.p"
tfidf_df = pickle.load(open(os.path.join(DATA_DIRECTORY, TFIDF_DUMP), "rb"))

In [5]:
tfidf_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0.43,0.49,0.48,0.41,0.33,0.27,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 6898 entries, class_x to pandito
dtypes: float64(6895), int64(2), object(1)
memory usage: 2.3+ GB


In [8]:
tfidf_df.describe()

Unnamed: 0,class_x,id_x,woman,complain,clean,hous,man,trash,boi,dat,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
count,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,...,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0
mean,1.259844,61213.342917,0.001527,0.000714,0.000781,0.001954,0.005665,0.014549,0.003576,0.002822,...,4e-05,3.4e-05,5.1e-05,3.1e-05,4.3e-05,2.2e-05,4.7e-05,3.2e-05,3.6e-05,3.2e-05
std,0.708834,55165.070266,0.024284,0.017999,0.019126,0.027862,0.044129,0.076081,0.03811,0.034088,...,0.003962,0.005135,0.005089,0.003346,0.003757,0.004545,0.004792,0.00309,0.003487,0.003048
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,11442.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,22782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,119062.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,143870.0,0.89,0.91,0.95,0.93,1.0,1.0,0.92,0.93,...,0.54,0.82,0.66,0.45,0.43,0.96,0.75,0.36,0.43,0.34


## Split the dataset

In [6]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df.iloc[:, 3:], tfidf_df.class_x, test_size=0.2, 
                                                    stratify=tfidf_df.class_x, random_state=42)

In [7]:
len(X_train), len(y_train)

(35698, 35698)

In [8]:
len(X_test), len(y_test)

(8925, 8925)

## Start with same logisticregression

In [10]:
lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', class_weight='balanced', max_iter=1000).fit(X_train, y_train)

In [11]:
y_pred = lr_clf.predict(X_test)

In [12]:
lr_clf.score(X_test, y_test)

0.8638655462184874

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.74      0.67      1384
           1       0.95      0.87      0.91      3838
           2       0.89      0.90      0.90      3703

    accuracy                           0.86      8925
   macro avg       0.82      0.84      0.83      8925
weighted avg       0.87      0.86      0.87      8925



### Parameter tuning

In [17]:
clf = LogisticRegression(random_state=42, multi_class='multinomial', solver='saga', max_iter=500)

In [18]:
params = {'class_weight': [None, 'balanced'],
         'C': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0]
         }

In [19]:
gs_clf = GridSearchCV(clf, params, n_jobs=2).fit(X_train, y_train)

In [20]:
gs_clf.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
gs_clf.score(X_test, y_test)

0.8715966386554622

In [22]:
y_pred = lr_clf.predict(X_test)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.74      0.67      1384
           1       0.95      0.87      0.91      3838
           2       0.89      0.90      0.90      3703

    accuracy                           0.86      8925
   macro avg       0.82      0.84      0.83      8925
weighted avg       0.87      0.86      0.87      8925



## Random Forests

In [14]:
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500, max_depth=10,criterion='entropy',
                                class_weight="balanced", max_features="sqrt").fit(X_train, y_train)

In [15]:
rf_clf.score(X_test, y_test)

0.8398879551820728

In [16]:
y_pred = rf_clf.predict(X_test)

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.66      0.62      1384
           1       0.92      0.89      0.91      3838
           2       0.86      0.85      0.86      3703

    accuracy                           0.84      8925
   macro avg       0.79      0.80      0.80      8925
weighted avg       0.85      0.84      0.84      8925



In [18]:
params = {"max_depth": [2, 4, 6, 8], 
          "criterion": ["gini", "entropy"],
          "max_features": ["sqrt", "log2"],
          "n_estimators": [500], 
          "class_weight": ["balanced", "balanced_subsample", None]}

In [19]:
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_tuned = GridSearchCV(rf, params).fit(X_train, y_train)


In [20]:
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=8, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [21]:
rf_tuned.score(X_test, y_test)

0.8341736694677871

In [24]:
rf_pred = rf_tuned.predict(X_test)

In [25]:
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.58      0.67      0.62      1384
           1       0.92      0.89      0.90      3838
           2       0.86      0.84      0.85      3703

    accuracy                           0.83      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.83      0.84      8925



## XGBoost

In [9]:
counts = y_train.value_counts()

In [9]:
weights = y_train.apply(lambda x: 1 / (counts[x] / counts[0]))

In [10]:
dtrain = xgb.DMatrix(data=X_train, label=y_train, weight=weights)
dtest = xgb.DMatrix(data=X_test)

In [14]:
params = {
    'max_depth': 6,
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 3,
    'booster': 'dart',
    'eval_metric': 'merror',
    'gamma': 0.12,
    'random_state': 42
}

In [15]:
xgb_clf = xgb.train(params, dtrain)

Parameters: { num_boost_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [16]:
xgb_pred = xgb_clf.predict(dtest)

In [17]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.64      0.62      0.63      1384
           1       0.95      0.85      0.90      3838
           2       0.82      0.92      0.87      3703

    accuracy                           0.84      8925
   macro avg       0.80      0.80      0.80      8925
weighted avg       0.85      0.84      0.84      8925



### Tune to see if we can do better


In [10]:
parameters = {
     # "eta"    : [0.05, 0.10, 0.15, 0.20 ] ,
     #"max_depth": [3, 4, 6, 8],
     #"min_child_weight": [1, 3, 5]
     #"gamma":[i/10.0 for i in range(0,5)]
#      'subsample':[i/10.0 for i in range(6,10)],
#      'colsample_bytree':[i/10.0 for i in range(6,10)],
     'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
     #"eval_metric"      : ['merror', 'mlogloss']
     #"booster": ["gbtree", "gblinear", "dart"]
     #"n_estimators": [100, 250, 500]
     }
# This is our best
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42, eta=0.3, 
                        eval_metric='mlogloss', n_estimators=500, nthreads=15, booster="dart",
                        max_depth=8, min_child_weight=1, gamma=0.2, colsample_bytree=0.8, subsample=0.9)

In [11]:
estimator = GridSearchCV(clf, parameters).fit(X_train, y_train)

In [12]:
estimator.best_estimator_

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3,
              eval_metric='mlogloss', gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=0, nthreads=15, num_class=3,
              num_parallel_tree=1, objective='multi:softprob', random_state=42,
              reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=None,
              subsample=0.9, tree_method=None, validate_parameters=False, ...)

In [13]:
y_pred = estimator.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.65      0.70      1384
           1       0.94      0.92      0.93      3838
           2       0.89      0.95      0.92      3703

    accuracy                           0.89      8925
   macro avg       0.86      0.84      0.85      8925
weighted avg       0.89      0.89      0.89      8925



# Word2Vec Experiments

In [26]:
WORD2VECDF_DUMP = "word2vecdf.p"

In [27]:
w2v_df = pickle.load(open(os.path.join(DATA_DIRECTORY, WORD2VECDF_DUMP), "rb"))

In [28]:
w2v_df.head()

Unnamed: 0,class,tweet,id,0,1,2,3,4,5,6,...,190,191,192,193,194,195,196,197,198,199
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,-0.317924,-0.174421,-0.045998,0.131469,0.153139,0.080083,0.584329,...,0.157028,-0.575342,-0.052679,-0.507448,-0.127183,-0.679388,-0.129394,-0.176748,-0.572039,-0.453219
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,-0.078156,-0.208762,-0.378451,-0.032828,-0.173598,-0.16706,0.252888,...,-0.108476,-0.406503,-0.200132,0.043845,0.161172,-0.056986,0.071785,-0.313311,0.081915,-0.199177
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.220351,-0.150742,-0.16655,0.115838,-0.122622,-0.177487,0.275816,...,-0.27164,0.081974,-0.11668,0.230674,0.222533,0.020707,-0.311411,-0.388627,-0.204376,0.092579
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.283788,-0.28506,-0.303382,0.952025,-0.227425,-0.579436,0.594272,...,-0.106376,0.018728,0.241592,0.328473,-0.488023,-0.616264,0.069447,-0.652167,-0.401744,0.058771
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,-0.427031,0.184674,-0.950997,0.251101,0.029121,-0.074548,0.771606,...,0.225774,0.212599,-0.077561,0.066444,0.108355,-0.243325,-0.123698,-0.511223,-0.314331,0.015501


## Split the dataset

In [29]:
X_train, X_test, y_train, y_test = train_test_split(w2v_df.iloc[:, 3:], w2v_df['class'], test_size=0.2, 
                                                    stratify=w2v_df['class'], random_state=42)

In [30]:
len(X_train), len(y_train)

(35632, 35632)

In [31]:
len(X_test), len(y_test)

(8909, 8909)

## LogisticRegression

In [21]:
w2v_lr_clf = LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000).fit(X_train, y_train)

In [22]:
w2v_y_pred = w2v_lr_clf.predict(X_test)

In [23]:
w2v_lr_clf.score(X_test, y_test)

0.8401616343023909

In [24]:
params = {
    'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
    'C': [0.1, 0.5, 1.0, 5.0, 10.0, 50.0],
    'class_weight': [None, 'balanced']
}

In [25]:
w2v_gs_clf = GridSearchCV(w2v_lr_clf, params, n_jobs=-1).fit(X_train, y_train)

In [26]:
w2v_y_pred = w2v_gs_clf.predict(X_test)

In [27]:
w2v_gs_clf.score(X_test, y_test)

0.8402738803457178

In [28]:
print(classification_report(y_test, w2v_y_pred))

              precision    recall  f1-score   support

           0       0.68      0.47      0.55      1383
           1       0.89      0.90      0.89      3837
           2       0.83      0.92      0.88      3689

    accuracy                           0.84      8909
   macro avg       0.80      0.76      0.77      8909
weighted avg       0.83      0.84      0.83      8909



In [29]:
w2v_gs_clf.best_estimator_

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

## Random Forests

In [30]:
w2v_rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=1500, max_depth=6).fit(X_train, y_train)

In [31]:
w2v_rf_clf.score(X_test, y_test)

0.7978448759681221

#### Tune hyperparamters

In [13]:
params = {"n_estimators": [100, 500, 1500],
          "max_depth": [2, 4, 6, 8, 10], 
          "criterion": ["gini", "entropy"],
          "max_features": ["sqrt", "log2"],
          "class_weight": ["balanced", "balanced_subsample", None]}

In [14]:
w2v_rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [15]:
w2v_tuned = GridSearchCV(w2v_rf_clf, params).fit(X_train, y_train)

In [16]:
y_pred = w2v_tuned.predict(X_test)

In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.72      0.67      1383
           1       0.90      0.87      0.88      3837
           2       0.86      0.84      0.85      3689

    accuracy                           0.83      8909
   macro avg       0.79      0.81      0.80      8909
weighted avg       0.84      0.83      0.84      8909



In [18]:
w2v_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=10, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

## XGBoost

In [14]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [20]:
params = {
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 3,
    'n_gpus': 0
}

In [21]:
w2v_xgb_clf = xgb.train(params, dtrain)

In [22]:
w2v_xgb_pred = w2v_xgb_clf.predict(dtest)

In [23]:
print(classification_report(y_test, w2v_xgb_pred))

              precision    recall  f1-score   support

           0       0.77      0.41      0.54      1383
           1       0.87      0.88      0.88      3837
           2       0.81      0.93      0.86      3689

    accuracy                           0.83      8909
   macro avg       0.82      0.74      0.76      8909
weighted avg       0.83      0.83      0.82      8909



### Tune to see what we can do

In [26]:
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10],
     "gamma"            : np.linspace(.01, 1, 10, endpoint=True),
     "eval_metric"      : ['merror', 'mlogloss']
     }
w2v_xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3, n_threads=8)

In [27]:
grid = GridSearchCV(w2v_xgb_clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3).fit(X_train, y_train)

In [28]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.15,
              eval_metric='merror', gamma=0.89, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.150000006, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, n_threads=8, num_class=3,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, ...)

In [29]:
params = {
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 3,
    'n_gpus': 0,
    'eta': 0.15,
    'eval_metric': 'merror',
    'gamma': 0.89,
    'max_depth': 10,
    'random_state': 42
}

In [30]:
w2v_xgb_clf = xgb.train(params, dtrain)

In [31]:
w2v_xgb_pred = w2v_xgb_clf.predict(dtest)

In [32]:
print(classification_report(y_test, w2v_xgb_pred))

              precision    recall  f1-score   support

           0       0.78      0.50      0.61      1383
           1       0.88      0.89      0.88      3837
           2       0.83      0.94      0.88      3689

    accuracy                           0.85      8909
   macro avg       0.83      0.77      0.79      8909
weighted avg       0.84      0.85      0.84      8909

