In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import os


import matplotlib.pyplot as plt

%matplotlib inline

In [10]:
!pip install gensim==3.8.1

Collecting gensim==3.8.1
  Downloading gensim-3.8.1-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 24.8 MB/s eta 0:00:01
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.2
    Uninstalling gensim-3.8.2:
      Successfully uninstalled gensim-3.8.2
Successfully installed gensim-3.8.1


In [7]:
!pip install xgboost



In [3]:
import xgboost as xgb

# Load up the BOW data

In [4]:
# Relative path of the folder that holds the pickled feature dumps loaded below.
DATA_DIRECTORY = "datasets/"

In [5]:
BOW_DUMP = "bow.p"
# NOTE: unpickling can execute arbitrary code -- only load dumps from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open(os.path.join(DATA_DIRECTORY, BOW_DUMP), "rb") as bow_file:
    bow_df = pickle.load(bow_file)

In [6]:
# Summarise the BOW frame: row index range, column count, dtypes and memory footprint.
bow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 6898 entries, class_x to pandito
dtypes: int64(6897), object(1)
memory usage: 2.3+ GB


In [6]:
# Peek at the first rows: label (class_x), raw tweet, id, then one count column per token.
bow_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Inspect a single row by index label; note the index label and id_x need not coincide.
bow_df.loc[201]

class_x                                                     1
tweet_x     "@Nicholas_ted33: Kobe stay talking trash. But...
id_x                                                      203
woman                                                       0
complain                                                    0
                                  ...                        
_ia__ii                                                     0
vow                                                         0
haalat                                                      0
garib                                                       0
pandito                                                     0
Name: 201, Length: 6898, dtype: object

## Split the dataset

In [7]:
# 80/20 split of the BOW data, stratified on the label.
# Columns 0-2 (class_x, tweet_x, id_x) are metadata, so features start at column 3.
X_train, X_test, y_train, y_test = train_test_split(
    bow_df.iloc[:, 3:],
    bow_df.class_x,
    test_size=0.2,
    random_state=42,
    stratify=bow_df.class_x,
)

In [8]:
# Sanity check: training features and labels must have the same number of rows.
len(X_train), len(y_train)

(35698, 35698)

In [9]:
# Sanity check: test features and labels must have the same number of rows.
len(X_test), len(y_test)

(8925, 8925)

## First, let's try logistic regression

In [15]:
# Multinomial logistic regression on the BOW features, fitted with the SAGA solver.
lr_clf = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

In [16]:
# Predicted class labels for the held-out split (used by the classification report below).
y_pred = lr_clf.predict(X_test)

In [17]:
# Accuracy on the held-out test split.
lr_clf.score(X_test, y_test)

0.8782072829131653

In [18]:
# Accuracy on the training split; compare with the test score to gauge overfitting.
lr_clf.score(X_train, y_train)

0.9292957588660429

In [19]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
# classification_report is already imported in the notebook's first cell, so no re-import here.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.60      0.65      1384
           1       0.93      0.92      0.92      3838
           2       0.88      0.94      0.91      3703

    accuracy                           0.88      8925
   macro avg       0.84      0.82      0.83      8925
weighted avg       0.87      0.88      0.88      8925



## Next, try random forests

In [12]:
# Baseline random forest: many shallow trees, trained on all available cores.
rf_clf = RandomForestClassifier(
    n_estimators=1500,
    max_depth=6,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

In [13]:
# Accuracy of the baseline random forest on the held-out test split.
rf_clf.score(X_test, y_test)

0.7992156862745098

#### Run a few trial runs with grid search

In [10]:
# Random-forest search space: 1 * 4 * 2 * 2 * 3 = 48 candidate settings.
params = dict(
    n_estimators=[500],
    max_depth=[2, 4, 6, 8],
    criterion=["gini", "entropy"],
    max_features=["sqrt", "log2"],
    class_weight=["balanced", "balanced_subsample", None],
)

In [11]:
# Exhaustive grid search over `params` with GridSearchCV's default CV and scoring.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_tuned = GridSearchCV(estimator=rf, param_grid=params).fit(X_train, y_train)


In [12]:
# Show the hyper-parameters of the best forest found by the grid search.
rf_tuned.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='entropy',
                       max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=500, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [13]:
# Test-split score of the grid-searched forest.
rf_tuned.score(X_test, y_test)

0.8349579831932773

In [14]:
# Per-class report for the tuned forest; y_pred is overwritten with its predictions.
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.67      0.62      1384
           1       0.92      0.89      0.90      3838
           2       0.86      0.84      0.85      3703

    accuracy                           0.83      8925
   macro avg       0.79      0.80      0.79      8925
weighted avg       0.84      0.83      0.84      8925



## XGBoost

In [12]:
# Wrap the splits in DMatrix containers for the native xgboost API.
# The test matrix carries no labels because it is only used for prediction.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [13]:
# Booster parameters for xgb.train (this reuses the name `params` from the random-forest grid).
params = {
    'max_depth': 6,
    'objective': 'multi:softmax',  # multiclass softmax objective; predict() then yields class labels
    'num_class': 3,  # the target has three classes (0, 1, 2)
    # NOTE(review): 'n_gpus' looks like a legacy GPU-count option -- confirm the installed
    # xgboost still accepts it; otherwise drop it.
    'n_gpus': 0
}

In [14]:
# Train a booster with the native API. No num_boost_round is passed, so the
# library's default number of boosting rounds is used.
xgb_clf = xgb.train(params, dtrain)

In [16]:
# Predict on the test DMatrix.
xgb_pred = xgb_clf.predict(dtest)

In [17]:
# Display the raw predictions: class labels stored as float32.
xgb_pred

array([2., 2., 1., ..., 2., 1., 1.], dtype=float32)

In [19]:
# Per-class precision/recall/F1 for the untuned booster.
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.80      0.42      0.55      1384
           1       0.93      0.89      0.91      3838
           2       0.80      0.99      0.89      3703

    accuracy                           0.86      8925
   macro avg       0.85      0.76      0.78      8925
weighted avg       0.86      0.86      0.84      8925



### See if it can be tuned to do better

In [20]:
# Search space for the scikit-learn XGBoost wrapper.
parameters = {
    # `learning_rate` is the wrapper's documented name for the booster's "eta";
    # using it keeps the grid valid for estimator.set_params during the search.
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": np.linspace(.01, 1, 10, endpoint=True),
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}
# Three-class classifier whose hyper-parameters are searched in the next cell.
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3)

In [None]:
# Exhaustive search scored by (negative) log loss with 3-fold CV.
# The grid has 6 * 8 * 4 * 10 * 4 = 7680 candidates, i.e. 23040 fits on a single job,
# so expect a very long run.
grid = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring="neg_log_loss",
    cv=3,
    n_jobs=1,
).fit(X_train, y_train)

In [None]:
# Show the best XGBoost configuration found by the search.
grid.best_estimator_

In [None]:
# Evaluate the tuned XGBoost model (the refit best estimator held by `grid`),
# not the random-forest search from the previous section.
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

# Load up the TF-IDF data

In [4]:
TFIDF_DUMP = "tfidf.p"
# NOTE: unpickling can execute arbitrary code -- only load dumps from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open(os.path.join(DATA_DIRECTORY, TFIDF_DUMP), "rb") as tfidf_file:
    tfidf_df = pickle.load(tfidf_file)

In [5]:
# Peek at the first rows: same layout as the BOW frame, but with float weights per token.
tfidf_df.head()

Unnamed: 0,class_x,tweet_x,id_x,woman,complain,clean,hous,man,trash,boi,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0.43,0.49,0.48,0.41,0.33,0.27,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Summarise the TF-IDF frame: row index range, column count, dtypes and memory footprint.
tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44623 entries, 0 to 44622
Columns: 6898 entries, class_x to pandito
dtypes: float64(6895), int64(2), object(1)
memory usage: 2.3+ GB


In [32]:
# Summary statistics of every numeric column (labels, ids and per-token weights).
tfidf_df.describe()

Unnamed: 0,class_x,id_x,woman,complain,clean,hous,man,trash,boi,dat,...,kaal,kundali,aja,jesa,gau,_ia__ii,vow,haalat,garib,pandito
count,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,...,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0,44623.0
mean,1.259844,61213.342917,0.001527,0.000714,0.000781,0.001954,0.005665,0.014549,0.003576,0.002822,...,4e-05,3.4e-05,5.1e-05,3.1e-05,4.3e-05,2.2e-05,4.7e-05,3.2e-05,3.6e-05,3.2e-05
std,0.708834,55165.070266,0.024284,0.017999,0.019126,0.027862,0.044129,0.076081,0.03811,0.034088,...,0.003962,0.005135,0.005089,0.003346,0.003757,0.004545,0.004792,0.00309,0.003487,0.003048
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,11442.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,22782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,119062.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,143870.0,0.89,0.91,0.95,0.93,1.0,1.0,0.92,0.93,...,0.54,0.82,0.66,0.45,0.43,0.96,0.75,0.36,0.43,0.34


## Split the dataset

In [9]:
# 80/20 split of the TF-IDF data, stratified on the label; this overwrites the BOW splits.
# Columns 0-2 (class_x, tweet_x, id_x) are metadata, so features start at column 3.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df.iloc[:, 3:],
    tfidf_df.class_x,
    test_size=0.2,
    random_state=42,
    stratify=tfidf_df.class_x,
)

In [10]:
# Sanity check: training features and labels must have the same number of rows.
len(X_train), len(y_train)

(35698, 35698)

In [11]:
# Sanity check: test features and labels must have the same number of rows.
len(X_test), len(y_test)

(8925, 8925)

## Start with logistic regression again

In [21]:
# Multinomial logistic regression on the TF-IDF features.
# No solver is passed here, so the library default is used.
lr_clf = LogisticRegression(
    multi_class='multinomial',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

In [22]:
# Predicted class labels for the TF-IDF test split.
y_pred = lr_clf.predict(X_test)

In [23]:
# Accuracy on the held-out TF-IDF test split.
lr_clf.score(X_test, y_test)

0.8715966386554622

In [14]:
# Unfitted base estimator for the solver comparison; this reuses the name `clf`.
clf = LogisticRegression(
    multi_class='multinomial',
    max_iter=1000,
    random_state=42,
)

In [15]:
# Only the optimisation algorithm is varied in this search.
params = dict(solver=['newton-cg', 'sag', 'saga', 'lbfgs'])

In [17]:
# Compare the solvers with GridSearchCV's default CV and scoring, using two parallel workers.
gs_clf = GridSearchCV(estimator=clf, param_grid=params, n_jobs=2).fit(X_train, y_train)

In [18]:
# Show the logistic-regression configuration selected by the solver search.
gs_clf.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# Test-split score of the selected solver's model.
gs_clf.score(X_test, y_test)

0.8715966386554622

In [24]:
# Predictions here come from lr_clf (fitted without an explicit solver), not from the
# grid-searched gs_clf.
y_pred = lr_clf.predict(X_test)

In [26]:
# Per-class precision/recall/F1 for the logistic-regression predictions on TF-IDF features.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.51      0.61      1384
           1       0.92      0.91      0.92      3838
           2       0.85      0.96      0.90      3703

    accuracy                           0.87      8925
   macro avg       0.84      0.80      0.81      8925
weighted avg       0.87      0.87      0.86      8925



## Random Forests

In [14]:
# Class-weighted random forest on the TF-IDF features, trained on all available cores.
rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    criterion='entropy',
    max_features="sqrt",
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

In [15]:
# Accuracy of the random forest on the held-out TF-IDF test split.
rf_clf.score(X_test, y_test)

0.8398879551820728

In [16]:
# Predicted class labels from the random forest (used by the report below).
y_pred = rf_clf.predict(X_test)

In [17]:
# Per-class precision/recall/F1 for the random-forest predictions on TF-IDF features.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.66      0.62      1384
           1       0.92      0.89      0.91      3838
           2       0.86      0.85      0.86      3703

    accuracy                           0.84      8925
   macro avg       0.79      0.80      0.80      8925
weighted avg       0.85      0.84      0.84      8925



In [11]:
# Wider random-forest search space: 5 * 2 * 2 * 3 * 3 = 180 candidate settings.
params = dict(
    max_depth=[2, 4, 6, 8, 10],
    criterion=["gini", "entropy"],
    max_features=["sqrt", "log2"],
    n_estimators=[500, 1000, 1500],
    class_weight=["balanced", "balanced_subsample", None],
)

In [None]:
# Exhaustive grid search over `params` on the TF-IDF features (default CV and scoring).
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_tuned = GridSearchCV(estimator=rf, param_grid=params).fit(X_train, y_train)


In [None]:
# Show the hyper-parameters of the best forest found on the TF-IDF features.
rf_tuned.best_estimator_

In [None]:
# Test-split score of the grid-searched forest.
rf_tuned.score(X_test, y_test)

# Word2Vec Experiments

In [11]:
from gensim.models import Word2Vec

In [12]:
# NOTE(review): `tokenized_questions` is not defined anywhere in the visible notebook, so this
# cell cannot run on a fresh kernel -- presumably it should be a list of token lists; confirm.
# NOTE(review): `size` appears to be the gensim 3.x keyword for the vector dimensionality
# (gensim is pinned to 3.8.1 earlier) -- re-check if gensim is upgraded.
model_w2v = Word2Vec(tokenized_questions, size=300)

NameError: name 'gensim' is not defined