# Import Packages

In [108]:
# Basic Packages
import pandas as pd
import numpy as np

# NLP Packages
from nltk.corpus import stopwords

# Sklearn Packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
from xgboost import XGBClassifier


import pickle

!pip install xgboost



## Importing Train and Test Set

In [98]:
X_train = pickle.load(open('../pickle/X_train_tfidf.pkl', 'rb'))
X_test = pickle.load(open('../pickle/X_test_tfidf.pkl', 'rb'))
y_train = pd.read_pickle('../pickle/y_train.pkl')
y_test = pd.read_pickle('../pickle/y_test.pkl')

In [25]:
# Evaluation function

def evaluation(y_true, y_pred):
    
# Print Accuracy, Recall, F1 Score, and Precision metrics.
#     e_val_dict = {}
#     e_val_dict['precision'] = metrics.precision_score(y_true, y_pred)
    print('Evaluation Metrics:')
#     print('Precision: ' + str(e_val_dict))
    print('Accuracy: ' + str(metrics.accuracy_score(y_true, y_pred)))
    print('F1 Score Macro: ' + str(metrics.f1_score(y_true, y_pred, average="macro")))
    print('F1 Score Micro: ' + str(metrics.f1_score(y_true, y_pred, average="micro")))
    print('Recall: ' + str(metrics.recall_score(y_true, y_pred)))
#     return e_val_dict

# Ensamble Models

## Voting Classifier

In [40]:
log_clf = LogisticRegression()
rf = RandomForestClassifier()
svm_clf = SVC()

In [13]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf),('rf', rf),('svm',svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svm', SVC())])

In [18]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test, y_pred))

LogisticRegression 0.8225716928769657
RandomForestClassifier 0.8011100832562442
SVC 0.8268270120259019
VotingClassifier 0.8283071230342276


## Bagging

In [29]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, max_samples=1000,n_jobs=-1, verbose=1)
bag_clf.fit(X_train, y_train)

y_pred_bag = bag_clf.predict(X_test)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:   25.9s remaining:  1.3min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:   26.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.5s remaining:    1.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.9s finished


In [30]:
evaluation(y_test, y_pred_bag)

Evaluation Metrics:
Accuracy: 0.7696577243293247
F1 Score Macro: 0.7693355492010978
F1 Score Micro: 0.7696577243293247
Recall: 0.7065333809353802


## GridsSearchCV

### Random Forest

### Testing with 10 Cross-Validation

In [57]:
param_grid = {
    'bootstrap':[True, False],
    'criterion':['gini','entropy'],
    'n_estimators':[50, 100, 200, 300, 400, 1000],
    'max_depth':[None, list(range(1,100,5))]
    
}

In [46]:
grid_search = GridSearchCV(estimator= rf, param_grid = param_grid,
                          cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 31.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 74.0min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 78.7min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [None,
                                       [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                        51, 56, 61, 66, 71, 76, 81, 86, 91,
                                        96]],
                         'n_estimators': [50, 100, 200, 300, 400, 1000]},
             verbose=1)

In [48]:
y_pred_grid = grid_search.predict(X_test)
evaluation(y_test, y_pred_grid)

Evaluation Metrics:
Accuracy: 0.811840888066605
F1 Score Macro: 0.8118176986469728
F1 Score Micro: 0.811840888066605
Recall: 0.7725812209925027


In [59]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'n_estimators': 300}

### Testing with 5 Cross-Validation

In [53]:
param_grid = {
    'bootstrap':[True, False],
    'criterion':['gini','entropy'],
    'n_estimators':[50, 100, 200, 300, 400, 1000],
    'max_depth':[None, list(range(1,100,5))]
    
}

In [54]:
grid_search_2 = GridSearchCV(estimator= rf, param_grid = param_grid,
                          cv=5, n_jobs=-1, verbose=1)
grid_search_2.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 32.6min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [None,
                                       [1, 6, 11, 16, 21, 26, 31, 36, 41, 46,
                                        51, 56, 61, 66, 71, 76, 81, 86, 91,
                                        96]],
                         'n_estimators': [50, 100, 200, 300, 400, 1000]},
             verbose=1)

In [56]:
y_pred_grid = grid_search_2.predict(X_test)
evaluation(y_test, y_pred_grid)

Evaluation Metrics:
Accuracy: 0.8109158186864015
F1 Score Macro: 0.8108917319039198
F1 Score Micro: 0.8109158186864015
Recall: 0.7715101749375223


In [60]:
grid_search_2.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'n_estimators': 1000}

## GridSearch With Logistic Regression

In [83]:
log_clf = LogisticRegression()

param_grid_lr = {'penalty': ['l1','l2'],
                 'C': np.logspace(-4, 4, 20),
                 'warm_start':[True, False]      
}

In [92]:
grid_search_lr = GridSearchCV(estimator= log_clf, param_grid = param_grid_lr,
                          cv=10, n_jobs=-1, verbose=1)
grid_search_lr.fit(X_train, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:   37.7s finished


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2'], 'warm_start': [True, False]},
             verbose=1)

In [93]:
y_pred_grid_lr = grid_search_lr.predict(X_test)
evaluation(y_test, y_pred_grid_lr)

Evaluation Metrics:
Accuracy: 0.8246068455134135
F1 Score Macro: 0.8246041978279695
F1 Score Micro: 0.8246068455134135
Recall: 0.7918600499821492


In [94]:
grid_search_lr.best_params_

{'C': 0.615848211066026, 'penalty': 'l2', 'warm_start': True}

## ADABoost

In [101]:
adaboost = AdaBoostClassifier(random_state=1)
adaboost.fit(X_train, y_train)

AdaBoostClassifier(random_state=1)

In [102]:
y_pred_ada = adaboost.predict(X_test)
evaluation(y_test, y_pred_ada)

Evaluation Metrics:
Accuracy: 0.7816836262719704
F1 Score Macro: 0.7814597375227932
F1 Score Micro: 0.7816836262719704
Recall: 0.723313102463406


## Gradient Boosting

In [103]:
gbt_clt = GradientBoostingClassifier(random_state=1)
gbt_clt.fit(X_train, y_train)
y_pred_gbt = gbt_clt.predict(X_test)

In [104]:
evaluation(y_test, y_pred_gbt)

Evaluation Metrics:
Accuracy: 0.7918593894542091
F1 Score Macro: 0.7918501554579169
F1 Score Micro: 0.7918593894542091
Recall: 0.770439128882542


## XGBoost

In [113]:
X_train

<16214x10922 sparse matrix of type '<class 'numpy.float64'>'
	with 485828 stored elements in Compressed Sparse Row format>

In [111]:
# Instantiate XGBClassifier
clf = XGBClassifier()

# Fit XGBClassifier
clf.fit(X_train, y_train)



AttributeError: dlsym(0x7f89f1594490, XGDMatrixSetDenseInfo): symbol not found

## Gradient Boosting