# Import Packages

In [31]:
# Basic Packages
import pandas as pd
import numpy as np

# NLP Packages
from nltk.corpus import stopwords

# Sklearn Packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning

import xgboost


import pickle

## Importing Train and Test Set

In [3]:
X_train = pickle.load(open('../pickle/X_train_tfidf.pkl', 'rb'))
X_test = pickle.load(open('../pickle/X_test_tfidf.pkl', 'rb'))
y_train = pd.read_pickle('../pickle/y_train.pkl')
y_test = pd.read_pickle('../pickle/y_test.pkl')

In [25]:
# Evaluation function

def evaluation(y_true, y_pred):
    
# Print Accuracy, Recall, F1 Score, and Precision metrics.
#     e_val_dict = {}
#     e_val_dict['precision'] = metrics.precision_score(y_true, y_pred)
    print('Evaluation Metrics:')
#     print('Precision: ' + str(e_val_dict))
    print('Accuracy: ' + str(metrics.accuracy_score(y_true, y_pred)))
    print('F1 Score Macro: ' + str(metrics.f1_score(y_true, y_pred, average="macro")))
    print('F1 Score Micro: ' + str(metrics.f1_score(y_true, y_pred, average="micro")))
    print('Recall: ' + str(metrics.recall_score(y_true, y_pred)))
#     return e_val_dict

# Ensamble Models

## Voting Classifier

In [40]:
log_clf = LogisticRegression()
rf = RandomForestClassifier()
svm_clf = SVC()

In [13]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf),('rf', rf),('svm',svm_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svm', SVC())])

In [18]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test, y_pred))

LogisticRegression 0.8225716928769657
RandomForestClassifier 0.8011100832562442
SVC 0.8268270120259019
VotingClassifier 0.8283071230342276


## Bagging

In [29]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, max_samples=1000,n_jobs=-1, verbose=1)
bag_clf.fit(X_train, y_train)

y_pred_bag = bag_clf.predict(X_test)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:   25.9s remaining:  1.3min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:   26.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.5s remaining:    1.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.9s finished


In [30]:
evaluation(y_test, y_pred_bag)

Evaluation Metrics:
Accuracy: 0.7696577243293247
F1 Score Macro: 0.7693355492010978
F1 Score Micro: 0.7696577243293247
Recall: 0.7065333809353802


## GridsSearchCV

In [44]:
param_grid = {
    'bootstrap':[True, False],
    'criterion':['gini','entropy'],
    'n_estimators':[50, 100, 200, 300, 400, 1000],
    'max_depth':[None, list(range(1,100,5))]
    
}

In [None]:
grid_search = GridSearchCV(estimator= rf, param_grid = param_grid,
                          cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


## XGBoost

In [33]:
# xgb_reg = xgboost.XGBRegressor()
# xgb_reg.fit(X_train, y_train)

# y_pred_xgb = xgb_reg.predict(X_test)

## ADABoost

## Gradient Boosting