# Import Packages

In [1]:
# Basic Packages
import pandas as pd
import numpy as np

# NLP Packages
from nltk.corpus import stopwords

# Sklearn Packages
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

import pickle

In [2]:
# English stop-word list from the NLTK corpus (handed to the TF-IDF vectorizer)
stop_words = stopwords.words("english")

In [3]:
# Load the pickled documents (X) and their labels (y).
# The "_lem" suffix presumably means the text was lemmatized upstream — confirm
# against the notebook that produced these files.
# NOTE: unpickling can execute arbitrary code; only load pickle files that come
# from a trusted source.
with open('../pickle/X_lem.pkl', 'rb') as x_lem_file:
    # Context manager guarantees the file handle is closed after loading.
    X_lem = pickle.load(x_lem_file)
y_lem = pd.read_pickle('../pickle/y_lem.pkl')

In [4]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X_lem,
    y_lem,
    test_size=0.20,
    random_state=15,
)

In [5]:
# TF-IDF vectorizer over unigrams and bigrams, with the stop-word list removed
tfidf = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to encode the held-out split
data_train_lem = tfidf.fit_transform(X_train)
data_test_lem = tfidf.transform(X_test)

## SMOTE

In [10]:
# Seed SMOTE so the synthetic samples it generates, and every metric computed
# from the resampled training data, are reproducible on Restart & Run All.
# The seed matches the one used for the train/test split.
smote = SMOTE(random_state=15)

In [11]:
# Resample the TF-IDF training matrix only; the test matrix is left untouched.
# `fit_resample` is the supported API: `fit_sample` was a deprecated alias that
# recent imbalanced-learn releases no longer provide.
X_train_resampled, y_train_resampled = smote.fit_resample(data_train_lem, y_train)

# Models

In [14]:
# Evaluation function

def evaluation(y_true, y_pred):
    """Print a summary of classification metrics for a set of predictions.

    Precision, accuracy and recall are computed with scikit-learn's default
    arguments; F1 is reported with both weighted and macro averaging.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    print('Evaluation Metrics:')

    # (label, scoring function, extra keyword arguments) in display order
    report_rows = (
        ('Precision: ', metrics.precision_score, {}),
        ('Accuracy: ', metrics.accuracy_score, {}),
        ('F1 Score Weighted: ', metrics.f1_score, {'average': "weighted"}),
        ('F1 Score Macro: ', metrics.f1_score, {'average': "macro"}),
        ('Recall: ', metrics.recall_score, {}),
    )
    for label, scorer, options in report_rows:
        print(label + str(scorer(y_true, y_pred, **options)))

## Baseline Models with SMOTE

In [16]:
# Random forest baseline trained on the SMOTE-resampled TF-IDF features
rf_base_tfidf = RandomForestClassifier(random_state=1).fit(
    X_train_resampled,
    y_train_resampled,
)

# Predict on the test matrix, which was never resampled
y_rf_base_tfidf = rf_base_tfidf.predict(data_test_lem)

In [17]:
# Random Forest TF-IDF baseline evaluation (predictions from rf_base_tfidf on the test split)
evaluation(y_test, y_rf_base_tfidf)

Evaluation Metrics:
Precision: 0.8307844104588061
Accuracy: 0.8059666975023126
F1 Score Weighted: 0.8057807885587733
F1 Score Macro: 0.8058245318396061
Recall: 0.7724770642201835
