# Importing

In [75]:
# Basic Packages
import pandas as pd
import numpy as np
import os

# NLP Packages
import nltk 
from nltk.corpus import stopwords

# WordCloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Sklearn Packages
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import resample
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Pandas Settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Solve warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Import pickle
import pickle

## Importing DataFrame

In [76]:
df = pd.read_csv('../csv/Hotel_Review_Spell_Checked.csv', index_col=0)

## Importing Lemmatized X and Y Variables

In [77]:
X_lem = pickle.load(open('../pickle/X_lem.pkl', 'rb'))
y_lem = pd.read_pickle('../pickle/y_lem.pkl')

# Preprocessing

In [78]:
stop_words = stopwords.words('english')

In [79]:
df.dropna(inplace=True)

## Train Test Split

In [80]:
X = df.Spell_Checked
y = df.Score

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

### Train Test Split with Lemmatization

In [82]:
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, test_size=0.25, random_state=15)

## Count Vectorizer

In [83]:
# Instantiate CountVectorizer
cv = CountVectorizer(stop_words=stop_words)

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

## TF-IDF

In [84]:
tfidf = TfidfVectorizer(stop_words)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### TF-DF With Lemmatization

In [85]:
# Save Train and Test data
X_train_lem = tfidf.fit_transform(X_train_lem)
X_test_lem = tfidf.transform(X_test_lem)

## Evaluation Metric

In [123]:
# Evaluation function

def evaluation(y_true, y_pred):
    
# Print Accuracy, Recall, F1 Score, and Precision metrics.
    e_val_dict = {}
    e_val_dict['precision'] = metrics.precision_score(y_true, y_pred)
    print('Evaluation Metrics:')
#     print('Precision: ' + str(e_val_dict))
    print('Accuracy: ' + str(metrics.accuracy_score(y_true, y_pred)))
    print('F1 Score Macro: ' + str(metrics.f1_score(y_true, y_pred, average="macro")))
    print('F1 Score Micro: ' + str(metrics.f1_score(y_true, y_pred, average="micro")))
    print('Recall: ' + str(metrics.recall_score(y_true, y_pred)))
#     return e_val_dict

## Modeling With Count Vectorizer

### Logistic Regression

In [108]:
# Baseline Regression Model
lg_base = LogisticRegression()
lg_base.fit(X_train_cv, y_train) 
y_lg_base_cv = lg_base.predict(X_test_cv)

In [126]:
# Logistic Regression baseline evaluation
evaluation(y_test, y_lg_base_cv)

Evaluation Metrics:
Accuracy: 0.798149861239593
F1 Score Macro: 0.7979942808743291
F1 Score Micro: 0.7981498612395929
Recall: 0.7968582649053909


In [134]:
# # creating dictionary with all metrics
# logreg_precision = precision_score(y_test, y_lg_base_cv)
# logreg_recall = recall_score(y_test, y_lg_base_cv)
# logreg_f1_micro = f1_score(y_test, y_lg_base_cv, average='Micro')
# logreg_f1_micro = f1_score(y_test, y_lg_base_cv, average='Macro')

# metric_dict = {}
# metric_dict['Baseline Logisitic Regression'] = {'precision': logreg_precision, 'recall': logreg_recall, 'f1_score': logreg_f1_score, 'weighted_f1': logreg_f1_weighted}

### Random Forest

In [87]:
rf_cv = RandomForestClassifier(n_jobs=-1)
rf_cv.fit(X_train_cv, y_train)
y_rf_cv = rf_cv.predict(X_test_cv)

In [88]:
# Random Forest baseline evaluation
evaluation(y_test, y_rf_cv)

Evaluation Metrics:
Accuracy: 0.8037002775208141
F1 Score Macro: 0.8036860583127245
F1 Score Micro: 0.8037002775208141
Recall: 0.7672259907176009


### Naive Bayes

In [89]:
nb_base_cv = MultinomialNB(alpha = .01)
nb_base_cv.fit(X_train_cv, y_train)

y_nb_base_cv = nb_base_cv.predict(X_test_cv)

In [90]:
# Naive Bayes baseline evaluation
evaluation(y_test, y_nb_base_cv)

Evaluation Metrics:
Accuracy: 0.7811285846438483
F1 Score Macro: 0.7809553525221078
F1 Score Micro: 0.7811285846438483
Recall: 0.7807925740806855


## Modeling with TF-IDF

### Logistic Regression

In [91]:
# Baseline Regression Model
lg_base_tfidf = LogisticRegression()
lg_base_tfidf.fit(X_train_tfidf, y_train) 
y_lg_base_tfidf = lg_base_tfidf.predict(X_test_tfidf)
# 28 seconds

In [92]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test, y_lg_base_tfidf)

Evaluation Metrics:
Accuracy: 0.8225716928769657
F1 Score Macro: 0.8225714742344841
F1 Score Micro: 0.8225716928769659
Recall: 0.7925740806854695


### Random Forest 

In [93]:
rf_base_tfidf = RandomForestClassifier()
rf_base_tfidf.fit(X_train_tfidf, y_train)
y_rf_base_tfidf = rf_base_tfidf.predict(X_test_tfidf)

In [94]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test, y_rf_base_tfidf)

Evaluation Metrics:
Accuracy: 0.7994449583718779
F1 Score Macro: 0.7993905657529674
F1 Score Micro: 0.7994449583718779
Recall: 0.7554444841128168


## TF-IDF With Lemmatization

### Logistic Regression

In [95]:
# Baseline Regression Model
lg_base_tfidf = LogisticRegression(random_state=1)
lg_base_tfidf.fit(X_train_lem, y_train_lem) 
y_lg_base_tfidf_lem = lg_base_tfidf.predict(X_test_lem)
# 28 seconds

In [96]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test_lem, y_lg_base_tfidf_lem)

Evaluation Metrics:
Accuracy: 0.8177613320999075
F1 Score Macro: 0.8177523239045533
F1 Score Micro: 0.8177613320999075
Recall: 0.8019765739385066


### Random Forest

In [97]:
rf_base_tfidf = RandomForestClassifier(random_state=1)
rf_base_tfidf.fit(X_train_lem, y_train_lem) 
y_rf_base_tfidf = rf_base_tfidf.predict(X_test_lem)

In [98]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test_lem, y_rf_base_tfidf)

Evaluation Metrics:
Accuracy: 0.801295097132285
F1 Score Macro: 0.8012336928608115
F1 Score Micro: 0.801295097132285
Recall: 0.7752562225475842


## SMOTE

### Train Test Split

In [99]:
smote = SMOTE()
X_train_smote_cv, y_train_smote_cv = smote.fit_sample(X_train_cv, y_train) 

smote = SMOTE()
X_train_smote_tfidf, y_train_smote_tfidf = smote.fit_sample(X_train_tfidf, y_train) 

### Logistic Regression - Count Vectorizer

In [100]:
lg_base_smote_cv = LogisticRegression(random_state=1)
lg_base_smote_cv.fit(X_train_smote_cv, y_train_smote_cv)

y_rf_base_smote_cv_smote = lg_base_smote_cv.predict(X_test_cv)

In [101]:
evaluation(y_test, y_rf_base_smote_cv_smote)

Evaluation Metrics:
Accuracy: 0.7994449583718779
F1 Score Macro: 0.7994284740896296
F1 Score Micro: 0.7994449583718779
Recall: 0.7800785433773653


### Random Forest - Count Vectorizer

In [102]:
rf_base_tfidf = RandomForestClassifier(random_state=1)
rf_base_tfidf.fit(X_train_smote_cv, y_train_smote_cv)
y_rf_base_tfidf_smote = rf_base_tfidf.predict(X_test_cv)

In [103]:
evaluation(y_test, y_rf_base_tfidf_smote)

Evaluation Metrics:
Accuracy: 0.8005550416281221
F1 Score Macro: 0.8003552124970327
F1 Score Micro: 0.8005550416281221
Recall: 0.7418779007497323


### Logistic Regression - TF-IDF

In [104]:
# Baseline Regression Model
lg_base_tfidf_smote = LogisticRegression()
lg_base_tfidf_smote.fit(X_train_smote_tfidf, y_train_smote_tfidf) 
y_lg_base_tfidf_smote = lg_base_tfidf_smote.predict(X_test_tfidf)

In [105]:
evaluation(y_test, y_lg_base_tfidf_smote)

Evaluation Metrics:
Accuracy: 0.8227567067530065
F1 Score Macro: 0.822742138540466
F1 Score Micro: 0.8227567067530064
Recall: 0.785076758300607


### Random Forest - TF-IDF

In [106]:
rf_base_tfidf = RandomForestClassifier(random_state=1)
rf_base_tfidf.fit(X_train_smote_tfidf, y_train_smote_tfidf)
y_rf_base_tfidf_smote = rf_base_tfidf.predict(X_test_tfidf)

In [107]:
evaluation(y_test, y_rf_base_tfidf_smote)

Evaluation Metrics:
Accuracy: 0.801295097132285
F1 Score Macro: 0.801167354038615
F1 Score Micro: 0.801295097132285
Recall: 0.7486611924312745


# Pickle Train Test Set

In [118]:
# Pickling Train Set - Features
pickle_out = open("../pickle/X_train_tfidf.pkl",'wb')
pickle.dump(X_train_tfidf, pickle_out)
pickle_out.close()

In [119]:
# Pickling Test Set - Features
pickle_out = open("../pickle/X_test_tfidf.pkl",'wb')
pickle.dump(X_test_tfidf, pickle_out)
pickle_out.close()

In [122]:
# Pickling Train and Test Set = Target
y_train.to_pickle("../pickle/y_train.pkl")
y_test.to_pickle("../pickle/y_test.pkl")

In [None]:
bag_clf = BaggingClassifier(lg_base_tfidf, n_estimators=500,
                             bootstrap=True, n_jobs=-1, verbose=1)
bag_clf.fit(X_train_tfidf, y_train)

In [None]:
y_bag_clf = bag_clf.predict(X_test_tfidf)