# Importing

In [1]:
# Basic Packages
import pandas as pd
import numpy as np
import os

# NLP Packages
import nltk 
from nltk.corpus import stopwords

# WordCloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Sklearn Packages
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import resample
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Pandas Settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Solve warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Import pickle
import pickle

## Importing DataFrame

In [2]:
df = pd.read_csv('../csv/Hotel_Review_Spell_Checked.csv', index_col=0)

In [3]:
df.head(2)

Unnamed: 0,Unnamed: 0.1,Hotel_Name,Negative_Review,Positive_Review,Reviewer_Score,Reviews_Clean,Score,Spell_Checked
0,185010,St James Court A Taj Hotel London,No Negative,the location was perfect,9.6,no negative the location was perfect,1,no negative the location was perfect
1,424531,H10 Metropolitan 4 Sup,Nothing,Everything was top notch staff were impeccable,10.0,nothing everything was top notch staff were ...,1,nothing everything was top notch staff were ...


## Importing Lemmatized X and Y Variables

In [4]:
X_lem = pickle.load(open('../pickle/X_lem.pkl', 'rb'))
y_lem = pd.read_pickle('../pickle/y_lem.pkl')

# Preprocessing

In [5]:
stop_words = stopwords.words('english')

In [6]:
df.dropna(inplace=True)

## Train Test Split

In [7]:
X = df.Spell_Checked
y = df.Score

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

### Train Test Split with Lemmatization

In [9]:
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, test_size=0.25, random_state=15)

## Count Vectorizer

In [10]:
# Instantiate CountVectorizer
cv = CountVectorizer(stop_words=stop_words)

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

## TF-IDF

In [11]:
tfidf = TfidfVectorizer(stop_words)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### TF-DF With Lemmatization

In [12]:
# Save Train and Test data
X_train_lem = tfidf.fit_transform(X_train_lem)
X_test_lem = tfidf.transform(X_test_lem)

## Evaluation Metric

In [13]:
# Evaluation function

def evaluation(y_true, y_pred):
    
    print('Evaluation Metrics:')
    print('Accuracy: ' + str(metrics.accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(metrics.precision_score(y_true, y_pred)))
    print('Recall: ' + str(metrics.recall_score(y_true, y_pred)))
    print('F1 Score: ' + str(metrics.f1_score(y_true, y_pred)))

## Modeling With Count Vectorizer

### Logistic Regression

In [41]:
# Baseline Regression Model
lg_baseline = LogisticRegression()
lg_baseline.fit(X_train_cv, y_train) 
y_lg_base_cv = lg_baseline.predict(X_test_cv)

In [104]:
# Creating dictionary with all metrics
lr_cv_accuracy = accuracy_score(y_test, y_lg_base_cv)
lr_cv_precision = precision_score(y_test, y_lg_base_cv)
lr_cv_recall = recall_score(y_test, y_lg_base_cv)
lr_cv_f1 = f1_score(y_test, y_lg_base_cv)

metric_dict = {}
metric_dict['Baseline Logisitic Regression CV'] = {'Accuracy': lr_cv_accuracy,
                                                'Precision': lr_cv_precision,
                                                'Recall': lr_cv_recall,
                                                'F1 Score': lr_cv_f1 }

In [43]:
# Logistic Regression baseline evaluation
evaluation(y_test, y_lg_base_cv)

Evaluation Metrics:
Accuracy: 0.798149861239593
Precision: 0.8104575163398693
Recall: 0.7968582649053909
F1 Score: 0.8036003600360035


### Random Forest

In [44]:
rf_baseline = RandomForestClassifier(n_jobs=-1)
rf_baseline.fit(X_train_cv, y_train)
y_rf_cv = rf_baseline.predict(X_test_cv)

In [105]:
# Creating dictionary with all metrics
rf_cv_accuracy = accuracy_score(y_test, y_rf_cv)
rf_cv_precision = precision_score(y_test, y_rf_cv)
rf_cv_recall = recall_score(y_test, y_rf_cv)
rf_cv_f1 = f1_score(y_test, y_rf_cv)

metric_dict['Baseline Random Forest CV'] = {'Accuracy': rf_cv_accuracy,
                                                'Precision': rf_cv_precision,
                                                'Recall': rf_cv_recall,
                                                'F1 Score': rf_cv_f1 }

In [50]:
# Random Forest baseline evaluation
evaluation(y_test, y_rf_cv)

Evaluation Metrics:
Accuracy: 0.8059204440333025
Precision: 0.8429913860610807
Recall: 0.7686540521242413
F1 Score: 0.8041083099906629


### Naive Bayes

In [51]:
nb_base_cv = MultinomialNB(alpha = .01)
nb_base_cv.fit(X_train_cv, y_train)

y_nb_base_cv = nb_base_cv.predict(X_test_cv)

In [106]:
# Creating dictionary with all metrics
nb_cv_accuracy = accuracy_score(y_test, y_nb_base_cv)
nb_cv_precision = precision_score(y_test, y_nb_base_cv)
nb_cv_recall = recall_score(y_test, y_nb_base_cv)
nb_cv_f1 = f1_score(y_test, y_lg_base_cv)

metric_dict['Baseline Naive Bayes CV'] = {'Accuracy': nb_cv_accuracy,
                                                'Precision': nb_cv_precision,
                                                'Recall': nb_cv_recall,
                                                'F1 Score': nb_cv_f1 }

In [53]:
# Naive Bayes baseline evaluation
evaluation(y_test, y_nb_base_cv)

Evaluation Metrics:
Accuracy: 0.7811285846438483
Precision: 0.7935413642960812
Recall: 0.7807925740806855
F1 Score: 0.7871153500089977


### SVC

In [24]:
svc = SVC(kernel='linear')
svc.fit(X_train_cv, y_train)
y_pred_svc = svc.predict(X_test_cv)

In [54]:
# Creating dictionary with all metrics
svc_cv_accuracy = accuracy_score(y_test, y_pred_svc)
svc_cv_precision = precision_score(y_test, y_pred_svc)
svc_cv_recall = recall_score(y_test, y_pred_svc)
svc_cv_f1 = f1_score(y_test, y_pred_svc)

metric_dict['Baseline SVC CV'] = {'Accuracy': svc_cv_accuracy,
                                  'Precision': svc_cv_precision,
                                  'Recall': svc_cv_recall,
                                  'F1 Score': svc_cv_f1 }

In [55]:
evaluation(y_test, y_pred_svc)

Evaluation Metrics:
Accuracy: 0.7839037927844589
Precision: 0.7942342342342342
Recall: 0.7868618350589075
F1 Score: 0.7905308464849354


In [61]:
pd.DataFrame.from_dict(metric_dict, orient='index')

Unnamed: 0,accuracy,precision,recall,f1_score
Baseline Logisitic Regression CV,0.79815,0.810458,0.796858,0.8036
Baseline Random Forest CV,0.80592,0.842991,0.768654,0.804108
Baseline Naive Bayes CV,0.781129,0.793541,0.780793,0.8036
Baseline SVC CV,0.783904,0.794234,0.786862,0.790531
Baseline Logistic Regression TF-IDF,0.822572,0.854503,0.792574,0.822375
Baseline Random Forest TF-IDF,0.803515,0.848218,0.756159,0.799547


## Modeling with TF-IDF

### Logistic Regression

In [28]:
# Baseline Regression Model
lg_baseline = LogisticRegression()
lg_baseline.fit(X_train_tfidf, y_train) 
y_lg_base_tfidf = lg_baseline.predict(X_test_tfidf)

In [57]:
# Creating dictionary with all metrics
lg_tfidf_accuracy = accuracy_score(y_test, y_lg_base_tfidf)
lg_tfidf_precision = precision_score(y_test, y_lg_base_tfidf)
lg_tfidf_recall = recall_score(y_test, y_lg_base_tfidf)
lg_tfidf_f1 = f1_score(y_test, y_lg_base_tfidf)

metric_dict['Baseline Logistic Regression TF-IDF'] = {'accuracy': lg_tfidf_accuracy,
                                                'precision': lg_tfidf_precision,
                                                'recall': lg_tfidf_recall,
                                                'f1_score': lg_tfidf_f1 }

In [58]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test, y_lg_base_tfidf)

Evaluation Metrics:
Accuracy: 0.8225716928769657
Precision: 0.8545034642032333
Recall: 0.7925740806854695
F1 Score: 0.8223745137988516


### Random Forest 

In [88]:
rf_baseline = RandomForestClassifier()
rf_baseline.fit(X_train_tfidf, y_train)
y_rf_base_tfidf = rf_baseline.predict(X_test_tfidf)

In [90]:
# Creating dictionary with all metrics
rf_tfidf_accuracy = accuracy_score(y_test, y_rf_base_tfidf)
rf_tfidf_precision = precision_score(y_test, y_rf_base_tfidf)
rf_tfidf_recall = recall_score(y_test, y_rf_base_tfidf)
rf_tfidf_f1 = f1_score(y_test, y_rf_base_tfidf)

metric_dict['Baseline Random Forest TF-IDF'] = {'accuracy': rf_tfidf_accuracy,
                                                'precision': rf_tfidf_precision,
                                                'recall': rf_tfidf_recall,
                                                'f1_score': rf_tfidf_f1 }

In [91]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test, y_rf_base_tfidf)

Evaluation Metrics:
Accuracy: 0.8016651248843664
Precision: 0.8437375745526839
Recall: 0.7575865762227776
F1 Score: 0.798344620015049


### SVC - Kernel = 'linear'

In [68]:
svc = SVC(kernel='rbf')
svc.fit(X_train_tfidf, y_train)
y_pred_svc_tfidf = svc.predict(X_test_tfidf)

In [69]:
# Creating dictionary with all metrics
svc_tfidf_accuracy = accuracy_score(y_test, y_pred_svc_tfidf)
svc_tfidf_precision = precision_score(y_test, y_pred_svc_tfidf)
svc_tfidf_recall = recall_score(y_test, y_pred_svc_tfidf)
svc_tfidf_f1 = f1_score(y_test, y_pred_svc_tfidf)

metric_dict['Baseline SVC TF-IDF'] = {'accuracy': svc_tfidf_accuracy,
                                                'precision': svc_tfidf_precision,
                                                'recall': svc_tfidf_recall,
                                                'f1_score': svc_tfidf_f1 }

In [70]:
evaluation(y_test, y_pred_svc_tfidf)

Evaluation Metrics:
Accuracy: 0.8268270120259019
Precision: 0.8672705789680977
Recall: 0.7861478043555873
F1 Score: 0.8247191011235955


In [71]:
pd.DataFrame.from_dict(metric_dict, orient='index')

Unnamed: 0,accuracy,precision,recall,f1_score
Baseline Logisitic Regression CV,0.79815,0.810458,0.796858,0.8036
Baseline Random Forest CV,0.80592,0.842991,0.768654,0.804108
Baseline Naive Bayes CV,0.781129,0.793541,0.780793,0.8036
Baseline SVC CV,0.783904,0.794234,0.786862,0.790531
Baseline Logistic Regression TF-IDF,0.822572,0.854503,0.792574,0.822375
Baseline Random Forest TF-IDF,0.803515,0.848218,0.756159,0.799547
Baseline SVC TF-IDF,0.826827,0.867271,0.786148,0.824719


## TF-IDF With Lemmatization

### Logistic Regression

In [74]:
# Baseline Regression Model
rf_baseline = LogisticRegression(random_state=1)
rf_baseline.fit(X_train_lem, y_train_lem) 
y_lg_base_tfidf_lem = rf_baseline.predict(X_test_lem)

In [94]:
# Creating dictionary with all metrics
lr_lem_accuracy = accuracy_score(y_test_lem, y_lg_base_tfidf_lem)
lr_lem_precision = precision_score(y_test_lem, y_lg_base_tfidf_lem)
lr_lem_recall = recall_score(y_test_lem, y_lg_base_tfidf_lem)
lr_lem_f1 = f1_score(y_test_lem, y_lg_base_tfidf_lem)

metric_dict['Baseline Logistic Regression Lem'] = {'accuracy': lr_lem_accuracy,
                                                'precision': lr_lem_precision,
                                                'recall': lr_lem_recall,
                                                'f1_score': lr_lem_f1 }

In [95]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test_lem, y_lg_base_tfidf_lem)

Evaluation Metrics:
Accuracy: 0.8177613320999075
Precision: 0.8314990512333966
Recall: 0.8019765739385066
F1 Score: 0.8164710266443078


### Random Forest

In [96]:
rf_baseline = RandomForestClassifier(random_state=1)
rf_baseline.fit(X_train_lem, y_train_lem) 
y_rf_base_lem = rf_baseline.predict(X_test_lem)

In [97]:
# Creating dictionary with all metrics
rf_lem_accuracy = accuracy_score(y_test_lem, y_rf_base_lem)
rf_lem_precision = precision_score(y_test_lem, y_rf_base_lem)
rf_lem_recall = recall_score(y_test_lem, y_rf_base_lem)
rf_lem_f1 = f1_score(y_test_lem, y_rf_base_lem)

metric_dict['Baseline Random Forest Lem'] = {'accuracy': rf_lem_accuracy,
                                                'precision': rf_lem_precision,
                                                'recall': rf_lem_recall,
                                                'f1_score': rf_lem_f1 }

In [98]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test_lem, y_rf_base_tfidf)

Evaluation Metrics:
Accuracy: 0.5017576318223866
Precision: 0.5077534791252485
Recall: 0.46742313323572476
F1 Score: 0.4867543358109396


### SVC

In [101]:
svc = SVC(kernel='linear')
svc.fit(X_train_lem, y_train_lem)
y_pred_svc_lem = svc.predict(X_test_lem)

In [102]:
# Creating dictionary with all metrics
svc_lem_accuracy = accuracy_score(y_test_lem, y_pred_svc_lem)
svc_lem_precision = precision_score(y_test_lem, y_pred_svc_lem)
svc_lem_recall = recall_score(y_test_lem, y_pred_svc_lem)
svc_lem_f1 = f1_score(y_test_lem, y_pred_svc_lem)

metric_dict['Baseline Random Forest Lem'] = {'accuracy': svc_lem_accuracy,
                                                'precision': svc_lem_precision,
                                                'recall': svc_lem_recall,
                                                'f1_score': svc_lem_f1 }

In [103]:
evaluation(y_test_lem, y_pred_svc_lem)

Evaluation Metrics:
Accuracy: 0.8144310823311748
Precision: 0.8303400840657241
Recall: 0.7953879941434846
F1 Score: 0.8124883155730044


In [99]:
pd.DataFrame.from_dict(metric_dict, orient='index')

Unnamed: 0,accuracy,precision,recall,f1_score
Baseline Logisitic Regression CV,0.79815,0.810458,0.796858,0.8036
Baseline Random Forest CV,0.80592,0.842991,0.768654,0.804108
Baseline Naive Bayes CV,0.781129,0.793541,0.780793,0.8036
Baseline SVC CV,0.783904,0.794234,0.786862,0.790531
Baseline Logistic Regression TF-IDF,0.822572,0.854503,0.792574,0.822375
Baseline Random Forest TF-IDF,0.801665,0.843738,0.757587,0.798345
Baseline SVC TF-IDF,0.826827,0.867271,0.786148,0.824719
Baseline Random Forest Lem,0.801295,0.821567,0.775256,0.79774
Baseline Logistic Regression Lem,0.817761,0.831499,0.801977,0.816471


## SMOTE

### Train Test Split

In [99]:
smote = SMOTE()
X_train_smote_cv, y_train_smote_cv = smote.fit_sample(X_train_cv, y_train) 

smote = SMOTE()
X_train_smote_tfidf, y_train_smote_tfidf = smote.fit_sample(X_train_tfidf, y_train) 

### Logistic Regression - Count Vectorizer

In [100]:
lg_base_smote_cv = LogisticRegression(random_state=1)
lg_base_smote_cv.fit(X_train_smote_cv, y_train_smote_cv)

y_rf_base_smote_cv_smote = lg_base_smote_cv.predict(X_test_cv)

In [101]:
evaluation(y_test, y_rf_base_smote_cv_smote)

Evaluation Metrics:
Accuracy: 0.7994449583718779
F1 Score Macro: 0.7994284740896296
F1 Score Micro: 0.7994449583718779
Recall: 0.7800785433773653


### Random Forest - Count Vectorizer

In [102]:
rf_base_tfidf = RandomForestClassifier(random_state=1)
rf_base_tfidf.fit(X_train_smote_cv, y_train_smote_cv)
y_rf_base_tfidf_smote = rf_base_tfidf.predict(X_test_cv)

In [103]:
evaluation(y_test, y_rf_base_tfidf_smote)

Evaluation Metrics:
Accuracy: 0.8005550416281221
F1 Score Macro: 0.8003552124970327
F1 Score Micro: 0.8005550416281221
Recall: 0.7418779007497323


### Logistic Regression - TF-IDF

In [104]:
# Baseline Regression Model
lg_base_tfidf_smote = LogisticRegression()
lg_base_tfidf_smote.fit(X_train_smote_tfidf, y_train_smote_tfidf) 
y_lg_base_tfidf_smote = lg_base_tfidf_smote.predict(X_test_tfidf)

In [105]:
evaluation(y_test, y_lg_base_tfidf_smote)

Evaluation Metrics:
Accuracy: 0.8227567067530065
F1 Score Macro: 0.822742138540466
F1 Score Micro: 0.8227567067530064
Recall: 0.785076758300607


### Random Forest - TF-IDF

In [106]:
rf_base_tfidf = RandomForestClassifier(random_state=1)
rf_base_tfidf.fit(X_train_smote_tfidf, y_train_smote_tfidf)
y_rf_base_tfidf_smote = rf_base_tfidf.predict(X_test_tfidf)

In [107]:
evaluation(y_test, y_rf_base_tfidf_smote)

Evaluation Metrics:
Accuracy: 0.801295097132285
F1 Score Macro: 0.801167354038615
F1 Score Micro: 0.801295097132285
Recall: 0.7486611924312745


# Pickle Train Test Set

In [118]:
# Pickling Train Set - Features
pickle_out = open("../pickle/X_train_tfidf.pkl",'wb')
pickle.dump(X_train_tfidf, pickle_out)
pickle_out.close()

In [119]:
# Pickling Test Set - Features
pickle_out = open("../pickle/X_test_tfidf.pkl",'wb')
pickle.dump(X_test_tfidf, pickle_out)
pickle_out.close()

In [122]:
# Pickling Train and Test Set = Target
y_train.to_pickle("../pickle/y_train.pkl")
y_test.to_pickle("../pickle/y_test.pkl")

In [None]:
bag_clf = BaggingClassifier(lg_base_tfidf, n_estimators=500,
                             bootstrap=True, n_jobs=-1, verbose=1)
bag_clf.fit(X_train_tfidf, y_train)

In [None]:
y_bag_clf = bag_clf.predict(X_test_tfidf)