In [None]:
# Importing Packages
import pandas as pd
import numpy as np
import os

# NLP Packages
import nltk 
from nltk.corpus import stopwords
from textblob import TextBlob 
from textblob import Word
import re
import string

# WordCloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Sklearn Packages
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.exceptions import ConvergenceWarning

# ImbLearn Packages
from imblearn.over_sampling import SMOTE

# Pandas Settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Suppress noisy warning categories so cell output stays readable.
# NOTE(review): these filters hide the warnings rather than fixing their cause;
# with ConvergenceWarning ignored, a solver that stops before converging is
# presumably no longer reported — confirm this is intended.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
# Load the spell-checked hotel reviews; the first CSV column is used as the index.
df = pd.read_csv('../csv/Hotel_Review_Spell_Checked.csv', index_col=0)

In [None]:
# Preview the first rows of the loaded reviews.
df.head()

# Preprocessing

In [None]:
# English stop-word list from NLTK's stopwords corpus.
stop_words = stopwords.words('english')

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Show the rows whose Score equals 0.
df[df['Score'] == 0]

## Count Vectorizer

In [None]:
# Instantiate CountVectorizer, removing the words listed in `stop_words`
cv = CountVectorizer(stop_words=stop_words)

# Fit and transform the spell-checked reviews (no further text cleaning is applied)
df_cv = cv.fit_transform(df['Spell_Checked'])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    cv_feature_names = cv.get_feature_names_out()
else:
    cv_feature_names = cv.get_feature_names()

# Densify into a DataFrame with one column per vocabulary term
df_cv = pd.DataFrame(df_cv.toarray(), columns=cv_feature_names)
# Reuse the review index so the rows carry the same labels as `df`
df_cv.index = df.index

In [None]:
# Preview the first rows of the count-vectorizer matrix.
df_cv.head()

In [None]:
# # Using Sparse in the DataFrame
# df_sparse = df_cv.astype('Sparse')

In [None]:
# Features: the count-vectorizer matrix; target: the Score column of the reviews.
X_cv = df_cv
y = df['Score']

In [None]:
# Running Train Test Split (75% train / 25% test).
# Seeded with the same value as the TF-IDF split so the partition, and every
# metric computed from it, is reproducible across kernel restarts.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, y, test_size=0.25, random_state=1)

## TF-IDF

In [None]:
def fn_tdm_tfidf(docs, xColNames = None, **kwargs):
    '''Build a TF-IDF term-document matrix as a pandas DataFrame.

    Rows are the vocabulary terms learned by the vectorizer and columns are
    the documents, in the order they appear in ``docs``. Transpose the result
    to get one row per document.

    Parameters
    ----------
    docs : iterable
        Raw documents, in any form accepted by ``TfidfVectorizer.fit_transform``.
    xColNames : list-like, optional
        Labels assigned to the document columns. When omitted, the columns
        keep the default labels pandas generates.
    **kwargs
        Forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        Dense matrix of TF-IDF weights with one row per term and one column
        per document.
    '''
    # initialize the vectorizer and learn vocabulary + weights from the documents
    tf = TfidfVectorizer(**kwargs)
    x1 = tf.fit_transform(docs)

    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer `get_feature_names_out` and fall back only on releases that predate it.
    if hasattr(tf, 'get_feature_names_out'):
        terms = tf.get_feature_names_out()
    else:
        terms = tf.get_feature_names()

    # create the DataFrame (named `tdm` so it does not shadow the notebook-level `df`)
    tdm = pd.DataFrame(x1.toarray().transpose(), index=terms)

    if xColNames is not None:
        tdm.columns = xColNames

    return tdm

In [None]:
# fn_tdm_tfidf returns terms x documents, so flip it to one row per review.
df_tfidf = fn_tdm_tfidf(df['Spell_Checked']).T

In [None]:
# Preview the first rows of the TF-IDF matrix.
df_tfidf.head()

In [None]:
# Drop the 'zafirovski' term column. Reassigning with errors='ignore' (instead of
# an in-place drop) keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
# NOTE(review): the reason for removing this term is not stated, and it is not
# removed from the count-vectorizer matrix — confirm the intent.
df_tfidf = df_tfidf.drop(columns='zafirovski', errors='ignore')

In [None]:
# 75/25 train/test split of the TF-IDF features with a fixed seed.
# NOTE(review): df_tfidf carries default row labels while y keeps df's index;
# the split presumably pairs rows by position, not by label — confirm.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(df_tfidf, y, test_size=0.25, random_state=1)

## Evaluation Metric

In [None]:
# Evaluation function

def evaluation(y_true, y_pred):
    """Print accuracy, weighted F1, macro F1 and weighted recall for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    print('Evaluation Metrics:')
    print('Accuracy: ' + str(metrics.accuracy_score(y_true, y_pred)))
    print('F1 Score Weighted: ' + str(metrics.f1_score(y_true, y_pred, average="weighted")))
    print('F1 Score Macro: ' + str(metrics.f1_score(y_true, y_pred, average="macro")))
    # recall_score defaults to average="binary", which raises on a multiclass
    # target and scores only the positive class on a binary one. Weighted
    # averaging is consistent with the weighted F1 printed above.
    print('Recall: ' + str(metrics.recall_score(y_true, y_pred, average="weighted")))

## Modeling With Count Vectorizer

### Logistic Regression

In [25]:
# Baseline logistic regression (default hyper-parameters) on the count-vectorizer features
lg_base = LogisticRegression().fit(X_train_cv, y_train_cv)

# Class predictions for the held-out split
y_lg_base_cv = lg_base.predict(X_test_cv)

# 28 seconds

In [26]:
# Logistic Regression baseline evaluation
evaluation(y_test_cv, y_lg_base_cv)

Evaluation Metrics:
Accuracy: 0.8108957569408067
F1 Score Weighted: 0.8108312400797062
F1 Score Macro: 0.8060086047124368
Recall: 0.8108957569408067


In [27]:
# Weighted recall and weighted F1 on the count-vectorizer test split.
# NOTE(review): despite the `rf_` prefix, both scores are computed from the
# logistic-regression predictions (y_lg_base_cv), not from a random forest.
rf_recall = recall_score(y_test_cv, y_lg_base_cv, average='weighted')
rf_f1_score = f1_score(y_test_cv, y_lg_base_cv, average='weighted')

### Grid Search LogReg

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Estimator to tune: logistic regression with a raised iteration cap.
logreg = LogisticRegression(max_iter=1000)

# Search space for the grid search: balanced class weights, two candidate solvers.
param_dict = dict(
    class_weight=["balanced"],
    solver=["lbfgs", "liblinear"],
)

In [None]:
# 20-fold cross-validated grid search over `param_dict`, scored by weighted F1
# and run on all available cores (n_jobs=-1).
grid_lg = GridSearchCV(
    estimator=logreg,
    param_grid=param_dict,
    scoring='f1_weighted',
    cv=20,
    n_jobs=-1,
    verbose=1,
)
grid_lg.fit(X_train_cv, y_train_cv)

In [None]:
# Predict on the held-out count-vectorizer split with the fitted grid search.
# The search object is not callable, and `X_test` is never defined in this
# notebook; the matching test features are `X_test_cv`.
y_gc_base_cv = grid_lg.predict(X_test_cv)

In [None]:
# Grid-searched Logistic Regression evaluation
evaluation(y_test_cv, y_gc_base_cv)

### Random Forest

In [32]:
# Random forest baseline on the count-vectorizer features.
# Seeded because forest construction is randomized; without a fixed seed the
# reported metrics change on every run.
rf_cv = RandomForestClassifier(random_state=1)
rf_cv.fit(X_train_cv, y_train_cv)
y_rf_cv = rf_cv.predict(X_test_cv)

# 50 seconds

In [33]:
# Random Forest baseline evaluation
evaluation(y_test_cv, y_rf_cv)

Evaluation Metrics:
Accuracy: 0.8078837087480356
F1 Score Weighted: 0.8079038186480665
F1 Score Macro: 0.8030935305606792
Recall: 0.8078837087480356


### Naive Bayes

In [35]:
# Gaussian Naive Bayes baseline on the count-vectorizer features
nb_base_cv = GaussianNB().fit(X_train_cv, y_train_cv)

# Class predictions for the held-out split
y_nb_base_cv = nb_base_cv.predict(X_test_cv)

In [36]:
# Naive Bayes baseline evaluation
evaluation(y_test_cv, y_nb_base_cv)

Evaluation Metrics:
Accuracy: 0.6304347826086957
F1 Score Weighted: 0.5803822053361309
F1 Score Macro: 0.5507471150301966
Recall: 0.6304347826086957


In [None]:
# knn_base_cv = KNeighborsClassifier()
# knn_base_cv.fit(X_train_cv, y_train_cv)

# y_knn_base_cv = knn_base_cv.predict(X_train_cv)

In [None]:
# # KNN baseline evaluation
# evaluation(y_test_cv, y_knn_base_cv)

## Modeling with TF-IDF

### Logistic Regression

In [None]:
# Baseline logistic regression (default hyper-parameters) on the TF-IDF features
lg_base_tfidf = LogisticRegression().fit(X_train_tfidf, y_train_tfidf)

# Class predictions for the held-out TF-IDF split
y_lg_base_tfidf = lg_base_tfidf.predict(X_test_tfidf)
# 28 seconds

In [None]:
# Logistic Regression TF-IDF baseline evaluation
evaluation(y_test_tfidf, y_lg_base_tfidf)

### Random Forest 

In [None]:
# Random forest baseline on the TF-IDF features.
# Seeded because forest construction is randomized; without a fixed seed the
# reported metrics change on every run.
rf_base_tfidf = RandomForestClassifier(random_state=1)
rf_base_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_rf_base_tfidf = rf_base_tfidf.predict(X_test_tfidf)

# 50 seconds

In [None]:
# Random Forest TF-IDF baseline evaluation
evaluation(y_test_tfidf, y_rf_base_tfidf)