In [1]:
# Basic Packages
import pandas as pd
import numpy as np
import os

# NLP Packages
import nltk 
from nltk.corpus import stopwords
from textblob import TextBlob 
from textblob import Word
import string

# WordCloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Sklearn Packages
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning

# Pandas display settings: limit rendered frames to 100 columns and 100 rows
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Silence (not fix) deprecation, future and convergence warnings for the whole notebook.
# NOTE(review): ignoring ConvergenceWarning also hides solvers that stop before converging.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Import pickle
import pickle

In [2]:
# Load the spell-checked hotel reviews, using the first CSV column as the index.
# NOTE(review): an 'Unnamed: 0.1' column still appears in the null-count output further down,
# which suggests the file carries a second saved index column - confirm and drop it if unused.
df = pd.read_csv('../csv/Hotel_Review_Spell_Checked.csv', index_col=0)

# Preprocessing

In [3]:
# English stop-word list from NLTK; handed to the vectorizers through their `stop_words=` argument.
stop_words = stopwords.words('english')

In [4]:
# Count missing values per column (the result is displayed as this cell's output)
df.isna().sum()

Unnamed: 0.1       0
Hotel_Name         0
Negative_Review    0
Positive_Review    0
Reviewer_Score     0
Reviews_Clean      0
Score              0
Spell_Checked      0
dtype: int64

In [5]:
# Drop rows containing any missing value, mutating df in place.
# The null counts displayed above are all zero, so this removes nothing for the current file.
df.dropna(inplace=True)

## Count Vectorizer

In [6]:
# Bag-of-words counts over the spell-checked reviews, with the NLTK English stop words removed
cv = CountVectorizer(stop_words=stop_words)

# Learn the vocabulary and build the sparse document-term count matrix
cv_counts = cv.fit_transform(df['Spell_Checked'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() whenever the installed version provides it.
if hasattr(cv, 'get_feature_names_out'):
    cv_terms = cv.get_feature_names_out()
else:
    cv_terms = cv.get_feature_names()

# Densify into a DataFrame that shares df's index, so rows stay aligned with the labels.
# NOTE: toarray() materialises the full documents x vocabulary matrix in memory.
df_cv = pd.DataFrame(cv_counts.toarray(), columns=cv_terms, index=df.index)

In [7]:
# # Using Sparse in the DataFrame
# df_sparse = df_cv.astype('Sparse')

In [8]:
# Target labels (the Score column) and the count-vectorised feature matrix
y = df['Score']
X_cv = df_cv

In [9]:
# 75/25 train/test split of the count-vectorised features.
# random_state is fixed (same seed as the TF-IDF split) so the split, and every
# score computed from it, is reproducible after Restart & Run All.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_cv,
    y,
    test_size=0.25,
    random_state=1,
)

## TF-IDF

In [10]:
def fn_tdm_tfidf(docs, xColNames = None, **kwargs):
    """Build a dense TF-IDF term-document matrix as a DataFrame.

    Parameters
    ----------
    docs : iterable of str
        Raw documents passed to ``TfidfVectorizer.fit_transform``.
    xColNames : sequence, optional
        Column labels (one per document) to assign to the result. When omitted,
        the columns keep pandas' default labels.
    **kwargs
        Forwarded unchanged to the ``TfidfVectorizer`` constructor.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term and one column per document.
    """
    vectorizer = TfidfVectorizer(**kwargs)
    weights = vectorizer.fit_transform(docs)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use get_feature_names_out() whenever the installed version provides it.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # NOTE: toarray() materialises the whole matrix densely before transposing it
    # to the terms x documents orientation.
    tdm = pd.DataFrame(weights.toarray().transpose(), index=terms)

    if xColNames is not None:
        tdm.columns = xColNames

    return tdm

In [11]:
# TF-IDF features with one row per review and one column per term.
# Passing df.index as the document labels keeps the rows labelled like df (and y),
# matching df_cv, instead of falling back to a fresh 0..n-1 range.
df_tfidf = fn_tdm_tfidf(df['Spell_Checked'], xColNames=df.index).transpose()

In [12]:
# Remove the 'zafirovski' token column from the TF-IDF features.
# NOTE(review): the reason for excluding this specific token is not recorded - confirm.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the already-missing column.
df_tfidf = df_tfidf.drop(columns='zafirovski', errors='ignore')

In [13]:
# 75/25 train/test split of the TF-IDF features with a fixed seed
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    df_tfidf,
    y,
    test_size=0.25,
    random_state=1,
)

## Evaluation Metric

In [14]:
# Evaluation function

def evaluation(y_true, y_pred):
    """Print precision, accuracy, weighted/macro F1 and recall for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The scores are only printed, one per line.

    Notes
    -----
    Precision and recall are requested without an ``average`` argument.
    NOTE(review): that presumably relies on a two-class target - confirm.
    """
    # Each entry pairs the printed label with a deferred computation, so every
    # score is still computed and printed in the listed order.
    report = (
        ('Precision: ', lambda: metrics.precision_score(y_true, y_pred)),
        ('Accuracy: ', lambda: metrics.accuracy_score(y_true, y_pred)),
        ('F1 Score Weighted: ', lambda: metrics.f1_score(y_true, y_pred, average="weighted")),
        ('F1 Score Macro: ', lambda: metrics.f1_score(y_true, y_pred, average="macro")),
        ('Recall: ', lambda: metrics.recall_score(y_true, y_pred)),
    )

    print('Evaluation Metrics:')
    for label, compute in report:
        print(label + str(compute()))

## Modeling With Count Vectorizer

### Logistic Regression

In [15]:
# Baseline: logistic regression with default hyper-parameters on the count features
lg_base = LogisticRegression(n_jobs=-1)
y_lg_base_cv = lg_base.fit(X_train_cv, y_train_cv).predict(X_test_cv)

# Author's timing note: 28 seconds

KeyboardInterrupt: 

In [None]:
# Score the count-vectorizer logistic regression baseline on its held-out split
evaluation(y_true=y_test_cv, y_pred=y_lg_base_cv)

### Random Forest

In [None]:
# Baseline random forest on the count features.
# random_state is fixed so the bootstrap samples and feature draws, and therefore
# the reported scores, are reproducible between runs.
rf_cv = RandomForestClassifier(n_jobs=-1, random_state=1)
rf_cv.fit(X_train_cv, y_train_cv)
y_rf_cv = rf_cv.predict(X_test_cv)

# Author's timing note: 50 seconds

In [None]:
# Score the count-vectorizer random forest baseline on its held-out split
evaluation(y_true=y_test_cv, y_pred=y_rf_cv)

### Naive Bayes

In [None]:
# Gaussian Naive Bayes baseline on the count features
nb_base_cv = GaussianNB().fit(X_train_cv, y_train_cv)
y_nb_base_cv = nb_base_cv.predict(X_test_cv)

In [None]:
# Score the count-vectorizer Naive Bayes baseline on its held-out split
evaluation(y_true=y_test_cv, y_pred=y_nb_base_cv)

## Modeling with TF-IDF

### Logistic Regression

In [None]:
# Baseline: logistic regression with default hyper-parameters on the TF-IDF features
lg_base_tfidf = LogisticRegression(n_jobs=-1)
y_lg_base_tfidf = lg_base_tfidf.fit(X_train_tfidf, y_train_tfidf).predict(X_test_tfidf)
# Author's timing note: 28 seconds

In [None]:
# Score the TF-IDF logistic regression baseline on its held-out split
evaluation(y_true=y_test_tfidf, y_pred=y_lg_base_tfidf)

### Random Forest 

In [None]:
# Baseline random forest on the TF-IDF features.
# random_state is fixed (same seed as the TF-IDF split) so the reported scores
# are reproducible between runs.
rf_base_tfidf = RandomForestClassifier(n_jobs=-1, random_state=1)
rf_base_tfidf.fit(X_train_tfidf, y_train_tfidf)
y_rf_base_tfidf = rf_base_tfidf.predict(X_test_tfidf)

# Author's timing note: 50 seconds

In [None]:
# Random Forest TF-IDF baseline evaluation
evaluation(y_test_tfidf, y_rf_base_tfidf)

## TF-IDF With Lemmatization

### Create train and test set

In [None]:
# Load the features and labels used for the lemmatized experiments.
# NOTE(review): presumably lemmatized review texts written by a separate preprocessing step - confirm.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
with open('../pickle/X_lem.pkl', 'rb') as lem_file:  # the context manager closes the handle
    X_lem = pickle.load(lem_file)
y_lem = pd.read_pickle('../pickle/y_lem.pkl')

In [None]:
# 80/20 train/test split of the lemmatized data with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_lem,
    y_lem,
    test_size=0.20,
    random_state=15,
)

In [None]:
# TF-IDF over unigrams and bigrams, with the NLTK English stop words removed
tfidf = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
)

# Fit the vocabulary on the training texts only, then reuse it to transform the test texts
data_train_lem = tfidf.fit_transform(X_train)
data_test_lem = tfidf.transform(X_test)

### Logistic Regression

In [None]:
# Baseline: logistic regression with default hyper-parameters on the lemmatized TF-IDF features.
# Note that this rebinds the name lg_base_tfidf to a new model.
lg_base_tfidf = LogisticRegression(n_jobs=-1)
y_lg_base_tfidf_lem = lg_base_tfidf.fit(data_train_lem, y_train).predict(data_test_lem)
# Author's timing note: 28 seconds

In [None]:
# Score the lemmatized TF-IDF logistic regression baseline on its held-out split
evaluation(y_true=y_test, y_pred=y_lg_base_tfidf_lem)

### Random Forest

In [None]:
# Baseline random forest on the lemmatized TF-IDF features.
# Uses data_train_lem / data_test_lem, the matrices produced by the TF-IDF vectorizer
# fitted on the lemmatized split; the tfidf_data_train / tfidf_data_test names used
# before are never defined in this notebook and raised NameError on a fresh kernel.
# random_state is fixed (same seed as the lemmatized split) for reproducible scores.
rf_base_tfidf = RandomForestClassifier(n_jobs=-1, random_state=15)
rf_base_tfidf.fit(data_train_lem, y_train)
y_rf_base_tfidf = rf_base_tfidf.predict(data_test_lem)

In [None]:
# Random Forest TF-IDF (lemmatized) baseline evaluation
evaluation(y_test, y_rf_base_tfidf)

### GridSearch LogReg

In [None]:
# Search space for the logistic-regression grid search: balanced class weights,
# tried with both the lbfgs and the liblinear solver.
param_dict = dict(
    class_weight=["balanced"],
    solver=["lbfgs", "liblinear"],
)
logreg = LogisticRegression(max_iter=1000)

In [None]:
# 20-fold cross-validated grid search scored on weighted F1, using all available cores.
# NOTE(review): this is fitted on the count-vectorizer split, not on the lemmatized
# TF-IDF data prepared in this section - confirm that is intended.
grid_lg = GridSearchCV(
    estimator=logreg,
    param_grid=param_dict,
    cv=20,
    scoring='f1_weighted',
    verbose=1,
    n_jobs=-1,
)
grid_lg.fit(X_train_cv, y_train_cv)

In [None]:
# Predict with the refit best estimator from the grid search.
# A GridSearchCV object is not callable, so .predict() is required; the input must be
# X_test_cv because the search was fitted on X_train_cv and is scored against y_test_cv
# (X_test holds raw texts from a different split).
y_gc_base_cv = grid_lg.predict(X_test_cv)

In [None]:
# Grid-searched Logistic Regression (count vectorizer) evaluation
evaluation(y_test_cv, y_gc_base_cv)