# Import Packages

In [83]:
# Importing Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

# NLP Packages
import nltk 
from nltk.corpus import stopwords
from textblob import TextBlob 
from textblob import Word
import re
import string

# WordCloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Sklearn Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, precision_score, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import ConvergenceWarning

# ImbLearn Packages
from imblearn.over_sampling import SMOTE

# Pandas Settings
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 100)

# Solve warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Initial EDA and Feature Engineering

In [3]:
# Import csv file
df = pd.read_csv('csv/Hotel_Reviews.csv')

In [4]:
# Checking the shape of the dataframe
df.shape

(515738, 17)

In [5]:
# Checking the name of columns
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng'],
      dtype='object')

In [6]:
# Selecting only the columns that I will use
features = ['Hotel_Name', 'Negative_Review','Positive_Review', 'Reviewer_Score']
df = df[features]

In [7]:
# Putting reviews together
df['Reviews'] = df['Negative_Review'] + df['Positive_Review']

In [8]:
# Reducing the size of the dataframe to 20%
df = df.sample(frac=0.2, random_state=1)

In [9]:
# Checking if it worked
df.shape

(103148, 5)

In [10]:
# Checking null values
df.isna().sum()

Hotel_Name         0
Negative_Review    0
Positive_Review    0
Reviewer_Score     0
Reviews            0
dtype: int64

In [11]:
# Checking how many hotels in this dataset
len(df.Hotel_Name.unique())

1488

In [12]:
# Checking the hotel with the highest number of reviews
df.pivot_table(index=['Hotel_Name'], aggfunc='size').nlargest()

Hotel_Name
Britannia International Hotel Canary Wharf           965
Strand Palace Hotel                                  900
Park Plaza Westminster Bridge London                 846
Copthorne Tara Hotel London Kensington               748
DoubleTree by Hilton Hotel London Tower of London    641
dtype: int64

# Data Cleaning

In [14]:
# This function lowercase all the review words, removes punctuation and numbers
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)

    return text

round1 = lambda x: clean_text_round1(x)

In [15]:
# Applying clean_text_round1 function
df['Reviews_Clean'] = pd.DataFrame(df.Reviews.apply(round1))

In [16]:
# Create function that turns the Reviewer Score into a classification target with 3 values
df['Score'] = df['Reviewer_Score'].apply(lambda x: 0 if x < 5 else 1 if x >= 5 and x < 7 else 2)

In [17]:
# Checking if function worked
df[['Reviewer_Score', 'Score']].head(5)

Unnamed: 0,Reviewer_Score,Score
356054,10.0,2
395957,10.0,2
468352,6.7,1
281462,9.6,2
498978,10.0,2


In [None]:
# Checking if there will be class imbalance
df.Score.value_counts()

In [18]:
# Creating separate dataframes depending on the classification
df_Score_0 = df[df.Score == 0]
df_Score_1 = df[df.Score == 1].sample(frac=0.3)
df_Score_2 = df[df.Score == 2].sample(frac=0.05)

In [19]:
# Concatenating th 
df = pd.concat([df_Score_2, df_Score_1, df_Score_0])
df.shape

(12687, 7)

In [20]:
df.Score.value_counts()

0    4498
2    4281
1    3908
Name: Score, dtype: int64

In [21]:
# Saving csv with sentiment analysis
features = ['Hotel_Name', 'Negative_Review', 'Positive_Review', 'Reviewer_Score', 'Reviews_Clean', 'Score']
df = df[features]
df.to_csv("csv/df_no_class_imbalance.csv")

# Tokenizing - Round 1

In [25]:
stop_words = stopwords.words('english')

In [28]:
# Instantiate CountVectorizer
cv = CountVectorizer(stop_words=stop_words)

# Fit and transform dataframe without data cleaning
df_cv = cv.fit_transform(df['Reviews_Clean'])
df_cv = pd.DataFrame(df_cv.toarray(), columns = cv.get_feature_names())
df_cv.index = df.index
df_cv.shape

(12687, 14282)

# Spell Checker

In [65]:
# Checking if function works
df['Spell_Checked'] = df['Reviews_Clean'].apply(lambda x: ''.join(TextBlob(x).correct()))

In [68]:
# Fit and transform dataframe without data cleaning
df_cv = cv.fit_transform(df['Spell_Checked'])
df_cv = pd.DataFrame(df_cv.toarray(), columns = cv.get_feature_names())
df_cv.index = df.index
df_cv.shape

(12687, 10127)

In [70]:
df.to_csv('csv/Hotel_Review_Spell_Checked.csv')

# Lemmatization

In [84]:
df = pd.read_csv('csv/Hotel_Review_Spell_Checked.csv')