In [None]:
import ast

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Fetching data

In [None]:
# Load the raw hotel review dataset into a DataFrame
data = pd.read_csv(filepath_or_buffer='../data/Hotel_Reviews.csv')

## Data preprocessing

In [None]:
# This is the basic data cleaning process. This should be added in all the models.
# This cell is a bit slow to run, so it is intended to be run only once.
# NOTE: it removes columns from `data`, so re-running it without first reloading
# the CSV raises a KeyError on the already-dropped columns.

# Columns that are not needed in the model, with the reason each one is dropped
unused_columns = [
    'Hotel_Address',                 # lack of relevance
    'Review_Date',                   # lack of complete data
    'Additional_Number_of_Scoring',  # not knowing what the numbers mean
    'lat',                           # lack of relevance
    'lng',                           # lack of relevance
    'Total_Number_of_Reviews',       # the number seems to be incorrect
]
data = data.drop(columns=unused_columns)

# The next part of the code aims to split the 'Tags' column into multiple columns.
# 'Tags' is stored as the string form of a Python list; literal_eval parses it
# safely (literals only, no arbitrary code execution). This code is made by chatgpt
data['Tags'] = data['Tags'].apply(ast.literal_eval)

# The following lines of code are made by github copilot
# One column per list position; rows with fewer tags are padded with NaN
tags_expanded = data['Tags'].apply(pd.Series)
tags_expanded.columns = [f'Tag_{i}' for i in range(tags_expanded.shape[1])]  # Rename for readability

# Attach the expanded tags and drop the original list column they replace
data = pd.concat([data, tags_expanded], axis=1).drop(columns='Tags')

# The next part of the code turns the 'days_since_review' and 'Tag_3' columns into numbers.
# Raw strings are used so that '\d' is a regex digit class rather than an invalid
# Python string escape. This code is made using chatgpt
data['days_since_review'] = data['days_since_review'].str.extract(r'(\d+)', expand=False).astype(int)

# 'Tag_3' is kept as float: presumably some rows have no fourth tag or no number in it,
# which yields NaN, and NaN cannot be stored in a plain int column.
data['Tag_3'] = data['Tag_3'].str.extract(r'(\d+)', expand=False).astype(float)

# The next part of the code keeps the hotel names aside and removes them from the dataframe data
hotel_names = data['Hotel_Name']  # Series with the hotel name of every review
data = data.drop(columns='Hotel_Name')  # Not needed in the model


## Model 4: TF-IDF
The fourth and final model is based on feature extraction using the TF-IDF metric and modelling with logistic regression.

### TF-IDF
*Term frequency-inverse document frequency* is a measure of how unique a term is to a document in a corpus. This is calculated by finding the product of two measures, namely term frequency and inverse document frequency.

Term frequency is the number of times a term $t$ appears in a document $d$ divided by the total number of terms (including repeated terms). This can be denoted by the following function:

$$tf(t,d)=\frac{f_{t,d}}{\sum_{t'\in d} f_{t',d}}$$

As an example we look at the following 'document': *"To be or not to be, that is the question."* We call that document $d$. Using the function above we find the following frequencies:

$$tf(\mathrm{"be"}, d)=\frac{2}{10}=0.2$$
$$tf(\mathrm{"is"}, d)=\frac{1}{10}=0.1$$
$$tf(\mathrm{"question"}, d)=\frac{1}{10}=0.1$$

From these results one could conclude that the words "is" and "question" bear the same importance in the document. Most would however say that "question" is a more defining word as it is more unique. This is where inverse document frequency comes in.

Inverse document frequency takes into account the entire corpus $D$. It counts the number of documents in the corpus, divides it by the number of documents which contain the term $t$, and then takes the logarithm. This is denoted by the following function:

$$idf(t,D)=\log{\frac{|D|}{|\{d:d\in D \,\wedge \,t\in d\}|}}$$

This leads to less common terms getting a higher $idf$. This lets us calculate the TF-IDF metric of a term $t$ in a document $d$ of a corpus $D$:

$$tfidf(t,d,D)=tf(t,d)\cdot idf(t,D)$$

To set the dataset up for TF-IDF we do the following:

In [None]:
tfidf_data = data.copy()  # Take a copy of the data so the cleaned frame is left untouched

reviews = tfidf_data["Negative_Review"]  # Extract the negative reviews.

# Set up the vectorizer class.
# NOTE: stop_words filters out common words in English, e.g. 'is', 'the', etc.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Vectorize all reviews and assign as input variable.
# X is a sparse matrix with one row per review and one column per vocabulary term.
# NOTE(review): the vectorizer is fitted on every review before the train/test split
# in the next cell, so the IDF weights also see the test reviews.
X = tfidf_vectorizer.fit_transform(reviews)

# Generate corresponding output: 0 for a score below 5.0, otherwise 1.
y = tfidf_data["Reviewer_Score"].map(lambda x: 0 if x < 5.0 else 1)

# The vocabulary lives on the fitted vectorizer, not on the sparse matrix it returns
print(tfidf_vectorizer.get_feature_names_out())
print(X.shape)  # (number of reviews, vocabulary size)
print(X[:5])    # Non-zero TF-IDF entries of the first five reviews

The *TfidfVectorizer* creates a set of all words contained in all of the reviews. It then calculates a TF-IDF score for each unique word for each document in the corpus. This should allow us to create a model which can correlate the combination of frequencies of certain words to either a good or bad sentiment. 

As the result can either be 'good' or 'bad', we could assign a probability to each of them which add up to 100%. A model based on logistic regression corresponds to this scenario. We set it up using the built-in class from scikit-learn.

In [None]:
# Create train/test split (80 % train, 20 % test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up Logistic Regression model.
# class_weight='balanced' weights each class inversely proportional to its frequency
# in y_train, which replaces the manually computed class-weight dictionary.
# max_iter is raised above the default of 100 to give the solver room to converge
# on the high-dimensional TF-IDF features.
model = LogisticRegression(class_weight='balanced', max_iter=1000)

# Fit data to model. scikit-learn estimators take only the training data here;
# epochs, batch size and validation data are Keras concepts and do not apply.
model.fit(X_train, y_train)

# Predicted class labels (0 or 1) for the held-out reviews
y_pred = model.predict(X_test)

In [None]:
# This cell is for evaluating the model

# For a classifier, score() returns the mean accuracy on the given data
test_accuracy = model.score(X_test, y_test)
train_accuracy = model.score(X_train, y_train)
print(f'Test accuracy: {test_accuracy:.2f}')
print(f'Train accuracy: {train_accuracy:.2f}')
print(f'Overfitting: {train_accuracy-test_accuracy:.2f}')

# Hard class predictions (0 or 1) for the threshold-based metrics
y_pred = model.predict(X_test)
# Probability of class 1 for each test review; ROC-AUC needs scores, not hard labels
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Results
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}') # How many predicted positives are actually positive
print(f'Recall: {recall:.2f}') # How many actual positive cases were correctly predicted
print(f'F1-Score: {f1:.2f}') # Balance between precision and recall
print(f'ROC-AUC: {roc_auc:.2f}') # Area under ROC curve (recall against false positive rate)
