In [None]:
# Importing the needed packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, make_scorer, confusion_matrix
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from scipy.stats import uniform, randint

In [None]:
# link to the dataset used for this thesis
# https://www.kaggle.com/datasets/dkapitan/dutch-restaurant-reviews

In [None]:
# Read dataset and transfer it to a dataframe
# The location of the downloaded parquet file is read from the REVIEWS_PARQUET_PATH
# environment variable, so the notebook runs on a fresh kernel without editing this cell.
file_path = os.environ.get("REVIEWS_PARQUET_PATH")
if not file_path:
    raise FileNotFoundError("Set the REVIEWS_PARQUET_PATH environment variable to the path of the dataset parquet file")
df_raw_data = pd.read_parquet(file_path) # Load the data

In [None]:
# Quick look at the raw dataset: the first five rows, followed by the number of rows
print(df_raw_data.head())
print(df_raw_data.shape[0])

In [None]:
# Necessary preprocessing steps for thesis (performed on a copy of the raw dataframe)
df_pp_data = df_raw_data.copy()

# Strip the unreadable currency characters from the avgPrice strings
# (NOTE(review): these look like a mis-decoded euro sign — confirm against the raw data)
df_pp_data["avgPrice"] = (
    df_pp_data["avgPrice"]
    .str.replace("â\u0082¬","")
    .str.replace("\u0080","")
)

# Columns that contain numeric information but are not yet stored as numbers
columns_to_convert = [
    "scoreTotal",
    "avgPrice",
    "reviewerNumReviews",
    "reviewScoreOverall",
    "reviewScoreFood",
    "reviewScoreService",
    "reviewScoreAmbiance",
]

# Cast column by column; values that cannot be parsed become NaN (errors = "coerce")
numeric_columns = df_pp_data[columns_to_convert].apply(pd.to_numeric, errors = "coerce")
df_pp_data[columns_to_convert] = numeric_columns


In [None]:
# Useful information per variable

# Mean, minimum and maximum of every numeric column, to spot impossible or implausible values
for column in df_pp_data.columns:
    if not pd.api.types.is_numeric_dtype(df_pp_data[column]):
        continue  # non-numeric columns are skipped
    avg, min_val, max_val = df_pp_data[column].mean(), df_pp_data[column].min(), df_pp_data[column].max()
    print(f"Variable: {column}, Average: {avg}, Minimum: {min_val}, Maximum: {max_val}")

# Number of distinct restaurants and reviewers in the data
unique_restaurants = df_pp_data["restoName"].nunique()
unique_reviewers = df_pp_data["reviewerId"].nunique()

print("Total number of restaurants:", unique_restaurants)
print("Total number of reviewers:", unique_reviewers)

In [None]:
# Exclude neutral sentiments: drop every review where at least one aspect score is 5 or 6
has_neutral_score = (
    df_pp_data[["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]]
    .isin([5, 6])
    .any(axis = 1)
)
df_pp_data = df_pp_data[~has_neutral_score]
print(len(df_pp_data))

In [None]:
# The average price of restaurants (rows without a price are left out)
average_prices = df_pp_data["avgPrice"].dropna()

# Plot a histogram containing the distribution of the price variable
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(average_prices, bins=30, color='blue', alpha=0.7)
ax.set_xlabel('Average Price', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.tick_params(axis='both', labelsize=12)
plt.show()

In [None]:
# change the personal review scores to sentiments

columns_to_change = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
bins = [1,6,10] # split the reviews into positive and negative sentiments
labels = ["Negative", "Positive"]
for column in columns_to_change:
    # pd.cut builds right-closed intervals, i.e. (1, 6] and (6, 10]. Without include_lowest
    # a score of exactly 1 falls outside every bin and turns into NaN, so the most negative
    # reviews would be silently discarded by the later dropna. include_lowest = True makes
    # the first interval [1, 6].
    df_pp_data[column] = pd.cut(df_pp_data[column], bins = bins, labels = labels, include_lowest = True)


In [None]:
# extract the city name from the address
# Function to extract the second to last element from the address
def extract_second_to_last_element(address):
    """Return the second to last whitespace-separated token of ``address``.

    Parameters
    ----------
    address : str or missing value
        Address text; anything that is not a string (None, NaN) counts as missing.

    Returns
    -------
    str or None
        The second to last token, or None when the address is missing or has
        fewer than two tokens.
    """
    # Missing addresses arrive as None/NaN and have no .split(); treat them as "no city"
    if not isinstance(address, str):
        return None
    address_components = address.split()
    if len(address_components) >= 2:
        return address_components[-2]
    return None

# New feature "City": the second to last token of each address
df_pp_data["City"] = df_pp_data["address"].map(extract_second_to_last_element)

# Number of reviews per extracted city name
unique_values_city = df_pp_data["City"].value_counts()
print(unique_values_city)

In [None]:
# remove rows that contain missing values for the important variables
important_features = [
    "restoId",
    "avgPrice",
    "reviewScoreOverall",
    "reviewScoreFood",
    "reviewScoreService",
    "reviewScoreAmbiance",
    "reviewText",
    "City",
]

# dropna returns a new dataframe, df_pp_data itself is left unchanged
df_nomissing = df_pp_data.dropna(subset = important_features)

# number of rows before and after removing the incomplete rows
print(len(df_pp_data))
print(len(df_nomissing))

In [None]:
# New variable: the length of each review text (result of len() on the text)
df_nomissing["reviewLength"] = df_nomissing["reviewText"].map(len)

# Descriptive statistics of the review length
descriptives_length = df_nomissing["reviewLength"].describe()
print(descriptives_length)

In [None]:
# Distribution of the review length before removing reviews based on their length
fig, ax = plt.subplots()
ax.hist(df_nomissing["reviewLength"], bins = 100, color = "grey", edgecolor = "black")
ax.set_xlabel("Length")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Remove reviews that are too long and too short:
# only reviews of 20 up to and including 2000 characters are kept
has_acceptable_length = df_nomissing["reviewLength"].between(20, 2000)
df_lessreviews = df_nomissing.copy()[has_acceptable_length]

print(len(df_lessreviews))

In [None]:
# Distribution of the review length after removing long and short reviews
fig, ax = plt.subplots()
ax.hist(df_lessreviews["reviewLength"], bins = 100, color = "grey", edgecolor = "black")
ax.set_xlabel("Length")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Number of positive and negative reviews per aspect
counts_food = df_lessreviews["reviewScoreFood"].value_counts()
print(counts_food)

counts_service = df_lessreviews["reviewScoreService"].value_counts()
print(counts_service)

counts_ambiance = df_lessreviews["reviewScoreAmbiance"].value_counts()
print(counts_ambiance)

In [None]:
# visualize the distributions of sentiments per aspect

# One row per aspect, one column per sentiment; combinations that do not occur count as 0
counts_sentiments = pd.DataFrame({
    "Food": counts_food,
    "Service": counts_service,
    "Ambiance": counts_ambiance
}).fillna(0).T

fig, ax = plt.subplots(figsize=(10, 6))

# Two bars per aspect, drawn side by side
bar_width = 0.35
positions = range(len(counts_sentiments))
shifted_positions = [p + bar_width for p in positions]

positive_bars = ax.bar(positions, counts_sentiments["Positive"], bar_width, label = "Positive", color = "lightblue")
negative_bars = ax.bar(shifted_positions, counts_sentiments["Negative"], bar_width, label = "Negative", color = "lightcoral")

# Put the tick of each aspect between its two bars
ax.set_xticks([p + bar_width / 2 for p in positions])
ax.set_xticklabels(counts_sentiments.index, fontsize=12)
ax.set_xlabel('Aspects', fontsize=14)
ax.set_ylabel('Number of instances', fontsize=14)
ax.legend(fontsize=12)
plt.show()

In [None]:
# Randomly remove rows with all positive labels to make the dataset more balanced (is still imbalanced after for some aspects, but less imbalanced),
# also remove rows because the amount of data is computationally expensive

sentiment_columns = ["reviewScoreFood", "reviewScoreService", "reviewScoreAmbiance"]
df_sentiment = df_lessreviews[sentiment_columns]

# True for the rows in which all three aspects are labelled "Positive"
positive_rows = np.all(df_sentiment == "Positive", axis = 1)

# Never ask for more rows than there are: sample() raises a ValueError otherwise,
# for example when this cell is executed a second time on the already reduced dataframe
n_rows_to_remove = min(232830, int(positive_rows.sum()))
sampled_positive_rows = df_lessreviews[positive_rows].sample(n = n_rows_to_remove, random_state = 68)

# Drop the sampled rows by index label
df_lessreviews = df_lessreviews.drop(sampled_positive_rows.index)

print(len(df_lessreviews))



In [None]:
# Number of positive and negative reviews per aspect that are left in the reduced dataframe
counts_food = df_lessreviews["reviewScoreFood"].value_counts()
print(counts_food)

counts_service = df_lessreviews["reviewScoreService"].value_counts()
print(counts_service)

counts_ambiance = df_lessreviews["reviewScoreAmbiance"].value_counts()
print(counts_ambiance)

In [None]:
# visualize the distributions of sentiments per aspect

# One row per aspect, one column per sentiment; combinations that do not occur count as 0
counts_sentiments = pd.DataFrame({
    "Food": counts_food,
    "Service": counts_service,
    "Ambiance": counts_ambiance
}).fillna(0).T

fig, ax = plt.subplots(figsize=(10, 6))

# Two bars per aspect, drawn side by side
bar_width = 0.35
positions = range(len(counts_sentiments))
shifted_positions = [p + bar_width for p in positions]

positive_bars = ax.bar(positions, counts_sentiments["Positive"], bar_width, label = "Positive", color = "lightblue")
negative_bars = ax.bar(shifted_positions, counts_sentiments["Negative"], bar_width, label = "Negative", color = "lightcoral")

# Put the tick of each aspect between its two bars
ax.set_xticks([p + bar_width / 2 for p in positions])
ax.set_xticklabels(counts_sentiments.index, fontsize=12)
ax.set_xlabel('Aspects', fontsize=14)
ax.set_ylabel('Number of instances', fontsize=14)
ax.legend(fontsize=12)
plt.show()

In [None]:
# Encode the sentiments into labels (negative = 0, positive = 1)

class_mapping = {"Negative": 0, "Positive": 1}

# Fit a regular LabelEncoder on the two class names instead of overwriting its
# classes_ attribute and transform method by hand. LabelEncoder orders its classes
# by sorting them, which gives Negative -> 0 and Positive -> 1.
label_encoder = LabelEncoder()
label_encoder.fit(list(class_mapping))

# Guard: the fitted encoder must agree with the intended mapping
assert all(label_encoder.classes_[code] == name for name, code in class_mapping.items())

# Transform the target variables; astype(str) turns the categorical columns into plain
# strings, and a value other than "Negative"/"Positive" makes transform raise an error
df_lessreviews["labelFood"] = label_encoder.transform(df_lessreviews["reviewScoreFood"].astype(str))
df_lessreviews["labelService"] = label_encoder.transform(df_lessreviews["reviewScoreService"].astype(str))
df_lessreviews["labelAmbiance"] = label_encoder.transform(df_lessreviews["reviewScoreAmbiance"].astype(str))


In [None]:
# Normalize the price variable with min-max scaling
# NOTE(review): the scaler is fitted on all rows, before the train/validation/test split
scaler = MinMaxScaler().fit(df_lessreviews[["avgPrice"]])
df_lessreviews["avgPrice"] = scaler.transform(df_lessreviews[["avgPrice"]])

In [None]:
# preprocessing reviews for SVM and Random Forests (on a copy of the reduced dataframe)
df_preprocessing = df_lessreviews.copy()

# The new column reviewTextSVMRF holds the lower-cased review text;
# the original reviewText column is left as it is
df_preprocessing["reviewTextSVMRF"] = df_preprocessing["reviewText"].str.lower()


In [None]:
# remove punctuation: every character that is neither a word character nor whitespace is deleted
df_preprocessing["reviewTextSVMRF"] = (
    df_preprocessing["reviewTextSVMRF"]
    .str.replace(r'[^\w\s]', '', regex = True)
)


In [None]:
# tokenizing

# Download the tokenizer resources used by word_tokenize.
# Older NLTK releases load 'punkt', recent releases look for 'punkt_tab' instead;
# requesting a resource that a given release does not know only prints a message.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split every cleaned review into a list of tokens
df_preprocessing["tokenizedTextSVMRF"] = df_preprocessing["reviewTextSVMRF"].apply(word_tokenize)


In [None]:
# removing stop words

# The stop word lists are a separate NLTK resource; without this download
# stopwords.words() raises a LookupError on a fresh environment
nltk.download('stopwords')

stop_words = set(stopwords.words("dutch"))

# Define a function to remove Dutch stopwords from the reviews
def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in the module-level ``stop_words`` set.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one review.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

# Apply stop word removal to the tokenized reviews
df_preprocessing['tokenizedTextSVMRF'] = df_preprocessing['tokenizedTextSVMRF'].apply(remove_stopwords)


In [None]:
# stemming: replace every token by its stem according to the Dutch Snowball stemmer
stemmer = SnowballStemmer("dutch")
df_preprocessing["stemmedReviewSVMRF"] = df_preprocessing["tokenizedTextSVMRF"].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
                                                                                      

In [None]:
# Apply word embeddings
df_embeddings = df_preprocessing.copy()

# Train a Word2Vec model on the stemmed reviews
word2vec_model = Word2Vec(
    sentences = df_embeddings["stemmedReviewSVMRF"],
    vector_size = 100,
    window = 5,
    min_count = 5,
    workers = 4,
)

# All words known to the trained model
wordlist = list(word2vec_model.wv.index_to_key)

# One embedding vector per word, in the same order as wordlist
embedding_matrix = [word2vec_model.wv[known_word] for known_word in wordlist]

# Dataframe view of the embeddings: one row per word, indexed by the word itself
word2vec_embeddings = pd.DataFrame(embedding_matrix, index=wordlist)

print(word2vec_embeddings) # Print the embeddings


In [None]:
# function to average word embeddings
def average_word_vectors(wordlist, model, vocabulary, num_features):
    """Average the embedding vectors of the words in ``wordlist``.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of one review.
    model : mapping from word to vector
        Anything that supports ``model[word]`` and returns a vector with
        ``num_features`` entries (for example the ``wv`` attribute of a Word2Vec model).
    vocabulary : container of str
        Words for which ``model`` holds a vector; all other words are ignored.
    num_features : int
        Length of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when none of the
        words is in ``vocabulary``.
    """
    feature_vector = np.zeros((num_features,), dtype = "float64")
    n_words = 0.

    for word in wordlist:
        if word in vocabulary:
            n_words += 1.
            # Look the vector up in the model that was passed in, not in a notebook global
            feature_vector += model[word]

    # Only divide when at least one word was found (avoids a division by zero)
    if n_words:
        feature_vector /= n_words

    return feature_vector

# Keyed vectors of the trained Word2Vec model and the set of words they cover
word_vectors = word2vec_model.wv
vocabulary = set(word_vectors.index_to_key)

# Compute the average word embedding (100 features) for each review
df_embeddings["word_vectors"] = [
    average_word_vectors(review_tokens, word_vectors, vocabulary, 100)
    for review_tokens in df_embeddings["stemmedReviewSVMRF"]
]

# Same vectors as a numpy array of per-review arrays
df_embeddings["array"] = df_embeddings["word_vectors"].to_numpy()
    

In [None]:
# Number of distinct restaurant names in the final dataframe
unique_restaurants2 = df_embeddings["restoName"].nunique()
print(unique_restaurants2)

In [None]:
# Encode the City variable with target encoder for SVM and RF

# Initialize the Target Encoder 
encoder_city = ce.TargetEncoder()

# Fit the encoder on the City column and the target variables and make a new column in the dataframe
df_embeddings["CityDummyFood"] = encoder_city.fit_transform(df_embeddings["City"], df_embeddings["labelFood"])
df_embeddings["CityDummyService"] = encoder_city.fit_transform(df_embeddings["City"], df_embeddings["labelService"])
df_embeddings["CityDummyAmbiance"] = encoder_city.fit_transform(df_embeddings["City"], df_embeddings["labelAmbiance"])


In [None]:
# split the data into a train, validation, and test set

# First split the data into train and temp sets
df_train, df_temp = train_test_split(df_embeddings, test_size= 0.2, random_state=68)

# Then split the temp set into validation and test sets
df_validation, df_test = train_test_split(df_temp, test_size= 0.5, random_state=68)

# Print the number of samples in each set
print("Training set samples:", len(df_train))
print("Validation set samples:", len(df_validation))
print("Test set samples:", len(df_test))


# Make train, validation and test sets per aspect

In [None]:
# Define the datasets with x and y variables

def _stack_review_vectors(frame):
    """Stack the per-review vectors in the "array" column of ``frame`` into one 2-D array."""
    return np.vstack(frame["array"].values)

# For the Food aspect
x_train_food, y_train_food = _stack_review_vectors(df_train), df_train["labelFood"]
x_validation_food, y_validation_food = _stack_review_vectors(df_validation), df_validation["labelFood"]
x_test_food, y_test_food = _stack_review_vectors(df_test), df_test["labelFood"]

# For the service aspect
x_train_service, y_train_service = _stack_review_vectors(df_train), df_train["labelService"]
x_validation_service, y_validation_service = _stack_review_vectors(df_validation), df_validation["labelService"]
x_test_service, y_test_service = _stack_review_vectors(df_test), df_test["labelService"]

# For the ambiance aspect
x_train_ambiance, y_train_ambiance = _stack_review_vectors(df_train), df_train["labelAmbiance"]
x_validation_ambiance, y_validation_ambiance = _stack_review_vectors(df_validation), df_validation["labelAmbiance"]
x_test_ambiance, y_test_ambiance = _stack_review_vectors(df_test), df_test["labelAmbiance"]


In [None]:
# Define new x sets that also contain information about price and location of the restaurants

def _stack_with_expectations(frame, city_column):
    """Append the avgPrice column and the given encoded city column to the stacked review vectors."""
    review_vectors = np.vstack(frame["array"].values)
    price_and_city = frame[["avgPrice", city_column]].values
    return np.hstack((review_vectors, price_and_city))

# For the Food aspect
x_train_food_exp = _stack_with_expectations(df_train, "CityDummyFood")
y_train_food_exp = df_train["labelFood"]
x_validation_food_exp = _stack_with_expectations(df_validation, "CityDummyFood")
y_validation_food_exp = df_validation["labelFood"]
x_test_food_exp = _stack_with_expectations(df_test, "CityDummyFood")
y_test_food_exp = df_test["labelFood"]

# For the Service aspect
x_train_service_exp = _stack_with_expectations(df_train, "CityDummyService")
y_train_service_exp = df_train["labelService"]
x_validation_service_exp = _stack_with_expectations(df_validation, "CityDummyService")
y_validation_service_exp = df_validation["labelService"]
x_test_service_exp = _stack_with_expectations(df_test, "CityDummyService")
y_test_service_exp = df_test["labelService"]

# For the ambiance aspect
x_train_ambiance_exp = _stack_with_expectations(df_train, "CityDummyAmbiance")
y_train_ambiance_exp = df_train["labelAmbiance"]
x_validation_ambiance_exp = _stack_with_expectations(df_validation, "CityDummyAmbiance")
y_validation_ambiance_exp = df_validation["labelAmbiance"]
x_test_ambiance_exp = _stack_with_expectations(df_test, "CityDummyAmbiance")
y_test_ambiance_exp = df_test["labelAmbiance"]

In [None]:
# Randomly make train sets smaller for sets with experience included to make runtime hyperparameter tuning reasonable

# Seeded generator: without a seed every kernel restart draws different rows, so the
# hyperparameter search below could not be reproduced
subsample_rng = np.random.default_rng(68)

# At most 8000 rows per aspect (fewer when the train set itself is smaller,
# drawing 8000 rows without replacement would fail in that case)
subsample_size = min(8000, len(df_train))

# For the food aspect
random_indices_food = subsample_rng.choice(len(x_train_food_exp), size = subsample_size, replace = False)
x_train_food_exp_small = x_train_food_exp[random_indices_food]
y_train_food_exp_small = df_train["labelFood"].iloc[random_indices_food]

# For the service aspect
random_indices_service = subsample_rng.choice(len(x_train_service_exp), size = subsample_size, replace = False)
x_train_service_exp_small = x_train_service_exp[random_indices_service]
y_train_service_exp_small = df_train["labelService"].iloc[random_indices_service]

# For the ambiance aspect
random_indices_ambiance = subsample_rng.choice(len(x_train_ambiance_exp), size = subsample_size, replace = False)
x_train_ambiance_exp_small = x_train_ambiance_exp[random_indices_ambiance]
y_train_ambiance_exp_small = df_train["labelAmbiance"].iloc[random_indices_ambiance]

# Hyperparameter tuning for SVM models

In [None]:
# Random search to define parameters for SVM

# The parameter distributions for the random search
param_dist_SVM = {
    "C": uniform(loc = 0.1, scale = 100-0.1),
    "kernel": ["linear", "rbf", "poly", "sigmoid"],
    "class_weight": [None, "balanced", {0: 2, 1: 1}],
    "gamma": uniform(loc = 0.001, scale = 1 - 0.001),
    "degree": [2, 3, 4, 5, 6]
}

# Define the scorer for F1-score
scorer = make_scorer(f1_score, average='weighted')

# A list of the features and the targets (train data plus the validation data to score on)
feature_target_pairs_SVM = [
    {"X": x_train_food, "y": y_train_food, "X-test": x_validation_food, "y-test": y_validation_food},
    {"X": x_train_service, "y": y_train_service, "X-test": x_validation_service, "y-test": y_validation_service},
    {"X": x_train_ambiance, "y": y_train_ambiance, "X-test": x_validation_ambiance, "y-test": y_validation_ambiance},
    {"X": x_train_food_exp_small, "y": y_train_food_exp_small, "X-test": x_validation_food_exp, "y-test": y_validation_food_exp},
    {"X": x_train_service_exp_small, "y": y_train_service_exp_small, "X-test": x_validation_service_exp, "y-test": y_validation_service_exp},
    {"X": x_train_ambiance_exp_small, "y": y_train_ambiance_exp_small, "X-test": x_validation_ambiance_exp, "y-test": y_validation_ambiance_exp}
]

# Perform random search for each pair
for pair in feature_target_pairs_SVM:
    svm = SVC()
    random_search = RandomizedSearchCV(svm, param_dist_SVM, n_iter=10, cv=5, scoring = scorer, refit = True, random_state = 68)
    random_search.fit(pair["X"], pair["y"])
    print("Best parameters:", random_search.best_params_)
    print("Best cross-validation score:", random_search.best_score_)
    print()

    best_svm = random_search.best_estimator_
    # Score the refitted best model with the same weighted F1 scorer that selected it.
    # best_svm.score() would report accuracy, which cannot be compared with the
    # cross-validation score printed above.
    test_score = random_search.score(pair["X-test"], pair["y-test"])
    print("Validation set score:", test_score)
    print()

In [None]:
# Random search to define parameters for Random forests

# The parameter distributions for the randomized search
param_dist_RF = {
    "bootstrap": [True, False],
    "n_estimators": randint(10,800),
    "max_depth": [None] + list(range(5, 31, 5)),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20)
}

# Define the scorer for F1-score
scorer = make_scorer(f1_score, average='weighted')

# A list of the features and the targets (train data plus the validation data to score on)
feature_target_pairs_RF = [
    {"X": x_train_food, "y": y_train_food, "X-test": x_validation_food, "y-test": y_validation_food},
    {"X": x_train_service, "y": y_train_service, "X-test": x_validation_service, "y-test": y_validation_service},
    {"X": x_train_ambiance, "y": y_train_ambiance, "X-test": x_validation_ambiance, "y-test": y_validation_ambiance},
    {"X": x_train_food_exp_small, "y": y_train_food_exp_small, "X-test": x_validation_food_exp, "y-test": y_validation_food_exp},
    {"X": x_train_service_exp_small, "y": y_train_service_exp_small, "X-test": x_validation_service_exp, "y-test": y_validation_service_exp},
    {"X": x_train_ambiance_exp_small, "y": y_train_ambiance_exp_small, "X-test": x_validation_ambiance_exp, "y-test": y_validation_ambiance_exp}
]

# Perform randomized search for each pair
for pair in feature_target_pairs_RF:
    # Fixed random_state: an unseeded forest gives different scores on every run,
    # so the selected parameters could not be reproduced
    rf_classifier = RandomForestClassifier(random_state = 68)
    random_search = RandomizedSearchCV(rf_classifier, param_dist_RF, n_iter = 10, cv=5, scoring = scorer, refit = True, random_state = 68)
    random_search.fit(pair["X"], pair["y"])
    print("Best parameters:", random_search.best_params_)
    print("Best cross-validation score:", random_search.best_score_)
    print()

    best_rf = random_search.best_estimator_
    # Score the refitted best model with the same weighted F1 scorer that selected it.
    # best_rf.score() would report accuracy, which cannot be compared with the
    # cross-validation score printed above.
    test_score = random_search.score(pair["X-test"], pair["y-test"])
    print("Validation set score:", test_score)
    print()

# Run optimal models

In [None]:
# Combine the train and validation sets for each aspect:
# feature matrices are concatenated row-wise, label series are concatenated in the same order

# food aspect
x_train_combined_food = np.concatenate([x_train_food, x_validation_food], axis = 0)
y_train_combined_food = pd.concat([y_train_food, y_validation_food])
x_train_combined_food_exp = np.concatenate([x_train_food_exp, x_validation_food_exp], axis = 0)
y_train_combined_food_exp = pd.concat([y_train_food_exp, y_validation_food_exp])

# service aspect
x_train_combined_service = np.concatenate([x_train_service, x_validation_service], axis = 0)
y_train_combined_service = pd.concat([y_train_service, y_validation_service])
x_train_combined_service_exp = np.concatenate([x_train_service_exp, x_validation_service_exp], axis = 0)
y_train_combined_service_exp = pd.concat([y_train_service_exp, y_validation_service_exp])

# ambiance aspect
x_train_combined_ambiance = np.concatenate([x_train_ambiance, x_validation_ambiance], axis = 0)
y_train_combined_ambiance = pd.concat([y_train_ambiance, y_validation_ambiance])
x_train_combined_ambiance_exp = np.concatenate([x_train_ambiance_exp, x_validation_ambiance_exp], axis = 0)
y_train_combined_ambiance_exp = pd.concat([y_train_ambiance_exp, y_validation_ambiance_exp])

In [None]:
# SVM only review food

# Model with fixed hyperparameters
svm_food = SVC(kernel = "linear", C = 26.011, gamma = 0.148, degree = 4, class_weight = None)

# Fit on the combined train and validation sets, then predict the test set
svm_food_pred = svm_food.fit(x_train_combined_food, y_train_combined_food).predict(x_test_food)

# Performance of the model on the test set
class_rep_svm_food = classification_report(y_test_food, svm_food_pred)
print("Classification report SVM food: \n", class_rep_svm_food)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_svm_food = confusion_matrix(y_test_food, svm_food_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_svm_food, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - SVM Food')
plt.show()

In [None]:
# SVM only review service

# Model with fixed hyperparameters
svm_service = SVC(kernel = "linear", C = 26.011, gamma = 0.148, degree = 4, class_weight = None)

# Fit on the combined train and validation sets, then predict the test set
svm_service_pred = svm_service.fit(x_train_combined_service, y_train_combined_service).predict(x_test_service)

# Performance of the model on the test set
class_rep_svm_service = classification_report(y_test_service, svm_service_pred)
print("Classification report SVM service: \n", class_rep_svm_service)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_svm_service = confusion_matrix(y_test_service, svm_service_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_svm_service, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - SVM Service')
plt.show()

In [None]:
# SVM only review ambiance

# Model with fixed hyperparameters
svm_ambiance = SVC(kernel = "rbf", C = 54.3249, gamma = 0.93846, degree = 5, class_weight = None)

# Fit on the combined train and validation sets, then predict the test set
svm_ambiance_pred = svm_ambiance.fit(x_train_combined_ambiance, y_train_combined_ambiance).predict(x_test_ambiance)

# Performance of the model on the test set
class_rep_svm_ambiance = classification_report(y_test_ambiance, svm_ambiance_pred)
print("Classification report SVM ambiance: \n", class_rep_svm_ambiance)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_svm_ambiance = confusion_matrix(y_test_ambiance, svm_ambiance_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_svm_ambiance, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - SVM Ambiance')
plt.show()

In [None]:
# SVM food with expectations

# Model with fixed hyperparameters
svm_food_exp = SVC(kernel = "linear", C = 26.011, gamma = 0.148, degree = 4, class_weight = None)

# Fit on the combined train and validation sets, then predict the test set
svm_food_exp_pred = svm_food_exp.fit(x_train_combined_food_exp, y_train_combined_food_exp).predict(x_test_food_exp)

# Performance of the model on the test set
class_rep_svm_food_exp = classification_report(y_test_food_exp, svm_food_exp_pred)
print("Classification report SVM food with expectations: \n", class_rep_svm_food_exp)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_svm_food_exp = confusion_matrix(y_test_food_exp, svm_food_exp_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_svm_food_exp, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - SVM Food Expectations')
plt.show()

In [None]:
# SVM service with expectations

# Model with fixed hyperparameters
svm_service_exp = SVC(kernel = "linear", C = 26.011, gamma = 0.148, degree = 4, class_weight = None)

# Fit on the combined train and validation sets, then predict the test set
svm_service_exp_pred = svm_service_exp.fit(x_train_combined_service_exp, y_train_combined_service_exp).predict(x_test_service_exp)

# Performance of the model on the test set
class_rep_svm_service_exp = classification_report(y_test_service_exp, svm_service_exp_pred)
print("Classification report SVM service with expectations: \n", class_rep_svm_service_exp)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_svm_service_exp = confusion_matrix(y_test_service_exp, svm_service_exp_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_svm_service_exp, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - SVM Service Expectations')
plt.show()

In [None]:
# SVM ambiance with expectations

# Model with fixed hyperparameters (class 0 is weighted twice as heavily as class 1)
svm_ambiance_exp = SVC(kernel = "linear", C = 29.0139, gamma = 0.655, degree = 6, class_weight = {0: 2, 1: 1})

# Fit on the combined train and validation sets, then predict the test set
svm_ambiance_exp_pred = svm_ambiance_exp.fit(x_train_combined_ambiance_exp, y_train_combined_ambiance_exp).predict(x_test_ambiance_exp)

# Performance of the model on the test set
class_rep_svm_ambiance_exp = classification_report(y_test_ambiance_exp, svm_ambiance_exp_pred)
print("Classification report SVM ambiance with expectations: \n", class_rep_svm_ambiance_exp)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_svm_ambiance_exp = confusion_matrix(y_test_ambiance_exp, svm_ambiance_exp_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_svm_ambiance_exp, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - SVM Ambiance Expectations')
plt.show()

In [None]:
# RF only review food

# initiate model
# random_state is fixed so that the forest, and with it the reported scores, is the same on every run
rf_food = RandomForestClassifier(bootstrap = False, max_depth = 25, min_samples_leaf = 8, min_samples_split = 16, n_estimators = 548, random_state = 68)

# Train the model on the combined train and validation sets
rf_food.fit(x_train_combined_food, y_train_combined_food)

# make predictions on the test set
rf_food_pred = rf_food.predict(x_test_food)

# evaluate performance of the model on the test set
class_rep_rf_food = classification_report(y_test_food, rf_food_pred)

print("Classification report Random Forests food: \n", class_rep_rf_food)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_rf_food = confusion_matrix(y_test_food, rf_food_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf_food, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - Random Forests Food')
plt.show()

In [None]:
# RF only review service

# initiate model
# random_state is fixed so that the forest, and with it the reported scores, is the same on every run
rf_service = RandomForestClassifier(bootstrap = False, max_depth = 25, min_samples_leaf = 8, min_samples_split = 16, n_estimators = 548, random_state = 68)

# Train the model on the combined train and validation sets
rf_service.fit(x_train_combined_service, y_train_combined_service)

# make predictions on the test set
rf_service_pred = rf_service.predict(x_test_service)

# evaluate performance of the model on the test set
class_rep_rf_service = classification_report(y_test_service, rf_service_pred)

print("Classification report Random Forests service: \n", class_rep_rf_service)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_rf_service = confusion_matrix(y_test_service, rf_service_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf_service, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - Random Forests Service')
plt.show()

In [None]:
# RF only review ambiance

# initiate model
# random_state is fixed so that the forest, and with it the reported scores, is the same on every run
rf_ambiance = RandomForestClassifier(bootstrap = False, max_depth = 25, min_samples_leaf = 8, min_samples_split = 16, n_estimators = 548, random_state = 68)

# Train the model on the combined train and validation sets
rf_ambiance.fit(x_train_combined_ambiance, y_train_combined_ambiance)

# make predictions on the test set
rf_ambiance_pred = rf_ambiance.predict(x_test_ambiance)

# evaluate performance of the model on the test set
class_rep_rf_ambiance = classification_report(y_test_ambiance, rf_ambiance_pred)

print("Classification report Random Forests ambiance: \n", class_rep_rf_ambiance)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_rf_ambiance = confusion_matrix(y_test_ambiance, rf_ambiance_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf_ambiance, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - Random Forests Ambiance')
plt.show()

In [None]:
# RF food with expectations

# initiate model
# random_state is fixed so that the forest, and with it the reported scores, is the same on every run
rf_food_exp = RandomForestClassifier(bootstrap = False, max_depth = 15, min_samples_leaf = 9, min_samples_split = 13, n_estimators = 453, random_state = 68)

# Train the model on the combined train and validation sets
rf_food_exp.fit(x_train_combined_food_exp, y_train_combined_food_exp)

# make predictions on the test set
rf_food_exp_pred = rf_food_exp.predict(x_test_food_exp)

# evaluate performance of the model on the test set
class_rep_rf_food_exp = classification_report(y_test_food_exp, rf_food_exp_pred)

print("Classification report Random Forests food with expectations: \n", class_rep_rf_food_exp)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_rf_food_exp = confusion_matrix(y_test_food_exp, rf_food_exp_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf_food_exp, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - Random Forests Food Expectations')
plt.show()

In [None]:
# RF service with expectations

# initiate model
# random_state is fixed so that the forest, and with it the reported scores, is the same on every run
rf_service_exp = RandomForestClassifier(bootstrap = False, max_depth = 25, min_samples_leaf = 8, min_samples_split = 16, n_estimators = 548, random_state = 68)

# Train the model on the combined train and validation sets
rf_service_exp.fit(x_train_combined_service_exp, y_train_combined_service_exp)

# make predictions on the test set
rf_service_exp_pred = rf_service_exp.predict(x_test_service_exp)

# evaluate performance of the model on the test set
class_rep_rf_service_exp = classification_report(y_test_service_exp, rf_service_exp_pred)

print("Classification report Random Forests service with expectations: \n", class_rep_rf_service_exp)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_rf_service_exp = confusion_matrix(y_test_service_exp, rf_service_exp_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf_service_exp, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - Random Forests Service Expectations')
plt.show()

In [None]:
# RF ambiance with expectations

# initiate model
# random_state is fixed so that the forest, and with it the reported scores, is the same on every run
rf_ambiance_exp = RandomForestClassifier(bootstrap = False, max_depth = 25, min_samples_leaf = 8, min_samples_split = 16, n_estimators = 548, random_state = 68)

# Train the model on the combined train and validation sets
rf_ambiance_exp.fit(x_train_combined_ambiance_exp, y_train_combined_ambiance_exp)

# make predictions on the test set
rf_ambiance_exp_pred = rf_ambiance_exp.predict(x_test_ambiance_exp)

# evaluate performance of the model on the test set
class_rep_rf_ambiance_exp = classification_report(y_test_ambiance_exp, rf_ambiance_exp_pred)

print("Classification report Random Forests ambiance with expectations: \n", class_rep_rf_ambiance_exp)

In [None]:
# Confusion matrix of the test-set predictions (rows: true labels, columns: predicted labels)
cm_rf_ambiance_exp = confusion_matrix(y_test_ambiance_exp, rf_ambiance_exp_pred)
class_labels = ["Negative", "Positive"]

# Draw the matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf_ambiance_exp, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix - Random Forests Ambiance Expectations')
plt.show()