# Sorting sentiments of hotel reviews through machine learning

Welcome to my (admittedly very basic) machine learning project! Here we'll 
collect our own dataset of hotel reviews, then analyse the dataset, and lastly
build machine learning models to predict the sentiment of each review!

## Imports

Before we begin, let's settle all our imports!

In [None]:
# Essentials
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud

import glob, os, subprocess
import zipfile

# Apply seaborn's dark-grid theme, sized for notebooks, to the plots drawn below
sns.set_theme(style="darkgrid", context="notebook")
# IPython magic (not plain Python): selects matplotlib's inline backend
%matplotlib inline

In [None]:
# NLP
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from langdetect import detect


from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
# Data processing
from funcsigs import signature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    # auc,
    confusion_matrix,
    classification_report,
    f1_score,
    average_precision_score,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
)

# Preparation
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
)

# Models
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import (
    LogisticRegression,
    SGDClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

## Collecting our dataset

Let's start by collecting our dataset of hotel reviews. We're using Datafiniti's
[Hotel Reviews](https://www.kaggle.com/datasets/datafiniti/hotel-reviews) dataset
from Kaggle.

We're starting by analysing each review separately. We **tokenise** each
review (split it into individual words) and rank each token's sentiment on a numerical
scale using the [VADER](https://github.com/cjhutto/vaderSentiment) lexicon.

Before you run this code, make sure you've got the Kaggle command-line API
installed.

In [None]:
# One-off download of the dataset (needs the Kaggle command-line tool); run either:
# !kaggle d download datafiniti/hotel-reviews --force
# subprocess.run(["kaggle", "d", "download", "datafiniti/hotel-reviews", "--force"])

# dirname("") is "", and abspath("") resolves to the current working directory
_here = os.path.abspath(os.path.dirname(""))

# Input CSVs are read from here
datadir = os.path.join(_here, "../data/")

# Figures and the metrics CSV are written here
resultsdir = os.path.join(_here, "../results/")

# Later cells save straight into resultsdir, so make sure it exists up front
os.makedirs(resultsdir, exist_ok=True)

# One-off extraction of the downloaded archive into datadir:
# with zipfile.ZipFile('hotel-reviews.zip', 'r') as zipdata:
#     zipdata.extractall(datadir)

In [None]:
# Columns dropped before analysis: hotel location/identity, review dates and
# reviewer details
unused_columns = [
    "address",
    "categories",
    "city",
    "country",
    "latitude",
    "longitude",
    "name",
    "postalCode",
    "province",
    "reviews.date",
    "reviews.dateAdded",
    "reviews.doRecommend",
    "reviews.id",
    "reviews.userCity",
    "reviews.username",
    "reviews.userProvince",
]

# Load the raw export, discard the unused columns, then keep only the rows
# that have a value in every remaining column
df = (
    pd.read_csv(os.path.join(datadir, "7282_1.csv"))
    .drop(columns=unused_columns)
    .dropna()
)

df.head(10)

We can see that the reviews at indices `7`, `8` and `9` aren't in English. We can
use `langdetect` to help us detect the language of the review, and then remove the 
ones that aren't in English.

In [None]:
def detect_lang(text):
    """Return the language code detected for ``text``, or ``None``.

    Args:
        text: The review text to classify.

    Returns:
        Whatever ``detect`` returns for ``text`` (e.g. ``"en"``), or ``None``
        when the input is blank, is not a string, or ``detect`` raises.
    """
    # Nothing to detect in blank or non-string input
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: a review that cannot be classified is
        # reported as None and filtered out by the caller
        return None


# langdetect's results can differ between runs unless its seed is pinned,
# which would change which reviews survive the filter below
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# Detect the language on the review body and title together
df["reviews.all"] = df["reviews.text"] + " " + df["reviews.title"]
df["reviews.language"] = df["reviews.all"].apply(detect_lang)
# Keep only the reviews detected as English
df = df[df["reviews.language"] == "en"]
df.head(10)

## Lexicon-based sentiment analysis

We can start using our lexicon to get a numerical value for each of the reviews'
sentiments. Let's create an instance of the VADER lexicon analyser and run it through
each and every review.

In [None]:
# nltk.download("vader_lexicon")  # one-off: uncomment to fetch the lexicon
analyzer = SentimentIntensityAnalyzer()


def _compound_score(review):
    """Return the "compound" entry of VADER's polarity scores for one review."""
    return analyzer.polarity_scores(review)["compound"]


df["reviews.score"] = df["reviews.all"].apply(_compound_score)

# Keep a review only if
#  - its score is non-zero (a score of exactly 0 is treated as "not English"), and
#  - its rating is at most 5 (ratings above 5 are taken to be on a 1-10 scale)
scored = df["reviews.score"] != 0.0
five_point = df["reviews.rating"] <= 5.0
df = df.loc[scored & five_point]

# Turn star ratings into a polarity: 1 (above the benchmark), 0 (equal to it),
# -1 (below it)
max_rating = df["reviews.rating"].max()
# NOTE(review): np.round rounds halves to the nearest even value, so a maximum
# of 5.0 gives a benchmark of 2.0, not 3.0 - confirm this is intended
mid_rating = np.round(0.5 * max_rating)
# NOTE(review): this can produce three labels, while later cells use
# class-weight keys 0/1 and two tick labels - verify the label set
df["reviews.polarity"] = np.sign(df["reviews.rating"] - mid_rating)
df.head(10)

Let's check out how many positive, neutral and negative reviews we have, based on
the polarity we've just derived from each review's star rating.

In [None]:
# Count the reviews in each polarity class and draw one bar per class
x_counts = df["reviews.polarity"].value_counts()
polarity_axes = sns.barplot(x=x_counts.index, y=x_counts)
polarity_axes.set_title("Proportion of reviews by sentiment")

## Preprocessing

Now let's **tokenize** (split our reviews into words) and **stem**
(reduce words to their root forms) our reviews, so that each review becomes
a string of lowercase words in their root forms.

This should make it much easier to turn them into numbers later.

In [None]:
# Collect words we want to exclude
nltk.download("stopwords")
ickwords = nltk.corpus.stopwords.words("english")
# Set copy of the stop words: membership is tested once per token, and a set
# lookup is constant-time where the list scan was linear
_ickword_set = frozenset(ickwords)

# NOTE: lemmatizer is created here but not used in this cell; cleanup() stems
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

# A token is a run of word characters and apostrophes; compiled once, reused
# for every review
_token_pattern = re.compile(r"[\w']+")


def cleanup(review: str) -> str:
    """Reduce a review to a space-separated string of lowercase word stems.

    The review is split into tokens and lowercased. Tokens that are English
    stop words, or that are not purely alphabetic (``str.isalpha``, which
    rejects digits, underscores and apostrophes), are dropped. The remaining
    tokens are stemmed with the Porter stemmer.

    Args:
        review: Raw review text.

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives.
    """
    stems = [
        porter.stem(token)  # stemming, not lemmatization
        for token in map(str.lower, _token_pattern.findall(review))
        if token.isalpha() and token not in _ickword_set
    ]
    return " ".join(stems)


df["reviews.clean"] = df["reviews.all"].apply(cleanup)
df.tail(10)

## Viewing tokens

Let's take a look at our tokens by way of word clouds: we'll examine the most
prominent word stems (labelled "lemmas" in the code below) that appear in the reviews.

In [None]:
# Continuous "crest" colour map; the confusion-matrix heatmaps use it as well
palette = sns.color_palette("crest", as_cmap=True)

# One long string holding every cleaned review, which is what WordCloud expects
lemmas = " ".join(df["reviews.clean"])

cloud_options = dict(
    # font_path="clear sans",
    background_color="white",
    colormap="crest",
    width=1280,
    height=960,
    collocations=False,
)
cloud = wordcloud.WordCloud(**cloud_options)
cloud.generate(lemmas)

# Render the cloud without axes and save it alongside the other results
plt.imshow(cloud, interpolation="bilinear")
plt.title("Word cloud (all lemmas)")
plt.axis("off")
wordcloud_path = os.path.join(resultsdir, "wordcloud.png")
plt.savefig(wordcloud_path)

## Vectorization

Let's set our independent and dependent variables: the cleaned review
and the review's polarity respectively.

Now let's extract numerical representations for each of our reviews. We'll
use TF-IDF values (term frequency-inverse document frequency values), which weigh
how often a token appears in a review against how common that token is across
all of the reviews.

In [None]:
# Independent variable: the cleaned review text; dependent variable: its polarity
x, y = df["reviews.clean"], df["reviews.polarity"]

x.head(10)

Now let's vectorize our reviews: turn them into numbers.

In [None]:
# Fit a TF-IDF vectorizer on the cleaned reviews and encode them in one pass
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=x)

## Model training

The time is finally right to begin training our models! Let's first split
the data we have into training and testing sets.

In [None]:
# Split the cleaned text itself, not the pre-computed TF-IDF matrix. The
# partition depends only on the number of samples, y and random_state, so the
# same reviews land in each set as when splitting X
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=69, stratify=y
)

# Fit TF-IDF on the training reviews only, so no statistics from the test
# reviews leak into the features, then encode the test reviews with that
# fitted vectorizer. This refits `vectorizer`, so the earlier full-corpus
# matrix X no longer shares its columns with X_train / X_test
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

### Class weights

As you've probably seen before, the number of positive reviews far exceeds the
number of negative reviews. If we trained our model with our current data,
we might end up with a biased model: one that leans towards predicting
the majority class (positive).

Thus, we'll assign a weight to each class: a higher weight for the minority class,
and a lower weight for the majority class. To see which weights are best,
let's construct a classic logistic regression model to find our best weights.

To do this we'll make a large collection of possible weights our majority class
could take, from $0$ to $1$. Then we'll use grid searching to find the class
weight that produces the best F1 score.

In [None]:
# Baseline logistic regression with automatically balanced class weights.
# This fitted `lr` is the object later cells call predict() on
lr = LogisticRegression(solver="newton-cg", class_weight="balanced")
lr.fit(X_train, y_train)

# 1000 candidate weights for class 0; class 1 always gets the complement
potential_weights = np.linspace(0.0, 0.999, 1000)
candidate_class_weights = [
    {0: weight, 1: 1.0 - weight} for weight in potential_weights
]
param_grid = {"class_weight": candidate_class_weights}

# Score every candidate by F1 with stratified k-fold cross-validation
gridsearch = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=StratifiedKFold(),
    n_jobs=-1,
    scoring="f1",
    verbose=2,
)
gridsearch.fit(X_train, y_train)

# One row per candidate: mean F1 against the weight given to class 1, which
# this notebook treats as the majority class
weight_tests = pd.DataFrame(
    {
        "f1": gridsearch.cv_results_["mean_test_score"],
        "majority_weight": 1 - potential_weights,
    }
)
f1_axes = sns.lineplot(data=weight_tests, x="majority_weight", y="f1")
f1_axes.set_title(
    "F1 score for logistic regression model against majority class weight"
)
plt.savefig(os.path.join(resultsdir, "logistic_f1.png"))

We want the weight that helps us get the highest F1 score.
Let's find the maximum with a bit of code.

In [None]:
# Highest mean F1 reached by any candidate weight
f1_max = weight_tests["f1"].max()

# Majority-class weight of the first row that reaches that F1
reaches_max = weight_tests["f1"].eq(f1_max)
best_weight = weight_tests.loc[reaches_max, "majority_weight"].to_numpy()[0]

best_weight

In [None]:
# Class 1 receives the best weight found above; class 0 receives the remainder
minority_weight = 1.0 - best_weight
weights = {0: minority_weight, 1: best_weight}
weights

In [None]:
def confused(truths, predictions, label: str, ax=None):
    """Draw a confusion-matrix heatmap and return the raw confusion matrix.

    Args:
        truths: True class labels.
        predictions: Predicted class labels, aligned with ``truths``.
        label: Model name shown in the plot title.
        ax: Matplotlib axes to draw on. When ``None``, seaborn draws on the
            currently active axes.

    Returns:
        The matrix of raw counts produced by ``confusion_matrix``.
    """
    confusion = confusion_matrix(truths, predictions)
    # sns.heatmap returns the Axes it drew on, so one code path labels the
    # plot whether or not `ax` was supplied
    drawn_on = sns.heatmap(
        # Each cell is shown as a share of all samples, not of its row
        confusion / np.sum(confusion),
        fmt=".1%",
        annot=True,
        cmap=palette,
        cbar=False,
        # NOTE(review): assumes exactly two classes whose sorted order is
        # negative, positive - confirm against the labels in use
        xticklabels=["negative", "positive"],
        yticklabels=["negative", "positive"],
        ax=ax,
    )
    drawn_on.set_title(f"Confusion matrix ({label})")
    drawn_on.set_xlabel("Predicted sentiment")
    # Rows are the true classes; the ax=None path used to mislabel this axis
    # as "Predicted sentiment"
    drawn_on.set_ylabel("True sentiment")
    return confusion


# Predict the test split with the baseline logistic regression, then plot and
# save its confusion matrix
lr_preds = lr.predict(X_test)
confused(truths=y_test, predictions=lr_preds, label="logistic regression")
logistic_confusion_path = os.path.join(resultsdir, "logistic_confuse.png")
plt.savefig(logistic_confusion_path)

In [None]:
def precision_recall(truths, predictions, label: str, ax):
    """Draw a precision-recall curve for the given truths and predictions.

    Args:
        truths: True class labels.
        predictions: Predicted labels or scores, aligned with ``truths``.
        label: Model name shown in the plot title.
        ax: Matplotlib axes to draw on.
    """
    # Standard-library equivalent of the funcsigs backport, imported locally
    # so this function does not depend on the third-party package
    from inspect import signature

    avg_precision = average_precision_score(truths, predictions, pos_label=1)
    precision, recall, _ = precision_recall_curve(truths, predictions)

    # Only pass `step` if fill_between accepts it. The check is made on the
    # method that is actually called below
    if "step" in signature(ax.fill_between).parameters:
        step_kwargs = {"step": "post"}
    else:
        step_kwargs = {}

    ax.step(
        recall,
        precision,
        where="post",
    )
    ax.fill_between(recall, precision, alpha=0.5, **step_kwargs)
    # Dashed horizontal marker at the average precision
    ax.axhline(
        y=avg_precision,
        label=f"Avg.: {avg_precision:.3f}",
        linestyle="--",
        color=(0.8666666666666667, 0.5176470588235295, 0.3215686274509804),
    )

    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_ylim(0.0, 1.0)
    ax.set_xlim(0.0, 1.0)
    ax.set_title(f"Precision-recall curve ({label})")
    ax.legend(loc="best")

In [None]:
def roc(truths, predictions, label: str, ax):
    """Draw a ROC curve, with its area and a diagonal reference line, on ``ax``.

    Args:
        truths: True class labels.
        predictions: Predicted labels or scores, aligned with ``truths``.
        label: Model name shown in the plot title.
        ax: Matplotlib axes to draw on.
    """
    accent = (0.8666666666666667, 0.5176470588235295, 0.3215686274509804)
    unit_range = [0.0, 1.0]

    false_pos_rate, true_pos_rate, _thresholds = roc_curve(truths, predictions)
    area = roc_auc_score(truths, predictions)

    # The model's curve, with the area under it in the legend
    ax.plot(false_pos_rate, true_pos_rate, color=accent, label=f"Area: {area:.2f}")
    # Dashed diagonal from (0, 0) to (1, 1) for comparison
    ax.plot([0, 1], [0, 1], linestyle="--", label="Random classifier")

    ax.set_xlim(unit_range)
    ax.set_ylim(unit_range)
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title(f"ROC curve ({label})")
    ax.legend(loc="best")

### Random forest classifier

Let's first create a random forest classifier, a classification model
commonly used for text classification tasks like this one. It's quite popular
because it often yields high accuracy.

In [None]:
# 100 trees, using the class weights found by the grid search. The seed is
# pinned (same value as the train/test split) so the reported metrics can be
# reproduced from run to run
rf = RandomForestClassifier(
    n_estimators=100, class_weight=weights, random_state=69
)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

Let's evaluate it on a few metrics. First, let's see a report on how it did
classifying our test data.

In [None]:
# Text report of the random forest's per-class metrics on the test split
rf_report = classification_report(y_test, rf_preds)
print(rf_report)

Quite good! We have an F1 score of $0.94$ for the positive reviews, and high
precisions for both classes. Now let's see a confusion matrix.

In [None]:
# Raw confusion-matrix counts for the random forest
rf_confusion = confusion_matrix(y_true=y_test, y_pred=rf_preds)
rf_confusion

In [None]:
# Plot the random forest's confusion matrix and save the figure
confused(truths=y_test, predictions=rf_preds, label="random forest")
rf_confusion_path = os.path.join(resultsdir, "rndforst_confuse.png")
plt.savefig(rf_confusion_path)

### Support vector machine (SVM)

Let's try a support vector machine next! They're also quite widely used within
the realm of text classification.

In [None]:
# Hinge loss makes SGDClassifier a linear support vector machine. The seed is
# pinned (same value as the train/test split) so the fitted model and its
# metrics can be reproduced
svm = SGDClassifier(loss="hinge", class_weight=weights, random_state=69)
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)

Let's again test it with our metrics:

In [None]:
# Text report of the SVM's per-class metrics on the test split
svm_report = classification_report(y_test, svm_preds)
print(svm_report)

In [None]:
# Plot the SVM's confusion matrix and save the figure
confused(truths=y_test, predictions=svm_preds, label="support vector machine")
svm_confusion_path = os.path.join(resultsdir, "svm_confuse.png")
plt.savefig(svm_confusion_path)

### Naive Bayes classifier

The Naive Bayes classifier is a probabilistic machine learning model based on Bayes' theorem.
It's not that well-known, but has been shown to produce decent results.

In [None]:
naive = BernoulliNB()
# BernoulliNB has no class_weight option, so express the class weights as one
# weight per training sample. The old scalar `weights[0]` gave every sample
# the same weight. Labels missing from `weights` keep a weight of 1
naive_sample_weight = np.array(
    [weights.get(label, 1.0) for label in y_train]
)
# Fit on the sparse matrix directly, as predict() below already does,
# instead of building a dense copy
naive.fit(X_train, y_train, sample_weight=naive_sample_weight)
naive_preds = naive.predict(X_test)

You know the drill. Let's make a report and a confusion matrix.

In [None]:
# Text report of the naive Bayes model's per-class metrics on the test split
naive_report = classification_report(y_test, naive_preds)
print(naive_report)

In [None]:
# Plot the naive Bayes confusion matrix and save the figure
confused(truths=y_test, predictions=naive_preds, label="Naive Bayes")
naive_confusion_path = os.path.join(resultsdir, "nb_confuse.png")
plt.savefig(naive_confusion_path)

### K-nearest neighbours

Lastly, let's round this classification exercise off with a K-nearest neighbours
(KNN) classifier. It's commonly used for all kinds of classification problems,
so it's definitely worth a shot.

KNN doesn't use numerical weights as we've been doing for other classification models,
but we can weigh points based on the inverse of their distance. Let's use that then!

In [None]:
# Five nearest neighbours, each vote weighted by the inverse of its distance
knn = KNeighborsClassifier(weights="distance", n_neighbors=5)
# fit() returns the estimator itself, so prediction can be chained onto it
knn_preds = knn.fit(X_train, y_train).predict(X_test)

Again, let's use the same metrics: a report and a confusion matrix:

In [None]:
# Text report of the KNN model's per-class metrics on the test split
knn_report = classification_report(y_test, knn_preds)
print(knn_report)

In [None]:
# Plot the KNN confusion matrix and save the figure
confused(truths=y_test, predictions=knn_preds, label="K-nearest neighbours")
knn_confusion_path = os.path.join(resultsdir, "knn_confuse.png")
plt.savefig(knn_confusion_path)

### K-means clustering

We'll now use an **unsupervised classification** technique to classify the reviews.
K-means clustering is one of the more popular choices.

In [None]:
# Two clusters with randomly initialised centroids; the seed is pinned (same
# value as the train/test split) so the clustering can be reproduced
kmeans = KMeans(n_clusters=2, init="random", random_state=69)
kmeans.fit(X_train)

# K-means numbers its clusters arbitrarily, so a cluster id cannot be compared
# with a class label directly. Give each cluster the most common true label
# among the training reviews assigned to it
cluster_to_label = (
    pd.Series(np.asarray(y_train))
    .groupby(kmeans.labels_)
    .agg(lambda labels: labels.mode().iloc[0])
    .to_dict()
)

# Predict a cluster for each test review, then translate it to a class label
kmeans_preds = np.array(
    [cluster_to_label[cluster] for cluster in kmeans.predict(X_test)]
)
confused(y_test, kmeans_preds, "K-means clustering")
plt.savefig(os.path.join(resultsdir, "kmeans_confuse.png"))

## Consolidating our metrics

Finally, let's get as many metrics as we want from all our models, and then export
them.

In [None]:
# Each fitted estimator mapped to its predictions on the test split
models = {
    lr: lr.predict(X_test),
    rf: rf_preds,
    svm: svm_preds,
    knn: knn_preds,
    naive: naive_preds,
    kmeans: kmeans_preds,
}


def approximate(x):
    """Round a metric to three decimal places for the summary table."""
    return round(x, 3)


# One row per model, in the insertion order of `models`: the estimator's
# class name followed by its F1, ROC AUC and average precision
model_metrics = pd.DataFrame(
    data={
        "name": [type(model).__name__ for model in models],
        "f1": [
            approximate(f1_score(y_test, preds)) for preds in models.values()
        ],
        "auroc": [
            approximate(roc_auc_score(y_test, preds))
            for preds in models.values()
        ],
        "averageprecision": [
            approximate(average_precision_score(y_test, preds))
            for preds in models.values()
        ],
    }
)

model_metrics

In [None]:
f, axes = plt.subplots(2, 3, figsize=(18, 12))

# axes.flat walks the 2x3 grid row by row, giving each model its own panel
for panel, (model, preds) in zip(axes.flat, models.items()):
    precision_recall(y_test, preds, type(model).__name__, panel)
f.suptitle("Precision-recall curves")
plt.savefig(os.path.join(resultsdir, "all_prc.png"))

In [None]:
f, axes = plt.subplots(2, 3, figsize=(18, 12))

# axes.flat walks the 2x3 grid row by row, giving each model its own panel
for panel, (model, preds) in zip(axes.flat, models.items()):
    confused(y_test, preds, type(model).__name__, panel)
f.suptitle("Confusion matrices")
plt.savefig(os.path.join(resultsdir, "all_confuse.png"))

In [None]:
f, axes = plt.subplots(2, 3, figsize=(18, 12))

# axes.flat walks the 2x3 grid row by row, giving each model its own panel
for panel, (model, preds) in zip(axes.flat, models.items()):
    roc(y_test, preds, type(model).__name__, panel)
f.suptitle("Receiver operating characteristic curves")
plt.savefig(os.path.join(resultsdir, "all_roc.png"))

In [None]:
# Remove old model metrics
# Delete every metrics export left over from earlier runs
stale_pattern = os.path.join(resultsdir, "model_metrics_*")
for stale_path in glob.glob(stale_pattern):
    os.remove(stale_path)

# Write the new metrics table, stamped with today's date as DDMMYYYY
today = pd.to_datetime("today").strftime("%d%m%Y")
metrics_path = os.path.join(resultsdir, f"model_metrics_{today}.csv")
model_metrics.to_csv(metrics_path, index=False)