# Research Question 3: Logistic Regression

Here, we build a simple logistic regression model to classify hotel reviews
into two basic categories: those with negative and those with positive sentiment.

In [None]:
%matplotlib inline

import os

import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(context="talk", style="white")  # , font="serif")


# natural language processing
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string

# machine learning imports
from funcsigs import signature
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    auc,
    average_precision_score,
    precision_recall_curve,
    roc_curve,
    confusion_matrix,
    f1_score
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# locate the data directory relative to the current working directory
# and load the combined sentiment dataset
DATADIR = os.path.join(os.path.abspath(""), "../data")
DF = pd.read_csv(
    os.path.join(DATADIR, "combined_sentiments.csv"),
    sep=",",
    header=0,
    on_bad_lines="skip",  # malformed rows are dropped rather than raising
)

# NLTK's English stop-word list, used when cleaning the reviews
STOP = stopwords.words("english")


# lemmatise
def get_wordnet_pos(pos_tag):
    """Translate a part-of-speech tag into the matching WordNet constant.

    The tag is matched on its first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag
    falls back to noun.
    """
    prefix_to_wordnet = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # unrecognised tags are treated as nouns
    return wordnet.NOUN


def check_digits(text):
    """Return True when at least one character of ``text`` is a digit."""
    for character in text:
        if character.isdigit():
            return True
    return False


def clean_review(review):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order:

    1. lower-case the text and split it on whitespace;
    2. strip leading/trailing punctuation from every token;
    3. drop tokens that are empty, contain a digit, or appear in ``STOP``;
    4. part-of-speech tag the remaining tokens and lemmatise each one
       with the WordNet part of speech derived from its tag;
    5. drop lemmas that are a single character long.

    Parameters
    ----------
    review : object
        The raw review. Anything that is not already a string is passed
        through ``str``; ``None`` and float NaN are treated as an empty
        review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly ``""``).
    """
    # a missing value would otherwise be stringified ("none" / "nan") and
    # survive the pipeline as a bogus token; NaN is the only float that
    # is not equal to itself
    if review is None or (isinstance(review, float) and review != review):
        return ""

    # build these once per call instead of once per token
    stop_words = set(STOP)
    lemmatiser = WordNetLemmatizer()

    # split() with no argument breaks on any run of whitespace, so tabs and
    # newlines inside a review separate tokens just like spaces do
    tokens = [
        word.strip(string.punctuation)
        for word in str(review).lower().split()
    ]

    # remove empty tokens, tokens containing digits, and stop words
    tokens = [
        token
        for token in tokens
        if token
        and not check_digits(token)
        and token not in stop_words
    ]

    # tag each token with its part of speech (pos), then lemmatise it
    lemmas = [
        lemmatiser.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # remove words with only one letter
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)


# store a cleaned, tokenised and lemmatised version of every review
# alongside the original text
DF["reviews.clean"] = DF["reviews.text"].apply(clean_review)
REVIEWS_CLEAN = DF["reviews.clean"]

# the same cleaned reviews as a plain Python list (one string per review)
REVIEWS_ALL = REVIEWS_CLEAN.to_list()

# TF-IDF

Let's get the term frequency-inverse document frequencies for each token
in each review.

The **raw term frequency** for a token in a document (a review) is $\text{tf}\left(t, d\right)$,
where $t$ is a term (a token) and $d$ is a document.
The number of documents in which term $t$ appears is $\text{df}\left(t, d\right)$,
and the inverse document frequency is $\text{idf}\left(t, d\right) = \log \frac{n_d}{1 + \text{df}\left(t, d\right)}$, where $n_d$ is the total number of documents.

The TF-IDF is thus $\text{tfidf}\left(t, d\right) = \text{tf}\left(t, d\right)\text{idf}\left(t, d\right)$.

In [None]:
# create a feature vector that counts the number of times each word appears in the review
COUNT_VECTORISER = CountVectorizer()
WORDBAG = COUNT_VECTORISER.fit_transform(REVIEWS_ALL)

# preview only the first few (token -> column index) pairs, ordered by
# column index; the full vocabulary is far too long to display usefully
dict(
    sorted(
        COUNT_VECTORISER.vocabulary_.items(),
        key=lambda item: item[1],
    )[:20]
)

In [None]:
# create a tfidf transformer instance: idf weighting on, smoothed idf,
# sublinear term-frequency scaling, and l2-normalised rows
TFIDF_TRANSFORM = TfidfTransformer(
    use_idf=True,
    norm="l2",
    smooth_idf=True,
    sublinear_tf=True,
)


# ...and feed it the data. The result is left as a sparse matrix: calling
# .toarray() would allocate a dense (documents x vocabulary) array just to
# display it, which can exhaust memory on a large corpus. The sparse repr
# still reports the shape and the number of stored values.
TFIDF_TRANSFORM.fit_transform(WORDBAG)

# Preparing to train

Let's split our datasets, with the independent variable being
the cleaned text of each review (the pipeline below converts it into TF-IDF
features) and the dependent variable being the polarity
of the review (positive or negative).

We'll also create a **parameter grid** for the model to select from, to determine
the best parameters to train the model on.

In [None]:
# features are the cleaned review strings; the target is the polarity column
X = REVIEWS_CLEAN
y = DF["sent.polarity"]

# hold out 20% of the data for testing, then set aside 25% of what is left
# for validation; both splits are stratified on the target
X_t, X_test, y_t, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_t, y_t, test_size=0.25, stratify=y_t, random_state=1
)

# parameter grid for the model to pick the best params from: the two
# sub-grids share every setting, and the second additionally switches off
# idf weighting and normalisation in the vectoriser
PARAM_GRID = [
    {
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [STOP, None],
        **vectoriser_overrides,
        "clf__penalty": ["l1", "l2"],
        "clf__C": [1.0, 10.0, 100.0],
    }
    for vectoriser_overrides in (
        {},
        {"vect__use_idf": [False], "vect__norm": [None]},
    )
]

In [None]:
# the reviews are already lower-cased and cleaned, so the vectoriser is
# told not to lower-case or preprocess them again
TFIDF_VECTORISER = TfidfVectorizer(
    strip_accents=None, lowercase=False, preprocessor=None
)
pipeline = Pipeline(
    [
        ("vect", TFIDF_VECTORISER),
        # PARAM_GRID searches over both "l1" and "l2" penalties. The default
        # "lbfgs" solver does not support "l1", so every l1 candidate would
        # fail to fit (and, with warnings silenced, fail quietly).
        # "liblinear" supports both penalties.
        (
            "clf",
            LogisticRegression(random_state=42, solver="liblinear"),
        ),
    ]
)

# 5-fold cross-validated search over PARAM_GRID, scored on accuracy and
# run on all available cores
gridsearch = GridSearchCV(
    pipeline,
    PARAM_GRID,
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-1,
)
gridsearch.fit(X_train, y_train)

# Evaluating Accuracy

Next, we can calculate the best cross-validated accuracy and the accuracy on
the test set. We can also plot two graphs, the precision-recall curve and the
receiver operating characteristic curve, to evaluate the model's performance.

In [None]:
# the best pipeline found by the grid search
clf = gridsearch.best_estimator_

# mean cross-validated score of that pipeline, then its score on the
# held-out test set
print(f"best accuracy: {gridsearch.best_score_:.5f}")
print(f"accuracy in test: {clf.score(X_test, y_test):.5f}")

# Making Predictions

We can ask the model to create a list of predictions for us from
values in the dataset. We can then calculate metrics such as the
false positive and true positive rate, which will aid us in plotting
the receiver operating characteristic graph later.

In [None]:
# hard class predictions for the validation reviews
PREDS = clf.predict(X_val)

# take an explicit copy: to_numpy() may hand back a view of the Series'
# data, in which case the in-place recoding below would either silently
# rewrite y_val or fail on a read-only array
ACTUALS = y_val.to_numpy(copy=True)

# standardise to binary classifications (negative: -1 -> 0, positive stays 1)
ACTUALS[ACTUALS == -1] = 0
PREDS[PREDS == -1] = 0

## Receiver Operating Characteristic

The receiver operating characteristic curve plots the rate of true positive predictions made
against the rate of false positive predictions as the decision threshold varies. The dotted line shows
a scenario where the two rates are equal, as expected of a random classifier, and the plotted curve shows how well
the model fares against it. The higher the curve lies above the dotted line,
the better the model separates the two classes.

In [None]:
RESULTSDIR = os.path.join(
    os.path.abspath(os.path.dirname("")),
    "../results/",
)
# savefig does not create missing directories, so make sure it exists
os.makedirs(RESULTSDIR, exist_ok=True)

# A ROC curve is traced by sweeping a threshold over a continuous score.
# Feeding it hard 0/1 predictions collapses the curve to a single operating
# point, so use the predicted probability of the positive class (label 1).
VAL_SCORES = clf.predict_proba(X_val)[
    :, list(clf.classes_).index(1)
]

FALSE_POS_RATE, TRUE_POS_RATE, thresholds = roc_curve(
    ACTUALS, VAL_SCORES, pos_label=1
)
roc_auc = auc(FALSE_POS_RATE, TRUE_POS_RATE)

plt.plot(
    FALSE_POS_RATE,
    TRUE_POS_RATE,
    label=f"ROC (Area: {roc_auc:.3f})",
)
# diagonal reference: true and false positive rates are equal
plt.plot(
    [0, 1],
    [0, 1],
    linestyle="--",
    label="Random classifier",
)

plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC (logistic regression)")
plt.legend(loc="best")
plt.tight_layout()
plt.savefig(os.path.join(RESULTSDIR, "logistic_roc.png"))

# Precision-recall

> Precision-recall is a useful measure of success of prediction when the classes are very imbalanced. In information retrieval, precision is a measure of result relevancy, while recall is a measure of how many truly relevant results are returned.
> The precision-recall curve shows the tradeoff between precision and recall for different thresholds.

In [None]:
# Precision-recall curves, like ROC curves, are traced by sweeping a
# threshold over a continuous score; hard 0/1 predictions give a single
# operating point. Use the predicted probability of the positive class
# (label 1) instead.
VAL_SCORES = clf.predict_proba(X_val)[
    :, list(clf.classes_).index(1)
]

AVG_PRECISION = average_precision_score(
    ACTUALS, VAL_SCORES, pos_label=1
)
PRECISION, RECALL, _ = precision_recall_curve(
    ACTUALS, VAL_SCORES
)
# fill_between accepts `step` directly, so no signature probing is needed;
# the name is kept in case other cells rely on it
step_kwargs = {"step": "post"}

plt.step(
    RECALL,
    PRECISION,
    where="post",
)
plt.fill_between(
    RECALL, PRECISION, alpha=0.5, **step_kwargs
)
# horizontal reference line at the average precision
plt.axhline(
    y=AVG_PRECISION,
    label=f"Avg.: {AVG_PRECISION:.3f}",
    linestyle="--",
    color=(0.8666666666666667, 0.5176470588235295, 0.3215686274509804),
)

plt.xlabel("Recall")
plt.ylabel("Precision")
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.title("Precision-recall (logistic regression)")
plt.legend(loc="best")
plt.tight_layout()
# savefig does not create missing directories, so make sure it exists
os.makedirs(RESULTSDIR, exist_ok=True)
plt.savefig(os.path.join(RESULTSDIR, "logistic_prc.png"))

# Confusion matrix

We plot the number of true and false positive predictions, as well as
the number of true and false negative predictions.

In [None]:
# counts of validation reviews for every (true class, predicted class) pair
confusion = confusion_matrix(y_true=ACTUALS, y_pred=PREDS)
confusion

In [None]:
# show each cell as a share of all validation reviews
sns.heatmap(
    confusion / confusion.sum(),
    annot=True,
    fmt=".2%",
    xticklabels=["negative", "positive"],
    yticklabels=["negative", "positive"],
    cmap=sns.diverging_palette(230, 20, as_cmap=True),
)
plt.title("Confusion matrix (logistic regression)")
plt.xlabel("Predicted sentiment")
plt.ylabel("True sentiment")
plt.tight_layout()
plt.savefig(
    os.path.join(RESULTSDIR, "logistic_confuse.png")
)

In [None]:
# F1 score of the validation predictions against the true labels
f1_score(y_true=ACTUALS, y_pred=PREDS)