In [1]:
# data processing
import string

# plotting
import matplotlib.pyplot as plt
import numpy as np
# natural language processing
import nltk
import pandas as pd
import seaborn as sns
from funcsigs import signature
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (auc, average_precision_score,
                             precision_recall_curve, roc_curve)
from sklearn.model_selection import train_test_split

# add the system-wide directory to the list of places NLTK searches
# for its corpora and models
nltk.data.path.append("/usr/share/nltk_data/")

# apply the "seaborn-v0_8" style sheet to matplotlib
plt.style.use("seaborn-v0_8")

In [2]:
# load the combined sentiment dataset from CSV (first row is the header);
# lines pandas considers bad are skipped rather than raising an error
df = pd.read_csv(
    "./data/combined_sentiments.csv",
    sep=",",
    header=0,
    on_bad_lines="skip",
)

# lemmatise


def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet
    constant, for use by the lemmatiser.

    Tags starting with "J", "V", "N" or "R" map to adjective, verb,
    noun and adverb respectively; any other tag is treated as a noun."""
    prefix_to_pos = (
        ("J", wordnet.ADJ),
        ("V", wordnet.VERB),
        ("N", wordnet.NOUN),
        ("R", wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # unrecognised tags fall back to noun
    return wordnet.NOUN


# check whether there is a digit or not


def check_digits(text):
    """Report whether at least one character
    of the given text is a digit."""
    for char in text:
        if char.isdigit():
            return True
    return False


# tokenise


def clean_review(review):
    """Normalise one review into a cleaned, lemmatised string.

    The text is lower-cased, split on whitespace and stripped of
    surrounding punctuation. Tokens that are empty, contain a digit or
    are English stop words are dropped. The remaining tokens are
    lemmatised according to their part-of-speech tag, one-character
    results are discarded, and the rest is joined with single spaces.

    Parameters
    ----------
    review : object
        The raw review. Non-string values are converted with ``str``;
        a missing value (``None`` or a float NaN) is treated as empty text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or ``""`` when the
        review is missing or nothing survives the filtering.
    """
    # a missing review must not turn into the literal token "none"/"nan"
    # (NaN is the only float that is not equal to itself)
    if review is None or (isinstance(review, float) and review != review):
        return ""

    # split() with no argument breaks on any run of whitespace, so tabs
    # and newlines separate words just like spaces do
    tokens = [word.strip(string.punctuation)
              for word in str(review).lower().split()]

    # a set keeps the stop-word membership test cheap for every token
    stop = set(stopwords.words("english"))
    tokens = [
        token for token in tokens
        if token and not check_digits(token) and token not in stop
    ]

    # tag each token with its part of speech (pos), then lemmatise with
    # one shared lemmatiser rather than building a new one per token
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # remove words with only one letter
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)
# print(type(clean_review("Housekeeper kept our rooms clean. Skyline studios very spacious & modern. Lovely big bathroom with well stocked amenities. Poolside seating & Olympic-sized pool was enjoyable.")))
# print(clean_review("Housekeeper kept our rooms clean. Skyline studios very spacious & modern. Lovely big bathroom with well stocked amenities. Poolside seating & Olympic-sized pool was enjoyable."))

# add a "reviews.clean" column: each entry of "reviews.text" passed through
# clean_review, i.e. the cleaned, lemmatised words joined into one string
df["reviews.clean"] = df["reviews.text"].apply(clean_review)

In [None]:
# generate a frequency dictionary
# def build_freqs(reviews, ys):