In [28]:
# data processing
import string

# plotting
import matplotlib.pyplot as plt
import numpy as np
# natural language processing
import nltk
import pandas as pd
import seaborn as sns
from funcsigs import signature
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (auc, average_precision_score,
                             precision_recall_curve, roc_curve)
from sklearn.model_selection import train_test_split

nltk.data.path.append("/usr/share/nltk_data/")

# matplotlib things
plt.style.use("seaborn-v0_8")

In [29]:
# import the data
df = pd.read_csv("./data/combined_sentiments.csv",
                 header=0,
                 sep=",",
                 on_bad_lines="skip")

# lemmatise


def get_wordnet_pos(tag):
    """identify each word by its part of speech
    and return that part of speech, for lemmatisation."""
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("N"):
        return wordnet.NOUN
    if tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN


# check whether there is a digit or not


def check_digits(text):
    """check whether a piece of text
    contains numerical digits."""
    return any(i.isdigit() for i in text)


# tokenise


def clean_review(review):
    """removes stop words from each review,
    then tokensises them."""
    review = str(review)
    review = review.lower() # turn into lowercase
    review = [word.strip(string.punctuation)
              for word in review.split(" ")] # remove punctuation
    # remove digits
    review = [word for word in review if not check_digits(word)]

    # remove stop words
    stop = stopwords.words("english")
    review = [token for token in review if token not in stop]
    # remove empty tokens
    review = [token for token in review if len(token) > 0]

    # tag each token with its part of speech (pos)
    pos_tags = pos_tag(review)
    review = [
        WordNetLemmatizer().lemmatize(tag[0], get_wordnet_pos(tag[1]))
        for tag in pos_tags
    ]

    # remove words with only one letter
    review = [token for token in review if len(token) > 1]
    review = " ".join(review)
    return review
# print(type(clean_review("Housekeeper kept our rooms clean. Skyline studios very spacious & modern. Lovely big bathroom with well stocked amenities. Poolside seating & Olympic-sized pool was enjoyable.")))
# print(clean_review("Housekeeper kept our rooms clean. Skyline studios very spacious & modern. Lovely big bathroom with well stocked amenities. Poolside seating & Olympic-sized pool was enjoyable."))

# generate a cleaned, tokenised and lemmatised version of the reviews
df["reviews.clean"] = df["reviews.text"].apply(clean_review)

In [30]:
# generate a frequency dictionary
def build_freqs():
	'''
	takes reviews and their polarities as input,
	go through every tweet, preprocess them, count
	the occurrence of every word in the data set and
	create a frequency dictionary.
	'''
	reviews = df['reviews.clean'].tolist()
	polarities = df['sent.net'].tolist()
	# print(reviews[:7], polarities[:7])
	freqs = {}
	for review, polarity in zip(reviews, polarities):
		for word in review.split():
			pair = (word, polarity)
			freqs[pair] = freqs.get(pair, 0) + 1
	return freqs

# print(build_freqs())

In [31]:
def sigmoid(z):
	'''
	calculate the sigmoid of z.
	'''
	h = 1 / (1 + np.exp(-z))
	return h

def gradient_descent(x, y, theta, alpha, iters):
	'''
	input:
		x: matrix of features, dimensions m by n + 1
		y: corresponding labels of the input matrix x
		theta: weight vector of dimensions n + 1 by 1
		alpha: learning rate
		iters: number of training iterations on the model
	output:
		j: final cost	
		theta: final weight vector
	'''
	m = len(x)
	for _ in range(0, iters):
		z = np.dot(x, theta)
		h = sigmoid(z)
		j = (-1 / m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))
		theta = theta - (alpha / m) * np.dot(x.T, h - y)
		print(j)
	j = float(j)
	return j, theta

In [32]:
def extract_features(review, freqs):
	x = np.zeros((1, 3))
	# set bias value to 1
	x[0, 0] = 1
	for word in review.split():
		# increment the word count for the positive label 1
		x[0, 1] += freqs.get((word, 1), 0)
		# increment the word count for the negative label 0
		x[0, 2] += freqs.get((word, 0), 0)
	assert(x.shape == (1,3))
	return x

In [33]:
# do a 70-30 train-test split
train_pos = df[df['sent.net']==1]['reviews.clean'].tolist()[:7001]
# print((test_pos))
test_pos = df[df['sent.net']==1]['reviews.clean'].tolist()[7001:]
train_neg = df[df['sent.net']==-1]['reviews.clean'].tolist()[7001:]
test_neg = df[df['sent.net']==-1]['reviews.clean'].tolist()[:7001]
train_x = train_pos + train_neg
test_x = test_pos + test_neg
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [34]:
# collect the features x and stack them into a matrix X
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], build_freqs())

# training labels corresponding to X
Y = train_y

# apply gradient descent
j, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {j:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

[[0.69314718]]
[[0.66369683]]
[[0.63677128]]
[[0.61212213]]
[[0.5895107]]
[[0.56871654]]
[[0.5495413]]
[[0.53180937]]
[[0.51536663]]
[[0.50007841]]
[[0.4858271]]
[[0.47250993]]
[[0.4600369]]
[[0.448329]]
[[0.43731666]]
[[0.42693842]]
[[0.41713984]]
[[0.40787251]]
[[0.39909328]]
[[0.39076354]]
[[0.38284869]]
[[0.37531759]]
[[0.36814216]]
[[0.36129702]]
[[0.35475917]]
[[0.3485077]]
[[0.34252357]]
[[0.33678944]]
[[0.33128941]]
[[0.32600893]]
[[0.32093466]]
[[0.3160543]]
[[0.31135655]]
[[0.30683096]]
[[0.30246788]]
[[0.29825838]]
[[0.29419418]]
[[0.29026758]]
[[0.28647143]]
[[0.28279909]]
[[0.27924433]]
[[0.27580138]]
[[0.27246481]]
[[0.26922957]]
[[0.26609091]]
[[0.26304438]]
[[0.26008582]]
[[0.25721129]]
[[0.25441711]]
[[0.25169981]]
[[0.24905612]]
[[0.24648294]]
[[0.24397736]]
[[0.24153664]]
[[0.23915817]]
[[0.23683948]]
[[0.23457824]]
[[0.23237225]]
[[0.2302194]]
[[0.22811771]]
[[0.22606528]]
[[0.22406031]]
[[0.22210111]]
[[0.22018604]]
[[0.21831355]]
[[0.21648217]]
[[0.21469049]]
[[0.

  j = (-1 / m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))


[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]
[[nan]]


In [35]:
def predict_review(review, freqs, theta):
    '''
    input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    output: 
        y_pred: the probability of a tweet being positive or negative
    '''
    
    # extract the features of the tweet and store it into x
    x = extract_features(review, freqs)
    
    # make the prediction using x and theta
    z = np.dot(x, theta)
    y_pred = sigmoid(z)
    return y_pred


def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    input: 
        test_x: a list of reviews
        test_y: (m, 1) vector with the corresponding labels for the list of reviews
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    output: 
        accuracy: (# of tweets classified correctly) / (total # of reviews)
    """
        
    # the list for storing predictions
    y_hat = []
    
    for review in test_x:
        # get the label prediction for the tweet
        y_pred = predict_review(review, freqs, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)
		# With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    y_hat = np.array(y_hat)
    test_y = test_y.reshape(-1)
    accuracy = np.sum((test_y == y_hat).astype(int))/len(test_x)
    
    return accuracy

print("accuracy of model: %" % test_logistic_regression(test_x, test_y, build_freqs(), theta))

ValueError: incomplete format