In [3]:
# Parts of this code, such as the gradient descent algorithm and the feature extraction, were taken from the following website:
'''
***************************************************************************************/
*    Title: Sentiment Analysis of Twitter’s US Airlines Data using KNN Classifications
*    Author: Atharva Mashalkar
*    Date: 2020
*    Availability: https://towardsdatascience.com/sentiment-analysis-using-logistic-regression-and-naive-bayes-16b806eb4c4b
*
***************************************************************************************/
'''
import string
import csv
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import numpy as np
from sklearn import metrics
from matplotlib import pyplot as plt
!wget https://raw.githubusercontent.com/lee1613/Sentiment-Analysis-NLP-/main/dataset.csv

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


--2023-04-01 12:42:41--  https://raw.githubusercontent.com/lee1613/Sentiment-Analysis-NLP-/main/dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1273144 (1.2M) [text/plain]
Saving to: ‘dataset.csv.1’


2023-04-01 12:42:41 (22.7 MB/s) - ‘dataset.csv.1’ saved [1273144/1273144]



In [4]:
#loading of dataset here
# Every CSV row is read as "<label>,<sentence>": column 0 is the integer
# sentiment label, column 1 is the raw sentence text.
labels = []
comments = []
# newline="" is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly; the encoding is pinned so the result does
# not depend on the platform default.
with open("dataset.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",")
    for line in reader:
        if not line:
            continue  # skip blank rows instead of failing on line[0]
        # append (rather than +=) keeps a multi-character label such as "10"
        # as one entry; += would extend the list character by character and
        # desynchronise labels from comments.
        labels.append(line[0])
        comments.append(line[1])

comments = np.array(comments) #sentences
labels = np.array(labels).astype(int) #labels for each sentence
print(comments.size, labels.size)
print(comments[21])

10662 10662
with a cast that includes some of the top actors working in independent film , lovely & amazing involves us because it is so incisive , so bleakly amusing about how we go about our lives .


In [5]:
#preprocessing
# The stopword list is bound to a new name so that the imported `stopwords`
# corpus object is not overwritten; overwriting it makes this cell fail with
# an AttributeError the second time it is run.
stop_words = set(stopwords.words('english'))
special_chars = ["!",'"',"%","&","'","(",")", "*","+",",","-",".",
                  "/",":",";","<","=",">","?","[","\\","]","^","_",
                  "`","{","|","}","~","–","@","#","$"]
# Single set of tokens to drop, giving constant-time membership checks.
ignored_tokens = stop_words.union(special_chars)

# For every sentence: lowercase, split on whitespace, drop stopwords and
# standalone special-character tokens, then re-join with single spaces.
# A fresh array is built instead of assigning into `comments` element by
# element, because a fixed-width NumPy string array silently truncates any
# value longer than its item size.
comments = np.array([
    ' '.join(word for word in sen.lower().split() if word not in ignored_tokens)
    for sen in comments
])
print(comments)

[" rock destined 21st century's new conan he's going make splash even greater arnold schwarzenegger jean-claud van damme steven segal"
 " gorgeously elaborate continuation lord rings trilogy huge column words cannot adequately describe co-writer/director peter jackson's expanded vision j r r tolkien's middle-earth"
 ' effective too-tepid biopic' ...
 " stands crocodile hunter hurried badly cobbled look 1959 godzilla combined scenes japanese monster flick canned shots raymond burr commenting monster's path destruction"
 ' thing looks like made-for-home-video quickie'
 ' enigma well-made dry placid']


In [6]:
#sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters:
        z: a scalar or array-like of real numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        same shape as ``z``; every value lies in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp is only ever taken of a non-positive number, so it cannot overflow;
    # the naive form overflows in exp(-z) once z is a large negative value.
    e = np.exp(-np.abs(z))
    # For z >= 0 this is 1/(1+exp(-z)); for z < 0 it is the algebraically
    # identical exp(z)/(1+exp(z)).
    h = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional results as arrays.
    return h[()]

In [7]:
#counts how often each word appears under each label
def get_occurrence(sentences, label):
    """Build a dictionary mapping (word, label) pairs to occurrence counts.

    Parameters:
        sentences: iterable of strings; each is split on whitespace.
        label: iterable of labels, paired positionally with ``sentences``.

    Returns:
        dict whose keys are ``(word, label)`` tuples and whose values are the
        number of times that word was seen in sentences carrying that label.
    """
    counts = {}
    for sentiment, text in zip(label, sentences):
        for token in text.split():
            key = (token, sentiment)
            if key in counts:
                counts[key] += 1
            else:
                counts[key] = 1
    return counts

In [8]:
#gradient descent algorithm for logistic regression
def gradDescent(x, y, theta, learning_rate, iter):
  """Fit logistic-regression weights with batch gradient descent.

  Parameters:
      x: feature matrix, one row per sample (the notebook passes (m, 3)).
      y: column vector of 0/1 labels (the notebook passes (m, 1)).
      theta: initial weight column vector (the notebook passes (3, 1)).
      learning_rate: step size applied to the averaged gradient.
      iter: number of update steps; 0 returns ``theta`` unchanged.

  Returns:
      (J, theta): J is the cross-entropy cost, as a Python float, evaluated
      on the returned weights; theta is the updated weight vector.
  """
  l = len(x)
  for _ in range(iter):
    h = sigmoid(np.dot(x, theta))
    theta = theta - (learning_rate/l)*np.dot(x.T, h-y) #changing of weights

  # The cost is evaluated once, after the last update, so that it describes
  # the weights actually returned and is defined even when iter == 0.
  # Clipping keeps log() finite if the sigmoid saturates to exactly 0 or 1.
  eps = np.finfo(float).eps
  h = np.clip(sigmoid(np.dot(x, theta)), eps, 1 - eps)
  J = (-1/l)*(np.dot(y.T,np.log(h)) + np.dot((1-y).T,np.log(1-h))) #cost function
  # J has shape (1, 1) for column-vector inputs; squeeze it to 0-d first
  # because float() on an array with ndim > 0 is deprecated in NumPy >= 1.25.
  return float(np.squeeze(J)), theta

In [9]:
#builds the logistic-regression feature matrix from per-label word counts
def get_params(sentences, freqs):
    """Turn each sentence into a 3-value feature row.

    Parameters:
        sentences: sequence of strings; each is split on whitespace.
        freqs: dict mapping ``(word, label)`` to a count, with label 1 used
            for the positive total and label 0 for the negative total.

    Returns:
        Array of shape ``(len(sentences), 3)`` whose columns are
        ``[1 (bias), sum of label-1 counts, sum of label-0 counts]``.
    """
    X = np.zeros((len(sentences), 3))
    for row, sentence in enumerate(sentences):
        tokens = sentence.split()
        X[row, 0] = 1  # bias term
        # gauge of how positive the sentence is
        X[row, 1] = sum(freqs.get((token, 1), 0) for token in tokens)
        # gauge of how negative the sentence is
        X[row, 2] = sum(freqs.get((token, 0), 0) for token in tokens)
    return X
freq = get_occurrence(comments, labels)
# Show the number of (word, label) pairs and a few entries; printing the
# whole dictionary floods the cell output with every pair in the corpus.
print(len(freq), list(freq.items())[:5])
features = get_params(comments, freq)
print(features, features.shape)

[[  1. 568. 575.]
 [  1. 128.  85.]
 [  1.  28.  12.]
 ...
 [  1. 248. 254.]
 [  1. 342. 569.]
 [  1.  19.  26.]] (10662, 3)


In [29]:
#splitting of dataset
n = features.shape[0]
k = .8 #fraction of the rows used for training; the remainder is the test set
training_n = int(n*k) #driven by k so the split ratio is defined in one place
testing_n = n - training_n
list_of_indices = np.arange(0,n)
np.random.seed(0) #fixed seed so every run produces the same shuffle; comment out to check model consistency across splits
np.random.shuffle(list_of_indices) #randomly shuffles all row indices in place
training_indices = list_of_indices[:training_n]
testing_indices = list_of_indices[training_n:]
#index-array selection copies the chosen rows, same result as picking them one by one
train_x = features[training_indices]
test_x = features[testing_indices]
#labels are reshaped to column vectors so they line up with the (rows, 1) sigmoid output
train_y = labels[training_indices].reshape(-1, 1)
test_y = labels[testing_indices].reshape(-1, 1)
print(train_x.shape,train_y.shape,test_x.shape, test_y.shape)
print(train_y.ndim)

(8529, 3) (8529, 1) (2133, 3) (2133, 1)
2


In [30]:
#train the logistic regression, starting from an all-zero weight vector
J, theta = gradDescent(
    train_x,
    train_y,
    np.zeros((3, 1)),
    learning_rate=1e-9,
    iter=1500,
)
print(f"Final cost:{J:.8f}.")
print(f"Final vector weights = {[round(weight, 8) for weight in np.squeeze(theta)]}")

Final cost:0.69251149.
Final vector weights = [-0.0, 2.058e-05, -2.304e-05]


In [31]:
#scores a single feature row
def predict_y(x, freqs, theta):
    """Return the sigmoid of the dot product of ``x`` and ``theta``.

    Parameters:
        x: feature row (or matrix) to score.
        freqs: not used by this function; kept in the signature for callers.
        theta: weight vector.
    """
    logit = np.dot(x, theta)
    return sigmoid(logit)
#scores every test row, thresholds at 0.5 and reports the classification metrics
def test_logistic_regression(test_x, test_y, freqs, theta):
    """Print the confusion matrix, accuracy, F1, precision and recall.

    Parameters:
        test_x: iterable of feature rows, each passed to ``predict_y``.
        test_y: array of true labels; flattened to 1-D before scoring.
        freqs: forwarded unchanged to ``predict_y``.
        theta: weight vector forwarded to ``predict_y``.

    Returns:
        None; all results are printed.
    """
    # a row is labelled 1 only when its sigmoid output is strictly above 0.5
    y_predicted = np.array(
        [1 if predict_y(row, freqs, theta) > 0.5 else 0 for row in test_x]
    )
    y_true = test_y.reshape(-1)

    #confusion matrix for the test data
    print(metrics.confusion_matrix(y_true, y_predicted))

    #metrics for the test data
    accuracy = metrics.accuracy_score(y_true, y_predicted)
    print(f"Accuracy = {accuracy*100:.2f}%")
    f1 = metrics.f1_score(y_true, y_predicted)
    print(f"F1 score = {f1*100:.2f}%")
    precision = metrics.precision_score(y_true, y_predicted)
    print(f"Precision = {precision*100:.2f}%")
    recall = metrics.recall_score(y_true, y_predicted)
    print(f"Recall = {recall*100:.2f}%")
#evaluate the trained weights on the held-out test split
test_logistic_regression(test_x=test_x, test_y=test_y, freqs=freq, theta=theta)

[[847 196]
 [414 676]]
Accuracy = 71.40%
F1 score = 68.91%
Precision = 77.52%
Recall = 62.02%
