In [9]:
import re
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import math
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from collections import defaultdict

# Load the labelled phrases (tab-separated file).
data = pd.read_csv('train.tsv', sep='\t')
# NOTE(review): this frame is read from test.tsv although it is named
# `data_train`; it is not referenced again in the cells shown here.
data_train = pd.read_csv('test.tsv', sep='\t')
data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [19]:
# data preprocessing 

def remove_tags(string):
    """Normalise a raw phrase for bag-of-words modelling.

    Steps, in order: strip HTML-style tags, drop URLs, replace every
    character that is neither a word character nor whitespace with a space,
    and lower-case the result.

    Parameters
    ----------
    string : str
        Raw text.

    Returns
    -------
    str
        Cleaned, lower-cased text. Runs of spaces left behind by removed
        tokens are not collapsed.
    """
    result = re.sub(r'<.*?>', '', string)        # remove HTML tags
    # Match only the URL token itself, for both http and https. The earlier
    # pattern 'https://.*' skipped plain http links and, because '.*' is
    # greedy, also deleted every word that followed the link.
    result = re.sub(r'https?://\S+', '', result)
    result = re.sub(r'[^\w\s]', ' ', result)     # punctuation -> space
    return result.lower()

# Clean every phrase first, then drop English stop words.
# NOTE(review): NLTK's English stop-word list contains negations such as
# "not" and "no"; removing them may discard sentiment cues -- confirm intended.
data['Phrase'] = data['Phrase'].apply(remove_tags)
stop_words = set(stopwords.words('english'))
data['Phrase'] = data['Phrase'].apply(
    lambda phrase: ' '.join(token for token in phrase.split() if token not in stop_words)
)


In [23]:
# Lemmatization: map each token to its dictionary (root) form.

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Return `text` with every whitespace-separated token lemmatized.

    Each lemma is followed by a single space, so a non-empty result ends
    with a trailing space and a blank input yields "". `lemmatize` is
    called without a part-of-speech argument.
    """
    return "".join(
        f"{lemmatizer.lemmatize(token)} " for token in w_tokenizer.tokenize(text)
    )


data['Phrase'] = data['Phrase'].apply(lemmatize_text)

data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,series escapade demonstrating adage good goose...,1
1,2,1,series escapade demonstrating adage good goose,2
2,3,1,series,2
3,4,1,,2
4,5,1,series,2


In [27]:
# Class distribution of the sentiment labels. The counts printed below show
# that label 2 dominates, so plain accuracy should be read with that in mind.
print(data['Sentiment'].value_counts())

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64


In [41]:
# Feature extraction

# Hold out part of the labelled data for evaluation. A fixed random_state
# makes the split -- and every number derived from it -- reproducible.
phrases = data['Phrase'].values
sentiment_score = data['Sentiment'].values
train_phrases, test_phrases, train_labels, test_labels = train_test_split(
    phrases, sentiment_score, random_state=42
)

# Bag-of-words counts restricted to at most 3000 vocabulary terms.
vector = CountVectorizer(max_features=3000)
X = vector.fit_transform(train_phrases)  # sparse matrix: one row per training phrase
vocab = vector.get_feature_names_out()

# Sentiment classes handled by the classifier.
labels = [0, 1, 2, 3, 4]

# word_counts[label][word] -> total occurrences of `word` in the training
# phrases carrying `label`. The totals come from one sparse column-sum per
# class, which avoids densifying X and a Python loop over every
# (phrase, term) cell. X is left sparse; call X.toarray() if a dense copy
# is really needed.
word_counts = {}
for i in labels:
    class_rows = np.flatnonzero(train_labels == i)
    class_totals = np.asarray(X[class_rows].sum(axis=0)).ravel()
    # defaultdict so that words outside the vocabulary read as 0.
    word_counts[i] = defaultdict(lambda: 0, zip(vocab, class_totals))


In [67]:
# Laplace (add-one) smoothing: a word with a zero count for some label still
# gets a non-zero likelihood instead of log(0).

def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    """Return the smoothed log-likelihood term of `word` under `text_label`.

    Computes log((count + 1) / (n_label_items[text_label] + len(vocab)))
    where `count` is word_counts[text_label][word].

    NOTE(review): `fit` fills n_label_items with the number of training
    samples per label, not the total number of word occurrences per label,
    so this ratio is not a normalised word probability -- confirm intended.
    """
    smoothed_count = word_counts[text_label][word] + 1
    smoothed_total = n_label_items[text_label] + len(vocab)
    return math.log(smoothed_count / smoothed_total)

# define fit and predict functions for the classifier 

def group_by_label(x, y, labels):
    """Partition the samples in `x` by their label in `y`.

    Returns a defaultdict(list) mapping each entry of `labels` to the
    elements of `x` at the positions where `y` equals that label. Relies on
    NumPy semantics: elementwise `y == label` and index-array selection.
    """
    grouped = defaultdict(list)
    grouped.update((label, x[np.where(y == label)]) for label in labels)
    return grouped

def fit(x, y, labels):
    """Estimate per-class sample counts and log priors.

    Parameters
    ----------
    x : sequence
        Training samples; only its length is used.
    y : array-like
        Label of each training sample, aligned with `x`.
    labels : iterable
        Labels to produce estimates for.

    Returns
    -------
    n_label_items : dict
        label -> number of training samples carrying that label.
    log_label_priors : dict
        label -> log(n_label_items[label] / len(x)), or -inf for a label
        with no training samples.
    """
    y = np.asarray(y)
    n = len(x)
    n_label_items = {}
    log_label_priors = {}
    for l in labels:
        # Only the class size is needed, so count matches directly rather
        # than materialising a per-class slice of x.
        count = int(np.count_nonzero(y == l))
        n_label_items[l] = count
        # A label absent from the training data gets probability 0
        # (log prior -inf) instead of crashing on math.log(0); the same
        # guard keeps an empty training set from dividing by zero.
        log_label_priors[l] = math.log(count / n) if count else -math.inf
    return n_label_items, log_label_priors

def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    """Predict a label for every text in `x`.

    For each text the score of a label starts at its log prior and gains one
    smoothed log-likelihood term per distinct in-vocabulary word (repeated
    words in a text count once). The label with the highest score wins; on
    ties the label that comes first in `labels` is returned.

    Parameters
    ----------
    n_label_items : dict
        label -> class size, forwarded to `laplace_smoothing`.
    vocab : iterable of str
        Known words; words outside it are ignored.
    word_counts : dict
        label -> mapping of word -> count, forwarded to `laplace_smoothing`.
    log_label_priors : dict
        label -> log prior.
    labels : sequence
        Candidate labels.
    x : iterable of str
        Texts to classify.

    Returns
    -------
    list
        One predicted label per text, in input order.
    """
    # Build the membership set once: `word in vocab` on a NumPy array is a
    # linear scan, which was being repeated for every word of every text.
    known_words = set(vocab)
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        for word in set(w_tokenizer.tokenize(text)):
            if word not in known_words:
                continue
            for l in labels:
                label_scores[l] += laplace_smoothing(
                    n_label_items, vocab, word_counts, word, l
                )
        result.append(max(label_scores, key=label_scores.get))
    return result

In [69]:
# Fit the model on the training data.
# `labels` is defined here so this cell does not rely on leftover kernel state.
labels = [0, 1, 2, 3, 4]
n_label_items, log_label_priors = fit(train_phrases, train_labels, labels)

# Bag-of-words counts for the training phrases (at most 3000 terms).
vector = CountVectorizer(max_features=3000)
X_train = vector.fit_transform(train_phrases)  # kept sparse
vocab = vector.get_feature_names_out()  # Update vocab for training set

# Per-class word totals: word_counts[label][word] -> occurrences of `word`
# in training phrases with that label. One sparse column-sum per class
# replaces densifying the matrix and looping over every (phrase, term) cell.
word_counts = {}
for sentiment_class in labels:
    class_rows = np.flatnonzero(train_labels == sentiment_class)
    class_totals = np.asarray(X_train[class_rows].sum(axis=0)).ravel()
    # defaultdict accounts for unseen words (they read as 0).
    word_counts[sentiment_class] = defaultdict(lambda: 0, zip(vocab, class_totals))

# Predict on the test set
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_phrases)

# Calculate accuracy
print("Accuracy of prediction on test set: ", accuracy_score(test_labels, pred))


Accuracy of prediction on test set:  0.543842112008202
