In [1]:
import re

import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

import math

import nltk

from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict


In [2]:
from datasets import load_dataset

datasetz = load_dataset("carblacac/twitter-sentiment-analysis")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_dataset = datasetz['train']
data = pd.DataFrame(train_dataset)
data

Unnamed: 0,text,feeling
0,@fa6ami86 so happy that salman won. btw the 1...,0
1,@phantompoptart .......oops.... I guess I'm ki...,0
2,@bradleyjp decidedly undecided. Depends on the...,1
3,@Mountgrace lol i know! its so frustrating isn...,1
4,@kathystover Didn't go much of any where - Lif...,1
...,...,...
119983,I so should be in bed but I can't sleep,0
119984,@mickeymab mine's in my profile - '77cb550 and...,1
119985,@stacyreeves Awe... I wish I could. I am here...,0
119986,Is it me or is Vodafone UK business support ru...,0


In [4]:
def remove_tags(string):
    # Remove HTML tags
    result = re.sub(r'<.*?>', '', string)
    # Remove URLs
    result = re.sub(r'https?://\S+', '', result)
    # Remove Twitter usernames
    result = re.sub(r'@[^\s]+', '', result)
    # Remove non-alphanumeric characters (keeping only letters and spaces)
    result = re.sub(r'[^a-zA-Z\s]', ' ', result)
    # Convert to lowercase
    result = result.lower()
    return result


data['text']=data['text'].apply(lambda cw : remove_tags(cw)) 

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))


In [5]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
data['text'] = data.text.apply(lemmatize_text)

In [6]:
data

Unnamed: 0,text,feeling
0,happy salman btw sec clip truely teaser,0
1,oops guess kinda blonde moment blush epic fail,0
2,decidedly undecided depends situation people c...,1
3,lol know frustrating isnt,1
4,go much life took,1
...,...,...
119983,bed sleep,0
119984,mine profile cb hector bmw r photo fb check al...,1
119985,awe wish could weekend wedding crazy busy toni...,0
119986,vodafone uk business support rubbish report pr...,0


In [7]:
reviews = data['text'].values
labels = data['feeling'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)


In [8]:
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = vec.get_feature_names_out()
X = X.toarray()
word_counts = {}
for l in range(2):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]

In [9]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)

In [10]:
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data
def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors

In [11]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
    return result

In [12]:
labels = [0,1]
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))

Accuracy of prediction on test set :  0.7438743874387439


In [13]:
# Load the test dataset
test_dataset = datasetz['test']
test_data = pd.DataFrame(test_dataset)

# Apply the same text preprocessing steps to the test data
test_data['text'] = test_data['text'].apply(lambda cw: remove_tags(cw))
test_data['text'] = test_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test_data['text'] = test_data['text'].apply(lemmatize_text)

# Get test sentences and labels
test_reviews = test_data['text'].values
test_labels = encoder.transform(test_data['feeling'].values)

# Predict using the trained model
test_pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_reviews)

# Evaluate the accuracy on the test set
print("Accuracy of prediction on test set: ", accuracy_score(test_labels, test_pred))


Accuracy of prediction on test set:  0.7439594825639537


In [14]:
from sklearn.metrics import classification_report

# Evaluate the accuracy on the test set
accuracy = accuracy_score(test_labels, test_pred)
print("Naive Bayes Classifier:\n")
print(f" Test Set Accuracy: {accuracy}")

# Generate and print the classification report
report = classification_report(test_labels, test_pred, target_names=['Class 0', 'Class 1'])
print(" Classification Report:\n", report)


Naive Bayes Classifier:

 Test Set Accuracy: 0.7439594825639537
 Classification Report:
               precision    recall  f1-score   support

     Class 0       0.72      0.81      0.76     30969
     Class 1       0.78      0.68      0.73     31029

    accuracy                           0.74     61998
   macro avg       0.75      0.74      0.74     61998
weighted avg       0.75      0.74      0.74     61998

