# Libraries

In [53]:
import numpy as np 
import pandas as pd 
from statistics import *
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

from collections import defaultdict

import math

import re


# Import Training Data

In [138]:
import chardet

file_path = '../data/train.csv'
# Detect file encoding
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())
    detected_encoding = result['encoding']
    print(f"Detected encoding: {detected_encoding}")

# Read the CSV file using the detected encoding
try:
    df = pd.read_csv(file_path, encoding=detected_encoding)
    print("DataFrame read with detected encoding:")
    print(df.head())
except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError with detected encoding: {e}")

Detected encoding: Windows-1252
DataFrame read with detected encoding:
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  

In [139]:
#Drop columns with Nan text
df_train_text = df['text']
mask = df_train_text.apply(lambda x: isinstance(x, float))
rows_with_float_text = df[mask] 
indices = rows_with_float_text.index.tolist()
df_train = df.drop(indices)

# Importing Test Data

In [29]:
file_path = '../data/test.csv'
# Detect file encoding
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())
    detected_encoding = result['encoding']
    print(f"Detected encoding: {detected_encoding}")

# Read the CSV file using the detected encoding
try:
    df_test = pd.read_csv(file_path, encoding=detected_encoding)
    print("DataFrame read with detected encoding:")
    print(df_test.head())
except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError with detected encoding: {e}")

Detected encoding: Windows-1252
DataFrame read with detected encoding:
       textID                                               text sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh   neutral   
1  96d74cb729   Shanghai is also really exciting (precisely -...  positive   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...  negative   
3  01082688c6                                        happy bday!  positive   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!  positive   

  Time of Tweet Age of User      Country  Population -2020  Land Area (Km²)  \
0       morning        0-20  Afghanistan        38928346.0         652860.0   
1          noon       21-30      Albania         2877797.0          27400.0   
2         night       31-45      Algeria        43851044.0        2381740.0   
3       morning       46-60      Andorra           77265.0            470.0   
4          noon       60-70       Angola        32866272.0       

In [30]:
#Drop columns with Nan text
df_test_text = df_test['text']
mask = df_test_text.apply(lambda x: isinstance(x, float))
rows_with_float_text = df_test[mask] 
indices = rows_with_float_text.index.tolist()
df_test = df_test.drop(indices)

# Helpers

In [56]:
def split_data_by_sentiment(data, sentiment):
    return data[data['sentiment'] == sentiment]['text'].tolist()

# Assuming df is your DataFrame containing 'text' and 'sentiment' columns
positive_data = split_data_by_sentiment(df, 'positive')
negative_data = split_data_by_sentiment(df, 'negative')
neutral_data = split_data_by_sentiment(df, 'neutral')

In [60]:
def preprocess_tweet(tweet):
    
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = re.sub(r'#\w+', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r'\W+', ' ', tweet)
    
    # Convert the tweet to lowercase
    tweet = tweet.lower()
    
    # Remove punctuation from the tweet using translation
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize the tweet into individual words
    tokens = nltk.word_tokenize(tweet)
    
    # Initialize a Porter stemmer for word stemming
    stemmer = PorterStemmer()
    
    # Get a set of English stopwords from NLTK
    stopwords_set = set(stopwords.words("english"))
    
    # Apply stemming to each token and filter out stopwords
    tokens = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    
    # Return the preprocessed tokens
    return tokens

In [61]:
def calculate_word_counts(tweets):
    # Initialize a defaultdict to store word counts, defaulting to 0 for unseen words
    word_count = defaultdict(int)
    
    # Iterate through each tweet in the given list of tweets
    for tweet in tweets:
        # Tokenize and preprocess the tweet using the preprocess_tweet function
        tokens = preprocess_tweet(tweet)
        
        # Iterate through each token in the preprocessed tokens
        for token in tokens:
            # Increment the count for the current token in the word_count dictionary
            word_count[token] += 1
    
    # Return the word_count dictionary containing word frequencies
    return word_count

In [62]:
# Calculate word counts for tweets with positive sentiment
word_count_positive = calculate_word_counts(df_train[df_train['sentiment'] == 'positive']['text'])

# Calculate word counts for tweets with negative sentiment
word_count_negative = calculate_word_counts(df_train[df_train['sentiment'] == 'negative']['text'])

# Calculate word counts for tweets with neutral sentiment
word_count_neutral = calculate_word_counts(df_train[df_train['sentiment'] == 'neutral']['text'])

In [63]:
def calculate_likelihood(word_count, total_words, laplacian_smoothing=1):
    # Create an empty dictionary to store the likelihood values
    likelihood = {}
    
    # Get the number of unique words in the vocabulary
    vocabulary_size = len(word_count)

    # Iterate through each word and its corresponding count in the word_count dictionary
    for word, count in word_count.items():
        # Calculate the likelihood using Laplacian smoothing formula
        # Laplacian smoothing is used to handle unseen words in training data
        # The formula is (count + smoothing) / (total_words + smoothing * vocabulary_size)
        likelihood[word] = (count + laplacian_smoothing) / (total_words + laplacian_smoothing * vocabulary_size)

    # Return the calculated likelihood dictionary
    return likelihood

In [64]:
likelihood_positive = calculate_likelihood(word_count_positive, df.shape[0]-1, laplacian_smoothing=1)
likelihood_negative = calculate_likelihood(word_count_negative, df.shape[0]-1, laplacian_smoothing=1)
likelihood_neutral = calculate_likelihood(word_count_neutral, df.shape[0]-1, laplacian_smoothing=1)

In [65]:
likelihood_positive
sorted_dict_positive = dict(sorted(likelihood_positive.items(), key=lambda item: item[1], reverse=True))

In [66]:
likelihood_negative
sorted_dict_negative = dict(sorted(likelihood_negative.items(), key=lambda item: item[1], reverse=True))

In [67]:
likelihood_neutral
sorted_dict_neutral = dict(sorted(likelihood_neutral.items(), key=lambda item: item[1], reverse=True))

In [68]:
def calculate_log_prior(sentiment, data):
    # Calculate the natural logarithm of the ratio of tweets with the specified sentiment to the total number of tweets
    log_prior = math.log(len(data[data['sentiment'] == sentiment]) / len(data))
    
    # Return the calculated log prior
    return log_prior

# Calculate the log prior for tweets with positive sentiment
log_prior_positive = calculate_log_prior('positive', df)

# Calculate the log prior for tweets with negative sentiment
log_prior_negative = calculate_log_prior('negative', df)

# Calculate the log prior for tweets with neutral sentiment
log_prior_neutral = calculate_log_prior('neutral', df)

In [69]:
# Create a dictionary of log-likelihood values for positive sentiment
log_likelihood_positive = {word: math.log(prob) for word, prob in likelihood_positive.items()}

# Create a dictionary of log-likelihood values for negative sentiment
log_likelihood_negative = {word: math.log(prob) for word, prob in likelihood_negative.items()}

# Create a dictionary of log-likelihood values for neutral sentiment
log_likelihood_neutral = {word: math.log(prob) for word, prob in likelihood_neutral.items()}

In [70]:
def classify_tweet_with_scores(tweet, log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral,
                               log_prior_positive, log_prior_negative, log_prior_neutral):
    # Tokenize and preprocess the input tweet
    tokens = preprocess_tweet(tweet)

    # Calculate the log scores for each sentiment category
    log_score_positive = log_prior_positive + sum([log_likelihood_positive.get(token, 0) for token in tokens])
    log_score_negative = log_prior_negative + sum([log_likelihood_negative.get(token, 0) for token in tokens])
    log_score_neutral = log_prior_neutral + sum([log_likelihood_neutral.get(token, 0) for token in tokens])

    # Store the sentiment scores in a dictionary
    sentiment_scores = {
        'positive': log_score_positive,
        'negative': log_score_negative,
        'neutral': log_score_neutral
    }

    # Determine the predicted sentiment based on the highest sentiment score
    predicted_sentiment = max(sentiment_scores, key=sentiment_scores.get)
    
    # Return the predicted sentiment and the sentiment scores
    return predicted_sentiment #, sentiment_scores

# Classification on Test DataSet

In [71]:
loc_test = []
for tweet in df_test['text']:
    result = classify_tweet_with_scores(tweet,log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral,
                               log_prior_positive, log_prior_negative, log_prior_neutral)
    loc_test.append(result)

In [72]:
# Load Ground Truth
test_gt = df_test['sentiment'].tolist()

In [73]:
# Calculate the accuracy
correct_count = sum(1 for predicted, actual in zip(loc_test, test_gt) if predicted == actual)
accuracy = correct_count / len(loc_test)

print("Classification accuracy:", accuracy)

Classification accuracy: 0.5212224108658744


# Classification on Train DataSet

In [74]:
loc_train = []
for tweet in df_train['text']:
    result = classify_tweet_with_scores(tweet,log_likelihood_positive, log_likelihood_negative, log_likelihood_neutral,
                               log_prior_positive, log_prior_negative, log_prior_neutral)
    loc_train.append(result)

In [75]:
train_gt = df_train['sentiment'].tolist()

In [76]:
# Calculate the accuracy

correct_count = sum(1 for predicted, actual in zip(loc_train, train_gt) if predicted == actual)
accuracy = correct_count / len(loc_train)

print("Classification accuracy:", accuracy)

Classification accuracy: 0.3347161572052402


# Pre-built Naive Bayes

In [140]:
def cleaning(text):        
    # converting to lowercase, removing URL links, special characters, punctuations...
    text = text.lower() # converting to lowercase
    text = re.sub('https?://\S+|www\.\S+', '', text) # removing URL links
    text = re.sub(r"\b\d+\b", "", text) # removing number 
    text = re.sub('<.*?>+', '', text) # removing special characters, 
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # punctuations
    text = re.sub('\n', '', text)
    text = re.sub('[’“”…]', '', text)
   
    #removing emoji: 
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)   

   # removing short form: 
    
    text=re.sub("isn't",'is not',text)
    text=re.sub("he's",'he is',text)
    text=re.sub("wasn't",'was not',text)
    text=re.sub("there's",'there is',text)
    text=re.sub("couldn't",'could not',text)
    text=re.sub("won't",'will not',text)
    text=re.sub("they're",'they are',text)
    text=re.sub("she's",'she is',text)
    text=re.sub("There's",'there is',text)
    text=re.sub("wouldn't",'would not',text)
    text=re.sub("haven't",'have not',text)
    text=re.sub("That's",'That is',text)
    text=re.sub("you've",'you have',text)
    text=re.sub("He's",'He is',text)
    text=re.sub("what's",'what is',text)
    text=re.sub("weren't",'were not',text)
    text=re.sub("we're",'we are',text)
    text=re.sub("hasn't",'has not',text)
    text=re.sub("you'd",'you would',text)
    text=re.sub("shouldn't",'should not',text)
    text=re.sub("let's",'let us',text)
    text=re.sub("they've",'they have',text)
    text=re.sub("You'll",'You will',text)
    text=re.sub("i'm",'i am',text)
    text=re.sub("we've",'we have',text)
    text=re.sub("it's",'it is',text)
    text=re.sub("don't",'do not',text)
    text=re.sub("that´s",'that is',text)
    text=re.sub("I´m",'I am',text)
    text=re.sub("it’s",'it is',text)
    text=re.sub("she´s",'she is',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('I’m','I am',text)
    text=re.sub('I’d','I did',text)
    text=re.sub("he’s'",'he is',text)
    text=re.sub('there’s','there is',text)
    
     
    return text
    
dt_train_cleaned = df_train['text'].apply(cleaning)

In [141]:
dt_train_cleaned = pd.DataFrame(dt_train_cleaned)  
dt_train_cleaned['sentiment']=df_train['sentiment']

In [142]:
stop_words = stopwords.words()
dt_train_cleaned['no_sw'] = dt_train_cleaned['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [144]:
dt_train_cleaned['no_sw']

0                                             id responded
1                                  sooo sad miss san diego
2                                            boss bullying
3                                          interview leave
4                                      put releases bought
                               ...                        
27476                  wish denver husband lost job afford
27477    ive wondered rake client made clear net force ...
27478         yay enjoy break probably hectic weekend xxxx
27479                                                worth
27480                         flirting atg smiles yay hugs
Name: no_sw, Length: 27480, dtype: object

In [145]:
dt_train_cleaned.sentiment = [0 if each == "negative" else 1 if each == "positive" else 2 for each in df_train.sentiment]
tokenized_review=dt_train_cleaned['no_sw'].apply(lambda x: x.split())

In [147]:
dt_train_cleaned

Unnamed: 0,text,sentiment,no_sw
0,id have responded if i were going,2,id responded
1,sooo sad i will miss you here in san diego,0,sooo sad miss san diego
2,my boss is bullying me,0,boss bullying
3,what interview leave me alone,0,interview leave
4,sons of why couldnt they put them on the rel...,0,put releases bought
...,...,...,...
27476,wish we could come see u on denver husband l...,0,wish denver husband lost job afford
27477,ive wondered about rake to the client has ma...,0,ive wondered rake client made clear net force ...
27478,yay good for both of you enjoy the break you...,1,yay enjoy break probably hectic weekend xxxx
27479,but it was worth it,1,worth


In [146]:
tokenized_review

0                                          [id, responded]
1                            [sooo, sad, miss, san, diego]
2                                         [boss, bullying]
3                                       [interview, leave]
4                                  [put, releases, bought]
                               ...                        
27476           [wish, denver, husband, lost, job, afford]
27477    [ive, wondered, rake, client, made, clear, net...
27478    [yay, enjoy, break, probably, hectic, weekend,...
27479                                              [worth]
27480                   [flirting, atg, smiles, yay, hugs]
Name: no_sw, Length: 27480, dtype: object

In [149]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(dt_train_cleaned['no_sw'])



In [155]:
from sklearn.model_selection import train_test_split
X=text_counts
y=dt_train_cleaned['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=21)

In [156]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB = ComplementNB()
CNB.fit(X_train, y_train)

from sklearn import metrics
predicted = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))

ComplementNB model accuracy is 59.35%
------------------------------------------------
Confusion Matrix:
      0     1    2
0  1107   167  319
1   207  1231  259
2   702   580  924
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.69      0.61      1593
           1       0.62      0.73      0.67      1697
           2       0.62      0.42      0.50      2206

    accuracy                           0.59      5496
   macro avg       0.60      0.61      0.59      5496
weighted avg       0.60      0.59      0.58      5496

