# Data Import and Preprocessing

In [1]:
import numpy as np
import pandas as pd

1st row is header info <br>
Class labels: 1: positive, -1: negative, 0: neutral, 2: mixed

In [2]:
obama = pd.read_excel('./training-Obama-Romney-tweets.xlsx', usecols="B:E", sheet_name="Obama")
obama.drop(0, inplace=True)
obama.rename(columns={"Anootated tweet": "tweet", "Unnamed: 4": "class"}, inplace=True)

In [4]:
obama.drop(obama.loc[obama["tweet"].isna()].index, inplace=True)
obama.drop(obama.loc[obama["class"].isna()].index, inplace=True) 
obama.drop(obama.loc[obama["class"].isin(['irrelevant', 'irrevelant'])].index, inplace=True) 
obama['class'] = obama['class'].astype(int)
obama.drop_duplicates(inplace=True)
obama = obama.drop(obama.loc[obama["class"] == 2].index)

# Data Analysis

## Class Analysis

In [None]:
obama.loc[obama["class"].isna()]
set(obama["class"].tolist())
obama.loc[obama["class"].isin(['irrelevant', 'irrevelant'])].count()
obama.count()

Since we need to only predict 1, 0, -1 classes, we can drop the rest

In [4]:
obama.drop(obama.loc[obama["tweet"].isna()].index, inplace=True)
obama.drop(obama.loc[obama["class"].isna()].index, inplace=True) 
obama.drop(obama.loc[obama["class"].isin(['irrelevant', 'irrevelant'])].index, inplace=True) 
obama['class'] = obama['class'].astype(int)
obama.drop_duplicates(inplace=True)
obama = obama.drop(obama.loc[obama["class"] == 2].index) 

## Time Analysis

In [36]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
obama.drop(columns=['date', 'time'], inplace=True)

In [6]:
from datetime import datetime

In [30]:
def parse_time(value):
    try:
        # Try parsing ISO 8601 format (e.g., "10:28:53-05:00")
        return pd.to_datetime(value.strip(), format="%H:%M:%S%z", errors='coerce').time()
    except:
        # If it fails, try parsing the alternative format (e.g., "AM 11:9:13")
        try:
            return pd.to_datetime(value.strip(), format="%p %I:%M:%S", errors='coerce').time()
        except:
            return None  # Return None if parsing fails


In [31]:
obama['parsed_time'] = obama['time'].apply(parse_time)
obama = obama.dropna(subset=['parsed_time'])

In [33]:
def time_to_seconds(time_obj):
    return time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second

In [34]:
obama['time_in_seconds'] = obama['parsed_time'].apply(time_to_seconds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obama['time_in_seconds'] = obama['parsed_time'].apply(time_to_seconds)


In [35]:
obama

Unnamed: 0,date,time,tweet,class,parsed_time,time_in_seconds
1,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,10:28:53,37733
2,2016-12-10 00:00:00,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2,10:09:00,36540
3,2012-10-16 00:00:00,10:04:30-05:00,#<e>obama</e> debates that Cracker Ass Cracker...,1,10:04:30,36270
4,2012-10-16 00:00:00,10:00:36-05:00,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2,10:00:36,36036
5,2012-10-16 00:00:00,09:50:08-05:00,@Hollivan @hereistheanswer Youre missing the ...,0,09:50:08,35408
...,...,...,...,...,...,...
7194,10/17/2012,AM 11:7:09,The Reason <e>Ann Romney</e> And <e>Michelle ...,0,11:07:09,40029
7195,10/17/2012,AM 11:9:13,<e>Obama</e> Kenakan Cincin Syahadat Sejak SM...,0,11:09:13,40153
7196,10/17/2012,AM 11:11:34,"Bitches be like ""Obama<3"" bitches just want <...",0,11:11:34,40294
7197,10/17/2012,AM 11:13:16,<e>president</e> Barack <e>Obama</e> and Repu...,2,11:13:16,40396


In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(obama['time_in_seconds'], obama['class'], alpha=0.7)
plt.title("Time vs Class Distribution")
plt.xlabel("Time (seconds since midnight)")
plt.ylabel("Class")
plt.grid(True)

# Format the x-axis to display time in HH:MM format
plt.xticks(
    ticks=range(0, 86401, 3600),  # Seconds in a day with hourly intervals
    labels=[f"{h:02d}:00" for h in range(24)]
)
plt.show()

No coorelation can be extracted from the time component

## Tweet column Analysis

In [96]:
import re

In [99]:
def remove_html_tags(text):
    clean = re.sub(r'<.*?>', '', text)
    return clean

# Apply the function to the 'text' column
obama['tweet'] = obama['tweet'].apply(remove_html_tags)

In [100]:
obama

Unnamed: 0,date,time,tweet,class,parsed_time,time_in_seconds,processed,cleaned_tweet
1,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,10:28:53,37733,"[kirkpatrick, wore, baseball, cap, embroidered...","Kirkpatrick, who wore a baseball cap embroider..."
3,2012-10-16 00:00:00,10:04:30-05:00,#obama debates that Cracker Ass Cracker tonigh...,1,10:04:30,36270,"[e, obama, debates, cracker, ass, cracker, ton...",#obama debates that Cracker Ass Cracker tonigh...
5,2012-10-16 00:00:00,09:50:08-05:00,@Hollivan @hereistheanswer Youre missing the ...,0,09:50:08,35408,"[hollivan, hereistheanswer, youre, missing, po...",@Hollivan @hereistheanswer Youre missing the ...
7,2012-10-16 00:00:00,10:00:16-05:00,I was raised as a Democrat left the party yea...,-1,10:00:16,36016,"[raised, democrat, left, party, years, ago, 19...",I was raised as a Democrat left the party yea...
8,2012-10-16 00:00:00,09:48:07-05:00,The Obama camp can't afford to lower expectati...,0,09:48:07,35287,"[e, obama, camp, ca, afford, lower, expectatio...",The Obama camp can't afford to lower expectati...
...,...,...,...,...,...,...,...,...
7191,10/17/2012,AM 11:2:29,except for women who work in the WH (they mak...,0,11:02:29,39749,"[except, women, work, wh, make, 18, less, hone...",except for women who work in the WH (they mak...
7193,10/17/2012,AM 11:6:19,20 Days to Election & Selection. Elect Lewis...,1,11:06:19,39979,"[20, days, election, selection, elect, lewis, ...",20 Days to Election & Selection. Elect Lewis...
7194,10/17/2012,AM 11:7:09,The Reason Ann Romney And Michelle Obama Matc...,0,11:07:09,40029,"[reason, e, ann, romney, e, michelle, obama, m...",The Reason Ann Romney And Michelle Obama Matc...
7195,10/17/2012,AM 11:9:13,Obama Kenakan Cincin Syahadat Sejak SMA? http...,0,11:09:13,40153,"[e, obama, kenakan, cincin, syahadat, sejak, s...",Obama Kenakan Cincin Syahadat Sejak SMA? http...


Removing the html tags does not seem to impact the accuracy of the NBC

# Load Training Examples

In [50]:
# test = pd.read_excel('sample-testdata.xlsx', usecols="B:E", sheet_name="Obama")
test = pd.read_excel('./training-Obama-Romney-tweets.xlsx', usecols="B:E", sheet_name="Romney")

In [64]:
# test.drop(0, inplace=True)
# test.rename(columns={"Anootated tweet": "tweet", "Unnamed: 4": "class"}, inplace=True)
# test.drop(columns=['date', 'time'], inplace=True)
# test.drop(test.loc[test["tweet"].isna()].index, inplace=True)
# test.drop(test.loc[test["class"].isna()].index, inplace=True) 
# test.drop(test.loc[test["class"].isin(['irrelevant', 'irrevelant'])].index, inplace=True) 
# test['class'] = test['class'].astype(int)
# test.drop(test.loc[test["class"].isin(['!!!!', 'IR'])].index, inplace=True) 
# test.drop_duplicates(inplace=True)
test = test.drop(test.loc[test["class"] == 2].index)

In [101]:
test['tweet'] = test['tweet'].apply(remove_html_tags)

# Model Analysis

## NBC using NLTK

In [102]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify.util import accuracy

In [None]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [103]:
stop_words = set(stopwords.words('english'))

In [104]:
def preprocess(text):
    words = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    return [word for word in words if word.isalnum() and word not in stop_words] 

def extract_features(words):
    return {word: True for word in words}  # Map each word to True

In [105]:
obama['processed'] = obama['tweet'].apply(preprocess)
train_features = [
    (extract_features(row['processed']), row['class'])
    for _, row in obama.iterrows()
]

In [106]:
test['processed'] = test['tweet'].apply(preprocess)
test_features = [
    (extract_features(row['processed']), row['class'])
    for _, row in test.iterrows()
]

In [107]:
classifier = NaiveBayesClassifier.train(train_features)

# Evaluate accuracy
print("Accuracy:", accuracy(classifier, test_features))

# Classify new tweet
# new_tweet = "This service makes me so happy!"
# processed_tweet = preprocess(new_tweet)
# print("Prediction:", classifier.classify(extract_features(processed_tweet)))

Accuracy: 0.4453900709219858


## NBC with NLTK using Lidstone smoothing

In [108]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.probability import LidstoneProbDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify.util import accuracy

In [109]:
stop_words = set(stopwords.words('english'))

def preprocess(text):
    words = word_tokenize(text.lower())
    return [word for word in words if word.isalnum() and word not in stop_words]

def extract_features(words):
    return {word: True for word in words}

def train_with_lidstone(train_data, lidstone_lambda=0.1):
    # Function to return Lidstone probability distribution
    def lidstone_pdist(freqdist, bins):
        return LidstoneProbDist(freqdist, lidstone_lambda, bins)

    return NaiveBayesClassifier.train(train_data, estimator=lidstone_pdist)

In [110]:
obama['processed'] = obama['tweet'].apply(preprocess)
train_features = [
    (extract_features(row['processed']), row['class'])
    for _, row in obama.iterrows()
]

test['processed'] = test['tweet'].apply(preprocess)
test_features = [
    (extract_features(row['processed']), row['class'])
    for _, row in test.iterrows()
]

In [None]:
classifier = train_with_lidstone(obama, lidstone_lambda=0.1)

# Evaluate accuracy
print("Accuracy:", accuracy(classifier, test))

Above code did not work, trying NBC lidstone smoothing with scikit learn

In [113]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [114]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(obama['tweet'])  # Convert text to feature vectors
y = obama['class']  # Labels

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [116]:
alpha = 0.1  # Lidstone smoothing equivalent
classifier = MultinomialNB(alpha=alpha)
classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.57


In [117]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

          -1       0.59      0.58      0.58       416
           0       0.56      0.42      0.48       501
           1       0.55      0.72      0.63       433

    accuracy                           0.57      1350
   macro avg       0.57      0.57      0.56      1350
weighted avg       0.57      0.57      0.56      1350



Trying different lidstone constants

In [128]:
alpha = 1.9 # Lidstone smoothing equivalent
classifier = MultinomialNB(alpha=alpha)
classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.60

Classification Report:
              precision    recall  f1-score   support

          -1       0.59      0.58      0.58       416
           0       0.58      0.56      0.57       501
           1       0.62      0.65      0.64       433

    accuracy                           0.60      1350
   macro avg       0.60      0.60      0.60      1350
weighted avg       0.59      0.60      0.60      1350

