# Dataset preprocessing

## Training dataset

#### Importing necessary libraries

In [1]:
import pandas as pd
import re
import copy
import time
from deep_translator import GoogleTranslator

#### Reading the training dataset

In [2]:
df = pd.read_excel('training.xlsx')
df.head()

Unnamed: 0,userGender,category,comment
0,MALE,2,Balita nyu na si liza marcos at mga ksma nya s...
1,FEMALE,2,babalik ka rin... babalik at babalik karin💪👊👊👊
2,MALE,1,Bi Tay ✈️ (i mean bye Tay👋🏼)
3,MALE,2,Justice system ng pilipinas ay palpak kaya nam...
4,MALE,2,If only the victims of martial law had the sam...


#### Removing emojis and translating the comments to english

In [3]:
# Emoji removal regex (compiled once)
EMOJI_REGEX = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA70-\U0001FAFF"
    "\U00002600-\U000026FF"
    "]+", flags=re.UNICODE
)

# Helper to remove emojis
def remove_emojis(text):
    return EMOJI_REGEX.sub('', text)

translator = GoogleTranslator(source="auto", target="en")
commentsTranslation = []

for i, comment in enumerate(df["comment"], start=1):
    comment = remove_emojis(str(comment)) if isinstance(comment, str) else ""
    translated = ""

    for attempt in range(5):
        try:
            if comment:
                translated = translator.translate(comment)
                print(f"Comment {i} translated successfully.")
            break
        except Exception as e:
            print(f"Translation failed for comment {i} (attempt {attempt + 1}): {e}")
            time.sleep(3)

    if not translated:
        print(f"Translation failed for comment {i} after maximum retries.")

    commentsTranslation.append(translated)

df["commentsTranslation"] = commentsTranslation


Comment 1 translated successfully.
Comment 2 translated successfully.
Comment 3 translated successfully.
Comment 4 translated successfully.
Comment 5 translated successfully.
Comment 6 translated successfully.
Comment 7 translated successfully.
Comment 8 translated successfully.
Translation failed for comment 9 after maximum retries.
Comment 10 translated successfully.
Comment 11 translated successfully.
Comment 12 translated successfully.
Comment 13 translated successfully.
Comment 14 translated successfully.
Comment 15 translated successfully.
Comment 16 translated successfully.
Comment 17 translated successfully.
Comment 18 translated successfully.
Comment 19 translated successfully.
Comment 20 translated successfully.
Comment 21 translated successfully.
Comment 22 translated successfully.
Comment 23 translated successfully.
Comment 24 translated successfully.
Comment 25 translated successfully.
Comment 26 translated successfully.
Comment 27 translated successfully.
Comment 28 trans

#### Deleting empty strings

In [4]:
df = df[df["commentsTranslation"].str.strip() != ""]

#### Saving the dataset to an excel file to manually correct translation errors

In [5]:
df.to_excel('trainingClean.xlsx')

## New comments dataset

#### Reading the new comments dataset

In [6]:
newComments = pd.read_excel('newComments.xlsx')

#### Removing emojis and translating the comments to english

In [7]:
# Emoji removal regex (compiled once)
EMOJI_REGEX = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA70-\U0001FAFF"
    "\U00002600-\U000026FF"
    "]+", flags=re.UNICODE
)

# Helper to remove emojis
def remove_emojis(text):
    return EMOJI_REGEX.sub('', text)

translator = GoogleTranslator(source="auto", target="en")
commentsTranslation = []

for i, comment in enumerate(newComments["comment"], start=1):
    comment = remove_emojis(str(comment)) if isinstance(comment, str) else ""
    translated = ""

    for attempt in range(5):
        try:
            if comment:
                translated = translator.translate(comment)
                print(f"Comment {i} translated successfully.")
            break
        except Exception as e:
            print(f"Translation failed for comment {i} (attempt {attempt + 1}): {e}")
            time.sleep(3)

    if not translated:
        print(f"Translation failed for comment {i} after maximum retries.")

    commentsTranslation.append(translated)

newComments["commentsTranslation"] = commentsTranslation

Comment 1 translated successfully.
Comment 2 translated successfully.
Comment 3 translated successfully.
Comment 4 translated successfully.
Comment 5 translated successfully.
Comment 6 translated successfully.
Comment 7 translated successfully.
Comment 8 translated successfully.
Comment 9 translated successfully.
Comment 10 translated successfully.
Comment 11 translated successfully.
Comment 12 translated successfully.
Comment 13 translated successfully.
Comment 14 translated successfully.
Comment 15 translated successfully.
Comment 16 translated successfully.
Comment 17 translated successfully.
Comment 18 translated successfully.
Comment 19 translated successfully.
Comment 20 translated successfully.
Comment 21 translated successfully.
Comment 22 translated successfully.
Comment 23 translated successfully.
Comment 24 translated successfully.
Comment 25 translated successfully.
Comment 26 translated successfully.
Comment 27 translated successfully.
Comment 28 translated successfully.
C

#### Deleting empty strings

In [8]:
newComments = newComments[newComments["commentsTranslation"].str.strip() != ""]

#### Saving the dataset to an excel file to manually correct translation errors

In [9]:
newComments.to_excel('newCommentsClean.xlsx')

# Model Training

#### Reading the cleaned training dataset

In [10]:
df = pd.read_excel('trainingClean.xlsx')

#### Importing the necessary libraries

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import nltk
from nltk.stem import WordNetLemmatizer

#### Splitting the dataset for training and testing

In [12]:
X = df['commentsTranslation']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state = 101)

#### Lemmatization

In [13]:
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def custom_tokenizer(text):
    tokens = text.split()  # Basic tokenization (replace with more advanced if needed)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return lemmatized_tokens


[nltk_data] Downloading package wordnet to C:\Users\Ivan
[nltk_data]     Padilla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Ivan
[nltk_data]     Padilla\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### Building the pipeline

In [14]:
pipeline = Pipeline([
    ('bow', CountVectorizer(tokenizer = custom_tokenizer, token_pattern = None)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC())
])

#### Using the pipeline on the training dataset

In [15]:
pipeline.fit((X_train),(y_train))

#### Using the trained model to predict the testing dataset

In [16]:
predictions = pipeline.predict(X_test)

#### Evaluating the model

In [17]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[66 12]
 [12 70]]
              precision    recall  f1-score   support

           1       0.85      0.85      0.85        78
           2       0.85      0.85      0.85        82

    accuracy                           0.85       160
   macro avg       0.85      0.85      0.85       160
weighted avg       0.85      0.85      0.85       160



#### Applying Gridsearch to find the best parameters

In [18]:
param_grid = {
    'bow__ngram_range': [(0,1), (1,2), (1,3), (1,4)],
    'tfidf__sublinear_tf': [True, False],
    'classifier__C': [0.1,1,10,100, 1000],
    'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'classifier__gamma': [1,0.1,0.01,0.001, 0.0001]
}

grid = GridSearchCV(pipeline,param_grid,refit=True, verbose = 2, scoring = 'accuracy')

In [19]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classifier__gamma=1, classifier__kernel=linear, tfidf__sublinear_tf=True; total time=   0.0s
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classifier__gamma=1, classifier__kernel=linear, tfidf__sublinear_tf=True; total time=   0.0s
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classifier__gamma=1, classifier__kernel=linear, tfidf__sublinear_tf=True; total time=   0.0s
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classifier__gamma=1, classifier__kernel=linear, tfidf__sublinear_tf=True; total time=   0.0s
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classifier__gamma=1, classifier__kernel=linear, tfidf__sublinear_tf=True; total time=   0.0s
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classifier__gamma=1, classifier__kernel=linear, tfidf__sublinear_tf=False; total time=   0.0s
[CV] END bow__ngram_range=(0, 1), classifier__C=0.1, classi

#### Printing the best parameters, estimator and score

In [20]:
grid.best_params_

{'bow__ngram_range': (1, 3),
 'classifier__C': 10,
 'classifier__gamma': 1,
 'classifier__kernel': 'rbf',
 'tfidf__sublinear_tf': False}

In [21]:
grid.best_estimator_

In [22]:
grid.best_score_

0.8901621621621623

#### Using the model with the best parameters to predict the testing dataset

In [23]:
grid_predict = grid.predict(X_test)

#### Evaluating the model with the best parameters

In [24]:
print(confusion_matrix(y_test, grid_predict))
print(classification_report(y_test, grid_predict))

[[68 10]
 [13 69]]
              precision    recall  f1-score   support

           1       0.84      0.87      0.86        78
           2       0.87      0.84      0.86        82

    accuracy                           0.86       160
   macro avg       0.86      0.86      0.86       160
weighted avg       0.86      0.86      0.86       160



# Using the trained model on unseen data

#### Reading the dataset

In [102]:
df = pd.read_excel('newCommentsClean.xlsx')

#### Using the trained model on the new comments to predict sentiments

In [103]:
predictions = grid.predict(df['commentsTranslation'])

Creating a new column for the predictions, mapping prediction to a string, and getting the length of the original comments

In [104]:
df['predictedSentiment'] = predictions

In [105]:
df['categoricalSentiment'] = df['predictedSentiment'].map({1: 'Positive', 2:'Negative'})

In [106]:
df['commentLength'] = df['commentsTranslation'].apply(len)

In [107]:
df.head()

Unnamed: 0,userGender,comment,commentsTranslation,predictedSentiment,categoricalSentiment,commentLength
0,FEMALE,No one is above the law...,No one is above the law...,1,Positive,26
1,MALE,Si BBM ang Presidente na kayang ibenta ang kap...,BBM is the president who can sell his fellow F...,2,Negative,118
2,FEMALE,BTW WHATS THE NEWS ABOUT LISATANAS? WHY ARE YO...,BTW WHATS THE NEWS ABOUT LISATANAS? WHY ARE YO...,2,Negative,109
3,FEMALE,JUSTICE SA VICTIMS NG LIBO2 NG EJK @ MGA FAMI...,Justice to the thousands of victims of EJK and...,1,Positive,108
4,FEMALE,GMA AYUS AYUSIN MO YANG BALITA MO BAKA IKAW NA...,"GMA, fix your news properly or you might be th...",2,Negative,70


# Analysis

#### Importing necessary plotting libraries

In [153]:
import plotly.io as pio
import plotly.express as px

#### Bar plot of predicted sentiment

In [158]:
categoricalSentiment = df.groupby(['categoricalSentiment']).size().reset_index(name = 'count')

In [159]:
categoricalSentiment

Unnamed: 0,categoricalSentiment,count
0,Negative,269
1,Positive,231


In [258]:
categoryFig = px.bar(categoricalSentiment, 
                     x = 'categoricalSentiment', 
                     y = 'count',  
                     labels = {'categoricalSentiment':'Predicted Sentiment', 'count':'Count'}, 
                     text_auto = True
                     )
categoryFig.show()
categoryFig.write_html('categoryFig.html', full_html = False, include_plotlyjs = 'cdn')

#### Bar plot of predicted sentiment broken down by user gender

In [154]:
categoricalSentimentGender = df.groupby(['userGender','categoricalSentiment']).size().reset_index(name = 'count')

In [155]:
categoricalSentimentGender

Unnamed: 0,userGender,categoricalSentiment,count
0,FEMALE,Negative,138
1,FEMALE,Positive,118
2,MALE,Negative,131
3,MALE,Positive,113


In [259]:
categoryGenderFig = px.bar(categoricalSentimentGender, 
                     x = 'categoricalSentiment', 
                     y = 'count', 
                     color = 'userGender', 
                     labels = {'userGender':'Gender', 'categoricalSentiment':'Predicted Sentiment', 'count':'Count'}, 
                     text_auto = True
                     )
categoryGenderFig.show()
categoryGenderFig.write_html('categoryGenderFig.html', full_html = False, include_plotlyjs = 'cdn')

#### Distribution of comment length

In [260]:
commentLength = px.histogram(df,
                             x = 'commentLength',
                             color = 'categoricalSentiment',
                             nbins = 100,
                             labels = {'count':'Count', 'commentLength':'Comment Length', 'categoricalSentiment': 'Sentiment'})
commentLength.show()
commentLength.write_html('commentLength.html', full_html = False, include_plotlyjs = 'cdn')

In [143]:
df[df['commentLength'] == 4693]['comment'].iloc[0]

'gma takot kayo sa katotohan\n\nDuterte\n\nMe and my family stand for Former President Rodrigo Roa Duterte\n\nI stand being part of this government\n\n"Honestly, our economy is stagnant. If you\'re only in it for personal gain, then step down from your position. You\'re involving innocent civilians in your actions. Our economy is dying, and investors and tourists are losing confidence. Our economy is suffering due to your self-serving politics.\n\nAs ordinary citizens, we have the power to hold you accountable for not doing your job. You\'re misusing your positions for personal gain. It\'s shameful how you\'re abusing the trust placed in you by the people who elected you. We trusted that you would fulfill your duties, but instead, you\'re only interested in lining your pockets. That money doesn\'t belong to you; it belongs to us, the people. Without us, there would be no government officials.\n\nYou\'re disrespecting the people\'s trust and dishonoring the Constitution. We urge the Dep

In [173]:
df[df['categoricalSentiment'] == 'Positive']['commentLength'].describe()

count     231.000000
mean      147.246753
std       187.086250
min         9.000000
25%        41.000000
50%        86.000000
75%       157.000000
max      1032.000000
Name: commentLength, dtype: float64

In [174]:
df[df['categoricalSentiment'] == 'Negative']['commentLength'].describe()

count     269.000000
mean      132.799257
std       377.514274
min         4.000000
25%        42.000000
50%        81.000000
75%       122.000000
max      4693.000000
Name: commentLength, dtype: float64

#### Topic Modeling using Latent Dirichlet Allocation and Non-negative Matrix Factorization

#### Importing necessary libraries

In [231]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


#### Latent Dirichlet Allocation

In [242]:
pipeline = Pipeline([
    ('tf_vectorizer', CountVectorizer(max_df = 0.95, min_df = 2, max_features = 1000, stop_words="english")),
    ('lda', LatentDirichletAllocation(n_components = 10, max_iter = 5, learning_method = 'online', learning_offset = 50.0, random_state = 0))
])

In [243]:
pipeline.fit(df['commentsTranslation'])

In [244]:
# Get feature names from CountVectorizer
feature_names = pipeline.named_steps['tf_vectorizer'].get_feature_names_out()

# Get the topic-word distribution matrix
lda_model = pipeline.named_steps['lda']
topics = lda_model.components_

# Display the top 10 words for each topic
for topic_idx, topic in enumerate(topics):
    top_words = [feature_names[i] for i in topic.argsort()[-10:]]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

Topic 1: haist, paid, post, oh, fprrd, stand, late, bias, gma, media
Topic 2: icc, accountable, held, roa, rodrigo, duterte, day, far, remember, bbm
Topic 3: detained, high, president, right, lords, philippines, drugs, report, lady, drug
Topic 4: just, ejk, served, law, country, duterte, philippines, victims, icc, justice
Topic 5: update, report, lady, arrested, people, liza, duterte, marcos, gma, news
Topic 6: paolo, hear, wanna, thing, karma, story, news, death, report, just
Topic 7: won, love, best, arrest, like, time, god, tatay, digong, president
Topic 8: torre, hiding, away, custodial, arrest, brought, state, warrant, court, icc
Topic 9: resigned, high, police, leadership, disagree, forgiveness, ask, don, wish, arrest
Topic 10: reporting, support, duterte, ug, ng, na, nga, mga, ang, sa


#### Non-negative matrix factorization

In [245]:
pipeline = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = 1000, stop_words = "english")),
    ('nmf', NMF(n_components = 10, random_state = 1, init = 'nndsvda', beta_loss = "frobenius", alpha_W = 0.00005, alpha_H = 0.00005, l1_ratio = 1))
])

In [246]:
pipeline.fit(df['commentsTranslation'])

In [247]:
# Get feature names from CountVectorizer
feature_names = pipeline.named_steps['tfidf_vectorizer'].get_feature_names_out()

# Get the topic-word distribution matrix
nmf_model = pipeline.named_steps['nmf']
topics = nmf_model.components_

# Display the top 10 words for each topic
for topic_idx, topic in enumerate(topics):
    top_words = [feature_names[i] for i in topic.argsort()[-10:]]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")

Topic 1: job, man, let, needs, great, ecstatic, long, yes, justice, served
Topic 2: media, shut, reporting, tired, haha, marcos, liza, update, gma, news
Topic 3: arrested, pnp, arrest, interpol, member, court, time, duterte, philippines, icc
Topic 4: grateful, drug, thousands, praying, innocent, families, finally, justice, ejk, victims
Topic 5: paolo, condition, detained, tantoco, fyang, stopped, america, la, lady, report
Topic 6: prove, court, extent, prosecute, opportunity, harsh, innocent, let, rule, law
Topic 7: deserved, thank, country, good, best, pray, forever, love, president, god
Topic 8: warrant, thing, know, knows, happen, going, don, wait, right, just
Topic 9: love, innocent, proven, cases, turned, protect, allah, ready, digong, tatay
Topic 10: human, long, god, innocent, time, finally, truth, let, justice, prevail


# Hypothesis testing

**Null Hypothesis**: <br>
No significant difference between positive and negative comments. <br>
**Alternative Hypothesis:** <br>
There is a significant difference between the positive and negative comments

In [254]:
import scipy.stats as stats
from scipy.stats import chi2

observed = [231, 269]
expected = [250, 250] 

alpha = 0.05
degrees_of_freedom = 1

critical_value = chi2.isf(alpha, degrees_of_freedom)
chi2_stat, p_value = stats.chisquare(observed, expected)

print(f"Chi-square Statistic: {chi2_stat}")
print(f"P-value: {p_value}")
print(f"Critical value: {critical_value}")

Chi-square Statistic: 2.888
P-value: 0.08924164599420346
Critical value: 3.8414588206941285


Since the chi-square statistic is less than the critical value and P-value (0.09) is greater than the alpha level 0.05, we have to accept the null hypothesis.