# Importing Libraries

In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import regex as re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import seaborn as sns
import csv
from chardet import detect
from spellchecker import SpellChecker
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Loading all Datasets

In [19]:
dataset_pre = pd.read_csv('Datasets/PreElection.csv', encoding='utf-8')

In [20]:
dataset_dr = pd.read_csv('Datasets/DuringElection.csv', encoding='utf-8')

In [21]:
dataset_post = pd.read_csv('Datasets/PostElection.csv', encoding='utf-8')

# Adding Election Period Labels by Adding Another Column

In [22]:
dataset_pre['Period'] = "PRE"


In [23]:
dataset_dr['Period'] = "DUR"

In [24]:
dataset_post['Period'] = "POST"

# Cleaning Datasets

In [25]:
#Concatenate all datasets
dataset = pd.concat([dataset_pre, dataset_dr, dataset_post], ignore_index=True)

In [26]:
dataset.shape

(16643, 4)

In [27]:
dataset_dr.isna().sum()

Tweet       0
Username    0
Date        0
Period      0
dtype: int64

In [32]:
#1 Drop unnecessary columns

dataset.drop(['Date','Username', 'Date'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Period
0,A year ago! Any progress from the winning cand...,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,PRE
...,...,...
16638,Dictator Ferdinand Marcos from the Philippines...,POST
16639,This election rewrites history. People voted f...,POST
16640,"Hey, it’s a free country. No one has monopoly ...",POST
16641,HALA-lan. \n\n#Election2022PH,POST


In [33]:
#Get the exact duplicate tweets in the dataframe
print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 3085


In [34]:
dups = dataset["Tweet"]
dups_listed = dataset[dups.isin(dups[dups.duplicated()])].sort_values("Tweet")
print(dups_listed)

                                                   Tweet Period
4578     @biancadava\rLOOK: Members of the deaf commu...    PRE
3546     @biancadava\rLOOK: Members of the deaf commu...    PRE
3681     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...    PRE
4212     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...    PRE
3572     @nujp\rThis elections, never forget those wh...    PRE
...                                                  ...    ...
3790   🚨HALALAN 2022 ADVICE🚨\rplease! wag kayong papa...    PRE
11309  🤠Alrite let's discuss the Top 10-14 candidates...   POST
15021  🤠Alrite let's discuss the Top 10-14 candidates...   POST
110    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...    PRE
784    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...    PRE

[5406 rows x 2 columns]


In [35]:
# 2 Remove Exact Duplicate Tweets
# Drop the last duplicate and keep the first one
# Reset column index

dataset.drop_duplicates(subset='Tweet', keep="first", inplace=True)
dataset = dataset.reset_index()

In [36]:
# Double check if duplicate tweets were dropped

print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 0


In [37]:
# Drop old index column
dataset.drop(['index'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Period
0,A year ago! Any progress from the winning cand...,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,PRE
...,...,...
13553,Dictator Ferdinand Marcos from the Philippines...,POST
13554,This election rewrites history. People voted f...,POST
13555,"Hey, it’s a free country. No one has monopoly ...",POST
13556,HALA-lan. \n\n#Election2022PH,POST


In [38]:
dataset['Tweet'] = dataset['Tweet'].str.replace("#", "hashtag")
dataset

Unnamed: 0,Tweet,Period
0,A year ago! Any progress from the winning cand...,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,PRE
...,...,...
13553,Dictator Ferdinand Marcos from the Philippines...,POST
13554,This election rewrites history. People voted f...,POST
13555,"Hey, it’s a free country. No one has monopoly ...",POST
13556,HALA-lan. \n\nhashtagElection2022PH,POST


In [39]:
dataset = dataset.astype(str)

In [40]:
# 5 Remove non-alphanumeric characters, spaces & links
def clean_text(text):
    text = text.replace('?', ' ').replace('!', ' ') # Remove question marks, exclamation points
    text = ' '.join([word for word in text.split() if not word.startswith('@')]) # Remove mention tags
    text = text.replace('\n', ' ') # Remove newline characters

    #Convert emojis to text
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Remove all non-alphanumeric characters except hashtags, underscores, colons, and spaces
    text = re.sub(r'[^a-zA-Z0-9_#\s]', '', text)

# Remove words containing "http" and "https"
    text = ' '.join([word for word in text.split() if not re.search(r'https', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'http', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'youtube', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'vanBF1_BCyyo', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'story_fbid', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'listOLAK5uy', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'edition_id', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'News5GVGregorio', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'utm_', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'fbclid', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p23', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p24', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p25', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'category_', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'v_DUc7', word)])
    
    text = text.replace('...', ' ') # Remove ellipses
    text = ' '.join(text.split()) # Replace multiple spaces with a single space
    text = text.strip() # Remove leading/trailing spaces

    return text
dataset['Tweet'] = dataset['Tweet'].apply(clean_text)
print(dataset)

                                                   Tweet Period
0      A year ago Any progress from the winning candi...    PRE
1      Bongbong Marcos Sara Duterte express deepest g...    PRE
2      I decide I only vote for BBM red_heart SARA DU...    PRE
3      Mr Bbm and Ms Sara Duterte are obviously will ...    PRE
4      THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...    PRE
...                                                  ...    ...
13553  Dictator Ferdinand Marcos from the Philippines...   POST
13554  This election rewrites history People voted fo...   POST
13555  Hey its a free country No one has monopoly of ...   POST
13556                      HALAlan hashtagElection2022PH   POST
13557  Cheating malalim na sugat kahit mahilom mahira...   POST

[13558 rows x 2 columns]


In [41]:
# Check for Duplicate Tweets after cleaning of non-alphanumeric characters, spaces & links
print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 271


In [42]:
dups2 = dataset["Tweet"]
dups_listed2 = dataset[dups2.isin(dups2[dups2.duplicated()])].sort_values("Tweet")
print(dups_listed2)

                                                   Tweet Period
3862   10 sample ballots were shaded Some had undervo...    PRE
600    10 sample ballots were shaded Some had undervo...    PRE
12039  2022 PH Election Mandaluyong City 12 Similar t...   POST
8672   2022 PH Election Mandaluyong City 12 Similar t...   POST
8318   23 Interestingly Leni had her worst performanc...   POST
...                                                  ...    ...
3237   high_voltage The Philippine election is just d...    PRE
4992   libreng sakay for everyone here at Marilao Bul...    DUR
4697   libreng sakay for everyone here at Marilao Bul...    DUR
10637  oh why oh why when can we ever learn and rejec...   POST
9057   oh why oh why when can we ever learn and rejec...   POST

[511 rows x 2 columns]


In [43]:
# Drop the last duplicate and keep the first one
# Reset column index

dataset.drop_duplicates(subset='Tweet', keep="first", inplace=True)
dataset = dataset.reset_index()

In [44]:
# Drop old index column 
dataset.drop(['index'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Period
0,A year ago Any progress from the winning candi...,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,PRE
2,I decide I only vote for BBM red_heart SARA DU...,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,PRE
...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,POST
13283,This election rewrites history People voted fo...,POST
13284,Hey its a free country No one has monopoly of ...,POST
13285,HALAlan hashtagElection2022PH,POST


In [45]:
def add_colons(text):
    # Use a regular expression to find words with underscores
    pattern = r'\b(\w+)(?:_(\w+))+\b'
    
    # Define a function for substitution
    def replace(match):
        return f':{match.group(1)}_{match.group(2)}:'

    # Use re.sub to perform the substitution
    modified_text = re.sub(pattern, replace, text)
    
    return modified_text

In [46]:
dataset['Tweet'] = dataset['Tweet'].apply(add_colons)
dataset

Unnamed: 0,Tweet,Period
0,A year ago Any progress from the winning candi...,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,PRE
2,I decide I only vote for BBM :red_heart: SARA ...,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,PRE
...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,POST
13283,This election rewrites history People voted fo...,POST
13284,Hey its a free country No one has monopoly of ...,POST
13285,HALAlan hashtagElection2022PH,POST


In [47]:
# 3 Convert text back to emojis
def convert_to_emojis(text):
    return emoji.emojize(text)
dataset['Tweet'] = dataset['Tweet'].apply(convert_to_emojis)
dataset

Unnamed: 0,Tweet,Period
0,A year ago Any progress from the winning candi...,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,PRE
2,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,PRE
...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,POST
13283,This election rewrites history People voted fo...,POST
13284,Hey its a free country No one has monopoly of ...,POST
13285,HALAlan hashtagElection2022PH,POST


It was seen that there were some unicode / textualized emojis not converted into emojis. Through checking documentation provided by the owner of the emoji package, it was seen that the formatting for the selected emojis are different than what was stated in this webpage (https://carpedm20.github.io/emoji/). Therefore, through the creation of a dictionary, the researchers placed the textualized emojis not converted into one column, and their form in the other. Other words not converting to emojis were also added. As well as underscores that were mistakenly assumed as emojis.

Some emojis were converted into their basic form without specification for skin color and to alternatives. Due to difficulties with characters & transformation.

In [48]:
# Create Missing Emojis Dictionary
missing_dict = {}
missing = pd.read_csv('UpdatedEmojis.csv', encoding='utf-8')
with open('UpdatedEmojis.csv', mode='r') as inp:
    reader = csv.reader(inp)
    headers = next(reader)
    missing_dict = {rows[0]:rows[1] for rows in reader}
missing_dict

{':backhand_index_pointing_down_mediumdark_skin_tone:': ':backhand_index_pointing_down:',
 ':backhand_index_pointing_down_mediumlight_skin_tone:': ':backhand_index_pointing_down_medium-light_skin_tone:',
 ':clapping_hands_mediumlight_skin_tone:': ':clapping_hands_medium-light_skin_tone:',
 ':DepEd_PH:': 'DepEd_PH',
 ':flexed_biceps_mediumlight_skin_tone:': ':flexed_biceps:',
 ':folded_hands_mediumdark_skin_tone:': ':folded_hands_medium-dark_skin_tone:',
 ':folded_hands_mediumlight_skin_tone:': ':folded_hands_medium-light_skin_tone:',
 ':globe_showing_AsiaAustralia:': ':globe_showing_Asia-Australia:',
 ':globe_showing_EuropeAfrica:': ':globe_showing_Europe-Africa:',
 ':hand_with_index_finger_and_thumb_crossed_mediumlight_skin_tone:': ':hand_with_index_finger_and_thumb_crossed_medium-light_skin_tone:',
 ':hashtagELEKSYON_DISCUSSION:': 'hashtagELEKSYON_DISCUSSION',
 ':hashtagFinney_Smith:': 'hashtagFinney_Smith',
 ':hashtagmartial_law:': 'hashtagmartial_law',
 ':hashtagTCPH_ANNOUNCEMENT:'

In [49]:
def convert_missing(text):
    for text_emoji, unicode_emoji in missing_dict.items():
        text = text.replace(text_emoji, unicode_emoji)
    return text

In [50]:
dataset['Tweet'] = dataset['Tweet'].apply(convert_missing)
dataset

Unnamed: 0,Tweet,Period
0,A year ago Any progress from the winning candi...,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,PRE
2,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,PRE
...,...,...
13282,Dictator Ferdinand Marcos from the :Philippine...,POST
13283,This election rewrites history People voted fo...,POST
13284,Hey its a free country No one has monopoly of ...,POST
13285,HALAlan hashtagElection2022PH,POST


In [51]:
dataset['Tweet'] = dataset['Tweet'].apply(convert_to_emojis)
dataset

Unnamed: 0,Tweet,Period
0,A year ago Any progress from the winning candi...,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,PRE
2,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,PRE
...,...,...
13282,Dictator Ferdinand Marcos from the 🇵🇭 is featu...,POST
13283,This election rewrites history People voted fo...,POST
13284,Hey its a free country No one has monopoly of ...,POST
13285,HALAlan hashtagElection2022PH,POST


In [52]:
def remove_standalone_underscores(text):
    # Use a regular expression to replace standalone underscores
    modified_text = re.sub(r'\b_\b', '', text)
    return modified_text

In [53]:
dataset['Tweet'] = dataset['Tweet'].apply(remove_standalone_underscores)

# Add Labels

In [54]:
dataset['Tweets for Form'] = dataset['Tweet']

In [55]:
def insert_label(row):
    period = row['Period']
    text = row['Tweet']

    if period == 'PRE':
        return 'PRE - ' + text
    elif period == 'DUR':
        return 'DUR - ' + text
    elif period == 'POST':
        return 'POST - ' + text
    
    return text

dataset['Tweet'] = dataset.apply(insert_label, axis=1)
dataset

Unnamed: 0,Tweet,Period,Tweets for Form
0,PRE - A year ago Any progress from the winning...,PRE,A year ago Any progress from the winning candi...
1,PRE - Bongbong Marcos Sara Duterte express dee...,PRE,Bongbong Marcos Sara Duterte express deepest g...
2,PRE - I decide I only vote for BBM ❤️ SARA DUT...,PRE,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...
3,PRE - Mr Bbm and Ms Sara Duterte are obviously...,PRE,Mr Bbm and Ms Sara Duterte are obviously will ...
4,PRE - THE CROWD IS INSANE BBM YOULL FOREVER BE...,PRE,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...
...,...,...,...
13282,POST - Dictator Ferdinand Marcos from the 🇵🇭 i...,POST,Dictator Ferdinand Marcos from the 🇵🇭 is featu...
13283,POST - This election rewrites history People v...,POST,This election rewrites history People voted fo...
13284,POST - Hey its a free country No one has monop...,POST,Hey its a free country No one has monopoly of ...
13285,POST - HALAlan hashtagElection2022PH,POST,HALAlan hashtagElection2022PH


In [56]:
del dataset['Period']

In [57]:
dataset

Unnamed: 0,Tweet,Tweets for Form
0,PRE - A year ago Any progress from the winning...,A year ago Any progress from the winning candi...
1,PRE - Bongbong Marcos Sara Duterte express dee...,Bongbong Marcos Sara Duterte express deepest g...
2,PRE - I decide I only vote for BBM ❤️ SARA DUT...,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...
3,PRE - Mr Bbm and Ms Sara Duterte are obviously...,Mr Bbm and Ms Sara Duterte are obviously will ...
4,PRE - THE CROWD IS INSANE BBM YOULL FOREVER BE...,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...
...,...,...
13282,POST - Dictator Ferdinand Marcos from the 🇵🇭 i...,Dictator Ferdinand Marcos from the 🇵🇭 is featu...
13283,POST - This election rewrites history People v...,This election rewrites history People voted fo...
13284,POST - Hey its a free country No one has monop...,Hey its a free country No one has monopoly of ...
13285,POST - HALAlan hashtagElection2022PH,HALAlan hashtagElection2022PH


In [59]:
dataset.to_csv('final_labelguide.csv', index=False, encoding='utf-8')