In [29]:
import pandas as pd
import numpy as np
import emoji
import re
import csv

Loading all Datasets

In [2]:
dataset_pre = pd.read_csv('Datasets/PreElection.csv', encoding='utf-8')

In [3]:
dataset_dr = pd.read_csv('Datasets/DuringElection.csv', encoding='utf-8')

In [4]:
dataset_post = pd.read_csv('Datasets/PostElection.csv', encoding='utf-8')

In [5]:
# Adding Election Period Labels by Adding Another Column
dataset_pre['Period'] = "PRE"
dataset_dr['Period'] = "DUR"
dataset_post['Period'] = "POST"

In [6]:
#Concatenate all datasets
dataset = pd.concat([dataset_pre, dataset_dr, dataset_post], ignore_index=True)

In [7]:
#1 Drop unnecessary columns

dataset.drop(['Username'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago! Any progress from the winning cand...,5/5/2022,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",5/4/2022,PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,5/7/2022,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,5/4/2022,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,5/7/2022,PRE
...,...,...,...
16638,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
16639,This election rewrites history. People voted f...,5/28/2022,POST
16640,"Hey, it’s a free country. No one has monopoly ...",5/28/2022,POST
16641,HALA-lan. \n\n#Election2022PH,5/29/2022,POST


In [8]:
#Get the exact duplicate tweets in the dataframe
print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 3085


In [9]:
dups = dataset["Tweet"]
dups_listed = dataset[dups.isin(dups[dups.duplicated()])].sort_values("Tweet")
print(dups_listed)

                                                   Tweet       Date Period
4578     @biancadava\rLOOK: Members of the deaf commu...   5/5/2022    PRE
3546     @biancadava\rLOOK: Members of the deaf commu...   5/5/2022    PRE
3681     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...   5/6/2022    PRE
4212     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...   5/7/2022    PRE
3572     @nujp\rThis elections, never forget those wh...   5/5/2022    PRE
...                                                  ...        ...    ...
3790   🚨HALALAN 2022 ADVICE🚨\rplease! wag kayong papa...   5/5/2022    PRE
11309  🤠Alrite let's discuss the Top 10-14 candidates...  5/28/2022   POST
15021  🤠Alrite let's discuss the Top 10-14 candidates...  5/28/2022   POST
110    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...   5/2/2022    PRE
784    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...   5/2/2022    PRE

[5406 rows x 3 columns]


In [10]:
# 2 Remove Exact Duplicate Tweets
# Drop the last duplicate and keep the first one
# Reset column index

dataset.drop_duplicates(subset='Tweet', keep="first", inplace=True)
dataset = dataset.reset_index()

In [11]:
# Double check if duplicate tweets were dropped

print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 0


In [12]:
# Drop old index column
dataset.drop(['index'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago! Any progress from the winning cand...,5/5/2022,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",5/4/2022,PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,5/7/2022,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,5/4/2022,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,5/7/2022,PRE
...,...,...,...
13553,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13554,This election rewrites history. People voted f...,5/28/2022,POST
13555,"Hey, it’s a free country. No one has monopoly ...",5/28/2022,POST
13556,HALA-lan. \n\n#Election2022PH,5/29/2022,POST


In [13]:
dataset['Tweet'] = dataset['Tweet'].str.replace("#", "hashtag")
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago! Any progress from the winning cand...,5/5/2022,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",5/4/2022,PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,5/7/2022,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,5/4/2022,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,5/7/2022,PRE
...,...,...,...
13553,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13554,This election rewrites history. People voted f...,5/28/2022,POST
13555,"Hey, it’s a free country. No one has monopoly ...",5/28/2022,POST
13556,HALA-lan. \n\nhashtagElection2022PH,5/29/2022,POST


In [15]:
dataset['Tweet'] = dataset["Tweet"].astype(str)

In [20]:
# 5 Remove non-alphanumeric characters, spaces & links
def clean_text(text):
    text = text.replace('?', ' ').replace('!', ' ') # Remove question marks, exclamation points
    text = ' '.join([word for word in text.split() if not word.startswith('@')]) # Remove mention tags
    text = text.replace('\n', ' ') # Remove newline characters

    #Convert emojis to text
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Remove all non-alphanumeric characters except hashtags, underscores, colons, and spaces
    text = re.sub(r'[^a-zA-Z0-9_#\s]', '', text)

# Remove words containing "http" and "https"
    text = ' '.join([word for word in text.split() if not re.search(r'https', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'http', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'youtube', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'vanBF1_BCyyo', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'story_fbid', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'listOLAK5uy', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'edition_id', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'News5GVGregorio', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'utm_', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'fbclid', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p23', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p24', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p25', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'category_', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'v_DUc7', word)])
    
    text = text.replace('...', ' ') # Remove ellipses
    text = ' '.join(text.split()) # Replace multiple spaces with a single space
    text = text.strip() # Remove leading/trailing spaces

    return text
dataset['Tweet'] = dataset['Tweet'].apply(clean_text)
print(dataset)

                                                   Tweet       Date Period
0      A year ago Any progress from the winning candi...   5/5/2022    PRE
1      Bongbong Marcos Sara Duterte express deepest g...   5/4/2022    PRE
2      I decide I only vote for BBM red_heart SARA DU...   5/7/2022    PRE
3      Mr Bbm and Ms Sara Duterte are obviously will ...   5/4/2022    PRE
4      THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...   5/7/2022    PRE
...                                                  ...        ...    ...
13553  Dictator Ferdinand Marcos from the Philippines...  5/28/2022   POST
13554  This election rewrites history People voted fo...  5/28/2022   POST
13555  Hey its a free country No one has monopoly of ...  5/28/2022   POST
13556                      HALAlan hashtagElection2022PH  5/29/2022   POST
13557  Cheating malalim na sugat kahit mahilom mahira...  5/31/2022   POST

[13558 rows x 3 columns]


In [21]:
# Check for Duplicate Tweets after cleaning of non-alphanumeric characters, spaces & links
print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 271


In [22]:
dups2 = dataset["Tweet"]
dups_listed2 = dataset[dups2.isin(dups2[dups2.duplicated()])].sort_values("Tweet")
print(dups_listed2)

                                                   Tweet       Date Period
3862   10 sample ballots were shaded Some had undervo...   5/4/2022    PRE
600    10 sample ballots were shaded Some had undervo...   5/4/2022    PRE
12039  2022 PH Election Mandaluyong City 12 Similar t...  5/26/2022   POST
8672   2022 PH Election Mandaluyong City 12 Similar t...  5/26/2022   POST
8318   23 Interestingly Leni had her worst performanc...  5/23/2022   POST
...                                                  ...        ...    ...
3237   high_voltage The Philippine election is just d...   5/6/2022    PRE
4992   libreng sakay for everyone here at Marilao Bul...   5/9/2022    DUR
4697   libreng sakay for everyone here at Marilao Bul...   5/9/2022    DUR
10637  oh why oh why when can we ever learn and rejec...  5/11/2022   POST
9057   oh why oh why when can we ever learn and rejec...  5/11/2022   POST

[511 rows x 3 columns]


In [23]:
# Drop the last duplicate and keep the first one
# Reset column index

dataset.drop_duplicates(subset='Tweet', keep="first", inplace=True)
dataset = dataset.reset_index()

In [24]:
# Drop old index column 
dataset.drop(['index'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago Any progress from the winning candi...,5/5/2022,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,5/4/2022,PRE
2,I decide I only vote for BBM red_heart SARA DU...,5/7/2022,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,5/4/2022,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,5/7/2022,PRE
...,...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13283,This election rewrites history People voted fo...,5/28/2022,POST
13284,Hey its a free country No one has monopoly of ...,5/28/2022,POST
13285,HALAlan hashtagElection2022PH,5/29/2022,POST


In [25]:
def add_colons(text):
    # Use a regular expression to find words with underscores
    pattern = r'\b(\w+)(?:_(\w+))+\b'
    
    # Define a function for substitution
    def replace(match):
        return f':{match.group(1)}_{match.group(2)}:'

    # Use re.sub to perform the substitution
    modified_text = re.sub(pattern, replace, text)
    
    return modified_text

In [26]:
dataset['Tweet'] = dataset['Tweet'].apply(add_colons)
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago Any progress from the winning candi...,5/5/2022,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,5/4/2022,PRE
2,I decide I only vote for BBM :red_heart: SARA ...,5/7/2022,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,5/4/2022,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,5/7/2022,PRE
...,...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13283,This election rewrites history People voted fo...,5/28/2022,POST
13284,Hey its a free country No one has monopoly of ...,5/28/2022,POST
13285,HALAlan hashtagElection2022PH,5/29/2022,POST


In [27]:
# 3 Convert text back to emojis
def convert_to_emojis(text):
    return emoji.emojize(text)
dataset['Tweet'] = dataset['Tweet'].apply(convert_to_emojis)
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago Any progress from the winning candi...,5/5/2022,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,5/4/2022,PRE
2,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...,5/7/2022,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,5/4/2022,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,5/7/2022,PRE
...,...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13283,This election rewrites history People voted fo...,5/28/2022,POST
13284,Hey its a free country No one has monopoly of ...,5/28/2022,POST
13285,HALAlan hashtagElection2022PH,5/29/2022,POST


In [30]:
# Create Missing Emojis Dictionary
missing_dict = {}
missing = pd.read_csv('Dictionaries/UpdatedEmojis.csv', encoding='utf-8')
with open('Dictionaries/UpdatedEmojis.csv', mode='r') as inp:
    reader = csv.reader(inp)
    headers = next(reader)
    missing_dict = {rows[0]:rows[1] for rows in reader}
missing_dict

{':backhand_index_pointing_down_mediumdark_skin_tone:': ':backhand_index_pointing_down:',
 ':backhand_index_pointing_down_mediumlight_skin_tone:': ':backhand_index_pointing_down_medium-light_skin_tone:',
 ':clapping_hands_mediumlight_skin_tone:': ':clapping_hands_medium-light_skin_tone:',
 ':DepEd_PH:': 'DepEd_PH',
 ':flexed_biceps_mediumlight_skin_tone:': ':flexed_biceps:',
 ':folded_hands_mediumdark_skin_tone:': ':folded_hands_medium-dark_skin_tone:',
 ':folded_hands_mediumlight_skin_tone:': ':folded_hands_medium-light_skin_tone:',
 ':globe_showing_AsiaAustralia:': ':globe_showing_Asia-Australia:',
 ':globe_showing_EuropeAfrica:': ':globe_showing_Europe-Africa:',
 ':hand_with_index_finger_and_thumb_crossed_mediumlight_skin_tone:': ':hand_with_index_finger_and_thumb_crossed_medium-light_skin_tone:',
 ':hashtagELEKSYON_DISCUSSION:': 'hashtagELEKSYON_DISCUSSION',
 ':hashtagFinney_Smith:': 'hashtagFinney_Smith',
 ':hashtagmartial_law:': 'hashtagmartial_law',
 ':hashtagTCPH_ANNOUNCEMENT:'

In [31]:
dataset['Tweet'] = dataset['Tweet'].apply(convert_to_emojis)
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago Any progress from the winning candi...,5/5/2022,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,5/4/2022,PRE
2,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...,5/7/2022,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,5/4/2022,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,5/7/2022,PRE
...,...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13283,This election rewrites history People voted fo...,5/28/2022,POST
13284,Hey its a free country No one has monopoly of ...,5/28/2022,POST
13285,HALAlan hashtagElection2022PH,5/29/2022,POST


In [32]:
def remove_standalone_underscores(text):
    # Use a regular expression to replace standalone underscores
    modified_text = re.sub(r'\b_\b', '', text)
    return modified_text

In [33]:
dataset['Tweet'] = dataset['Tweet'].apply(remove_standalone_underscores)

In [34]:
dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago Any progress from the winning candi...,5/5/2022,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,5/4/2022,PRE
2,I decide I only vote for BBM ❤️ SARA DUTERTE 💚...,5/7/2022,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,5/4/2022,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,5/7/2022,PRE
...,...,...,...
13282,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
13283,This election rewrites history People voted fo...,5/28/2022,POST
13284,Hey its a free country No one has monopoly of ...,5/28/2022,POST
13285,HALAlan hashtagElection2022PH,5/29/2022,POST


In [35]:
dataset.to_csv('CleanedTweets.csv', index=False, encoding='utf-8')

Raw Tweets

In [36]:
#Concatenate all datasets
raw_dataset = pd.concat([dataset_pre, dataset_dr, dataset_post], ignore_index=True)

In [37]:
#1 Drop unnecessary columns

raw_dataset.drop(['Username'], axis=1, inplace=True)
raw_dataset

Unnamed: 0,Tweet,Date,Period
0,A year ago! Any progress from the winning cand...,5/5/2022,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",5/4/2022,PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,5/7/2022,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,5/4/2022,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,5/7/2022,PRE
...,...,...,...
16638,Dictator Ferdinand Marcos from the Philippines...,5/28/2022,POST
16639,This election rewrites history. People voted f...,5/28/2022,POST
16640,"Hey, it’s a free country. No one has monopoly ...",5/28/2022,POST
16641,HALA-lan. \n\n#Election2022PH,5/29/2022,POST


In [38]:
raw_dataset.to_csv('RawTweets.csv', index=False, encoding='utf-8')