# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import regex as re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import seaborn as sns
import csv
from chardet import detect
from spellchecker import SpellChecker
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading all Datasets

In [2]:
dataset_pre = pd.read_csv('Datasets/PreElection.csv')

In [3]:
dataset_dr = pd.read_csv('Datasets/DuringElection.csv')

In [4]:
dataset_post = pd.read_csv('Datasets/PostElection.csv')

# Adding Election Period Labels by Adding Another Column

In [5]:
dataset_pre['Period'] = "PRE"


In [6]:
dataset_dr['Period'] = "DUR"

In [7]:
dataset_post['Period'] = "POST"

# Cleaning Datasets

In [8]:
#Concatenate all datasets
dataset = pd.concat([dataset_pre, dataset_dr, dataset_post], ignore_index=True)

In [9]:
dataset.shape

(16646, 4)

In [10]:
#1 Drop unnecessary columns

dataset.drop(['Date','Username'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Period
0,A year ago! Any progress from the winning cand...,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,PRE
...,...,...
16641,Dictator Ferdinand Marcos from the Philippines...,POST
16642,This election rewrites history. People voted f...,POST
16643,"Hey, it’s a free country. No one has monopoly ...",POST
16644,HALA-lan. \n\n#Election2022PH,POST


In [11]:
#Get the exact duplicate tweets in the dataframe
print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 3108


In [12]:
dups = dataset["Tweet"]
dups_listed = dataset[dups.isin(dups[dups.duplicated()])].sort_values("Tweet")
print(dups_listed)

                                                   Tweet Period
3546     @biancadava\rLOOK: Members of the deaf commu...    PRE
4578     @biancadava\rLOOK: Members of the deaf commu...    PRE
3681     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...    PRE
4212     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...    PRE
3572     @nujp\rThis elections, never forget those wh...    PRE
...                                                  ...    ...
3790   🚨HALALAN 2022 ADVICE🚨\rplease! wag kayong papa...    PRE
15024  🤠Alrite let's discuss the Top 10-14 candidates...   POST
11312  🤠Alrite let's discuss the Top 10-14 candidates...   POST
110    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...    PRE
784    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...    PRE

[5440 rows x 2 columns]


In [13]:
# 2 Remove Exact Duplicate Tweets
# Drop the last duplicate and keep the first one
# Reset column index

dataset.drop_duplicates(subset='Tweet', keep="first", inplace=True)
dataset = dataset.reset_index()

In [14]:
# Double check if duplicate tweets were dropped

print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 0


In [15]:
# Drop old index column
dataset.drop(['index'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Period
0,A year ago! Any progress from the winning cand...,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,PRE
...,...,...
13533,Dictator Ferdinand Marcos from the Philippines...,POST
13534,This election rewrites history. People voted f...,POST
13535,"Hey, it’s a free country. No one has monopoly ...",POST
13536,HALA-lan. \n\n#Election2022PH,POST


In [16]:
dataset['Tweet'] = dataset['Tweet'].str.replace("#", "hashtag")
dataset

Unnamed: 0,Tweet,Period
0,A year ago! Any progress from the winning cand...,PRE
1,"Bongbong Marcos, Sara Duterte express deepest ...",PRE
2,I decide I only vote for\rBBM❤️ SARA DUTERTE💚 ...,PRE
3,Mr. Bbm and Ms. Sara Duterte are obviously wil...,PRE
4,THE CROWD IS INSANE. BBM YOULL FOREVER BE INSA...,PRE
...,...,...
13533,Dictator Ferdinand Marcos from the Philippines...,POST
13534,This election rewrites history. People voted f...,POST
13535,"Hey, it’s a free country. No one has monopoly ...",POST
13536,HALA-lan. \n\nhashtagElection2022PH,POST


In [17]:
# 5 Remove non-alphanumeric characters, spaces & links
def clean_text(text):
    text = text.replace('?', ' ').replace('!', ' ') # Remove question marks, exclamation points
    text = ' '.join([word for word in text.split() if not word.startswith('@')]) # Remove mention tags
    text = text.replace('\n', ' ') # Remove newline characters

    # Remove all non-alphanumeric characters except hashtags, underscores, and spaces
    text = re.sub(r'[^a-zA-Z0-9_#\s]', '', text)

# Remove words containing "http" and "https"
    text = ' '.join([word for word in text.split() if not re.search(r'https', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'http', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'youtube', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'vanBF1_BCyyo', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'story_fbid', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'listOLAK5uy', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'edition_id', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'News5GVGregorio', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'utm_', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'fbclid', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p23', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p24', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'p25', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'category_', word)])
    text = ' '.join([word for word in text.split() if not re.search(r'v_DUc7', word)])
    
    text = text.replace('...', ' ') # Remove ellipses
    text = ' '.join(text.split()) # Replace multiple spaces with a single space
    text = text.strip() # Remove leading/trailing spaces
    return text

dataset['Tweet'] = dataset['Tweet'].apply(clean_text)
print(dataset)

                                                   Tweet Period
0      A year ago Any progress from the winning candi...    PRE
1      Bongbong Marcos Sara Duterte express deepest g...    PRE
2      I decide I only vote for BBM SARA DUTERTE PULA...    PRE
3      Mr Bbm and Ms Sara Duterte are obviously will ...    PRE
4      THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...    PRE
...                                                  ...    ...
13533  Dictator Ferdinand Marcos from the Philippines...   POST
13534  This election rewrites history People voted fo...   POST
13535  Hey its a free country No one has monopoly of ...   POST
13536                      HALAlan hashtagElection2022PH   POST
13537  Cheating malalim na sugat kahit mahilom mahira...   POST

[13538 rows x 2 columns]


In [18]:
# Check for Duplicate Tweets after cleaning of non-alphanumeric characters, spaces & links
print("Duplicate Tweets:", len(dataset['Tweet'])-len(dataset['Tweet'].drop_duplicates()))

Duplicate Tweets: 331


In [19]:
dups2 = dataset["Tweet"]
dups_listed2 = dataset[dups2.isin(dups2[dups2.duplicated()])].sort_values("Tweet")
print(dups_listed)

                                                   Tweet Period
3546     @biancadava\rLOOK: Members of the deaf commu...    PRE
4578     @biancadava\rLOOK: Members of the deaf commu...    PRE
3681     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...    PRE
4212     @mommydotsorry\r#Halalan2022 #UniteamBBMSara...    PRE
3572     @nujp\rThis elections, never forget those wh...    PRE
...                                                  ...    ...
3790   🚨HALALAN 2022 ADVICE🚨\rplease! wag kayong papa...    PRE
15024  🤠Alrite let's discuss the Top 10-14 candidates...   POST
11312  🤠Alrite let's discuss the Top 10-14 candidates...   POST
110    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...    PRE
784    🧵on 🇹🇼#Halalan2022| I’m sorry for failing y’al...    PRE

[5440 rows x 2 columns]


In [20]:
# Drop the last duplicate and keep the first one
# Reset column index

dataset.drop_duplicates(subset='Tweet', keep="first", inplace=True)
dataset = dataset.reset_index()

In [21]:
# Drop old index column 
dataset.drop(['index'], axis=1, inplace=True)
dataset

Unnamed: 0,Tweet,Period
0,A year ago Any progress from the winning candi...,PRE
1,Bongbong Marcos Sara Duterte express deepest g...,PRE
2,I decide I only vote for BBM SARA DUTERTE PULA...,PRE
3,Mr Bbm and Ms Sara Duterte are obviously will ...,PRE
4,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...,PRE
...,...,...
13202,Dictator Ferdinand Marcos from the Philippines...,POST
13203,This election rewrites history People voted fo...,POST
13204,Hey its a free country No one has monopoly of ...,POST
13205,HALAlan hashtagElection2022PH,POST


# Add Labels

In [22]:
dataset['Tweets for Form'] = dataset['Tweet']

In [23]:
def insert_label(row):
    period = row['Period']
    text = row['Tweet']

    if period == 'PRE':
        return 'PRE - ' + text
    elif period == 'DUR':
        return 'DUR - ' + text
    elif period == 'POST':
        return 'POST - ' + text
    
    return text

dataset['Tweet'] = dataset.apply(insert_label, axis=1)
dataset

Unnamed: 0,Tweet,Period,Tweets for Form
0,PRE - A year ago Any progress from the winning...,PRE,A year ago Any progress from the winning candi...
1,PRE - Bongbong Marcos Sara Duterte express dee...,PRE,Bongbong Marcos Sara Duterte express deepest g...
2,PRE - I decide I only vote for BBM SARA DUTERT...,PRE,I decide I only vote for BBM SARA DUTERTE PULA...
3,PRE - Mr Bbm and Ms Sara Duterte are obviously...,PRE,Mr Bbm and Ms Sara Duterte are obviously will ...
4,PRE - THE CROWD IS INSANE BBM YOULL FOREVER BE...,PRE,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...
...,...,...,...
13202,POST - Dictator Ferdinand Marcos from the Phil...,POST,Dictator Ferdinand Marcos from the Philippines...
13203,POST - This election rewrites history People v...,POST,This election rewrites history People voted fo...
13204,POST - Hey its a free country No one has monop...,POST,Hey its a free country No one has monopoly of ...
13205,POST - HALAlan hashtagElection2022PH,POST,HALAlan hashtagElection2022PH


In [24]:
del dataset['Period']

In [25]:
dataset

Unnamed: 0,Tweet,Tweets for Form
0,PRE - A year ago Any progress from the winning...,A year ago Any progress from the winning candi...
1,PRE - Bongbong Marcos Sara Duterte express dee...,Bongbong Marcos Sara Duterte express deepest g...
2,PRE - I decide I only vote for BBM SARA DUTERT...,I decide I only vote for BBM SARA DUTERTE PULA...
3,PRE - Mr Bbm and Ms Sara Duterte are obviously...,Mr Bbm and Ms Sara Duterte are obviously will ...
4,PRE - THE CROWD IS INSANE BBM YOULL FOREVER BE...,THE CROWD IS INSANE BBM YOULL FOREVER BE INSAN...
...,...,...
13202,POST - Dictator Ferdinand Marcos from the Phil...,Dictator Ferdinand Marcos from the Philippines...
13203,POST - This election rewrites history People v...,This election rewrites history People voted fo...
13204,POST - Hey its a free country No one has monop...,Hey its a free country No one has monopoly of ...
13205,POST - HALAlan hashtagElection2022PH,HALAlan hashtagElection2022PH


In [26]:
dataset.to_csv('label_guide.csv', index=False)