In [4]:
#Reprocess will create a csv file that will add the non processed tweets to another column 
#along with their preprocessed versions

#import necessary libraries
import pandas as pd
import csv
import re
import validators
import emoji
import unidecode
import nltk
import pickle
nltk.download('stopwords')
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import WhitespaceTokenizer


[nltk_data] Downloading package stopwords to C:\Users\Kirby
[nltk_data]     Wenceslao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Pre Processing

#Declare Stop Words

filipino_stopwords = set(
    """
akin
aking
ako
alin
am
amin
aming
ang
ano
anumang
apat
at
atin
ating
ay
bababa
bago
bakit
bawat
bilang
dahil
dalawa
dapat
din
dito
doon
gagawin
gayunman
ginagawa
ginawa
ginawang
gumawa
gusto
habang
hanggang
hindi
huwag
iba
ibaba
ibabaw
ibig
ikaw
ilagay
ilalim
ilan
inyong
isa
isang
itaas
ito
iyo
iyon
iyong
ka
kahit
kailangan
kailanman
kami
kanila
kanilang
kanino
kanya
kanyang
kapag
kapwa
karamihan
katiyakan
katulad
kaya
kaysa
ko
kong
kulang
kumuha
kung
laban
lahat
lamang
likod
lima
maaari
maaaring
maging
mahusay
makita
marami
marapat
masyado
may
mayroon
mga
minsan
mismo
mula
muli
na
nabanggit
naging
nagkaroon
nais
nakita
namin
napaka
narito
nasaan
ng
ngayon
ni
nila
nilang
nito
niya
niyang
noon
o
pa
paano
pababa
paggawa
pagitan
pagkakaroon
pagkatapos
palabas
pamamagitan
panahon
pangalawa
para
paraan
pareho
pataas
pero
pumunta
pumupunta
sa
saan
sabi
sabihin
sarili
sila
sino
siya
tatlo
tayo
tulad
tungkol
una
walang
""".split()
)

from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

search = "leni robredo bongbong marcos isko moreno domagoso manny pacman pacquiao ping lacson ernie abella leody de guzman norberto gonzales jose montemayor jr faisal mangondato"
candidatelist = search.split(" ")

#pre-process tweet input
def pre_process_tweet(tweet_input):
    
    #Step 1 - Extract Tweet from input
    #Tweet = tweet_input
    tweet = tweet_input.strip().replace("\n"," ")
    
    #Step 2 - Data Deidentification
    output = ""
    sentence = tweet.split(" ")
    for part in sentence:
        if not re.match(r"(^|[^@\w])@(\w{1,15})\b", part):
            if len(output) == 0:
                output = f"{part}"
            else:
                output = f"{output} {part}"

    tweets_de_identified = output
    
    #Step 3 - URL Removal
    output = ""
    sentence = tweets_de_identified.split(" ")
    for part in sentence:
        valid = validators.url(part)

        if (not valid == True):
            if len(output) == 0:
                output = f"{part}"
            else:
                output = f"{output} {part}"
                
    tweets_url_removed = output
    
    #Step 4 - Special Character Processing
    
    emoji_removed = emoji.replace_emoji(tweets_url_removed, replace="[emoji]")
    output = ""
    sentence = emoji_removed.split(" ")
    
    for part in sentence:
        if not (re.match(r"^[_\W]+$", part) or "[emoji]" in part):
            if len(output) == 0:
                output = f"{part}"
            else:
                output = f"{output} {part}"
    
    tweets_specialcharacters_removed = output
    
    #Step 5 - Normalization, lowercase>removediacritics>remove numerics and symbols>stopwords
    
    #lowercase the text
    lowercased_input = tweets_specialcharacters_removed.lower()

    #remove diacritics
    diacritics_removed = unidecode.unidecode(lowercased_input)

    output = ""
    sentence = diacritics_removed.split(" ")

    for part in sentence:
        part = re.sub("[^A-Za-z ]+$", "", part)
        part = re.sub("^[^A-Za-z #]+", "", part)
        if not (len(part) <= 1 or re.match(r"[^#a-zA-Z]", part) or part in english_stopwords or part in filipino_stopwords or any(part in x for x in candidatelist)):     
            if len(output) == 0:
                output = f"{part}"
            else:
                output = f"{output} {part}"  
                
    tweets_normalized = output
    
    #Step 6 - Hashtag Processing, removing the hashtags from the tweet
    output = ""
    sentence = tweets_normalized.split(" ")

    for part in sentence:
        if not re.match(r"#(\w+)", part):
            if len(output) == 0:
                output = f"{part}"
            else:
                output = f"{output} {part}"
                
    tweets_hashtags_removed = output  
    #Step 7 - Tokenization
    tokenizer = WhitespaceTokenizer()
    
    output = tokenizer.tokenize(tweets_hashtags_removed)
    
    tweets_tokenized = output
    tokens = ','.join(str(s) for s in tweets_tokenized)
    
    
    return tweets_tokenized

In [6]:
#Extraction of Filipino Tweets
#Refers to opening the raw data containing the Filipino Tweets
#into the python program

data_path = "dataset.csv"
#using pandas
df = pd.read_csv(data_path)
df

Unnamed: 0,Tweet Content,Sentiment,Label
0,Worst Bong ever. https://t.co/QA7R8VYppC,Negative,Hate
1,what i dont like about leni robredo's platform...,Negative,Hate
2,Ito ang tunay na survey ni VP Leni Robredo #1 ...,Negative,Hate
3,(3) BBM sued for Pork Barrel Scam\n\nhttps://t...,Negative,Hate
4,Sabog din sumagot tong si Norberto Gonzales no...,Negative,Hate
...,...,...,...
5115,President Leni Robredo and Vice President Kiko...,Neutral,Non-hate
5116,@jillrobredo 🌺🌺🌺\nthank you din kay @maraceped...,Neutral,Non-hate
5117,LOOK: Presidential candidate Bongbong Marcos m...,Neutral,Non-hate
5118,@itsmaxandcheese Leni Robredo for President 2022,Neutral,Non-hate


In [7]:
tweet_list = df['Tweet Content'].values
preprocessed_list = []
for tweet in tweet_list:
    preprocessed_list.append(pre_process_tweet(tweet))

df['Preprocessed_Tweet'] = preprocessed_list

In [8]:
df

Unnamed: 0,Tweet Content,Sentiment,Label,Preprocessed_Tweet
0,Worst Bong ever. https://t.co/QA7R8VYppC,Negative,Hate,"[worst, ever]"
1,what i dont like about leni robredo's platform...,Negative,Hate,"[dont, like, robredo's, platforms, likes, play..."
2,Ito ang tunay na survey ni VP Leni Robredo #1 ...,Negative,Hate,"[tunay, survey, vp, pinaka-ayaw, ganyan, tal, ..."
3,(3) BBM sued for Pork Barrel Scam\n\nhttps://t...,Negative,Hate,"[bbm, sued, pork, barrel, scam, pcgg, blocked,..."
4,Sabog din sumagot tong si Norberto Gonzales no...,Negative,Hate,"[sabog, sumagot, tong, si, padala, daw, tropa,..."
...,...,...,...,...
5115,President Leni Robredo and Vice President Kiko...,Neutral,Non-hate,"[president, vice, president, kiko, pangilinan,..."
5116,@jillrobredo 🌺🌺🌺\nthank you din kay @maraceped...,Neutral,Non-hate,"[thank, kay, rappler, nice, reporting, vp, ral..."
5117,LOOK: Presidential candidate Bongbong Marcos m...,Neutral,Non-hate,"[look, presidential, candidate, meets, ten, go..."
5118,@itsmaxandcheese Leni Robredo for President 2022,Neutral,Non-hate,[president]


In [9]:
df.to_csv('comp_dataset_binary.csv', index=False)