# Load libraries

In [1]:
# !pip install scikit-learn
# !pip install nltk
# !pip install emoji

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import emoji

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load data

In [3]:
# Load the datasets
rappler_docs = pd.read_excel('rappler_corpus.xlsx')
youtube_docs = pd.read_excel('youtube_corpus.xlsx')

# Standardize date_published column
# Rappler docs come in +08:00 timezone
rappler_docs['date_published'] = pd.to_datetime(
    rappler_docs['date_published']).dt.tz_convert(tz='Asia/Manila').dt.tz_localize(None)
# Youtube docs come in UTC timezone
youtube_docs['date_published'] = pd.to_datetime(
    youtube_docs['date_published']).dt.tz_convert(tz='Asia/Manila').dt.tz_localize(None)

# Assign source to each dataset
rappler_docs['source'] = 'rappler'
youtube_docs['source'] = 'youtube'

# Combine the datasets
corpus = pd.concat([
  rappler_docs,
  youtube_docs
], ignore_index=True, axis=0)

corpus

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id
0,DOJ on 'slow' pace of Alice Guo case: 'We cann...,https://www.rappler.com/philippines/doj-respon...,2024-08-28 10:24:17,"MANILA, Philippines – The Department of Justic...",rappler,,
1,Lawyer who notarized Alice Guo's counter affid...,https://www.rappler.com/philippines/lawyer-not...,2024-08-28 07:00:00,"CLARK FREEPORT, Philippines – The lawyer who n...",rappler,,
2,Alice Guo and siblings fled Philippines by boat,https://www.rappler.com/philippines/alice-guo-...,2024-08-27 13:20:33,"MANILA, Philippines – Dismissed Bamban, Tarlac...",rappler,,
3,"Alice Guo's sister, Porac POGO staff to face S...",https://www.rappler.com/philippines/alice-guo-...,2024-08-27 10:03:09,"MANILA, Philippines – After an embarrassing la...",rappler,,
4,"Cassandra Ong, Sheila Guo in Congress custody ...",https://www.rappler.com/video/daily-wrap/augus...,2024-08-26 21:53:00,Here are today’s headlines – the latest news i...,rappler,,
...,...,...,...,...,...,...,...
206,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 19:24:46,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg
207,"That vp is rude, entitled and arrogant like he...",https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 21:13:32,"That vp is rude, entitled and arrogant like he...",youtube,2.0,UgwBBzeGtqWna6y4KUJ4AaABAg
208,What country is this? Very interesting.,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 22:16:52,What country is this? Very interesting.,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg
209,@@cvoutdoors9859palamunin,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 23:14:26,@@cvoutdoors9859palamunin,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg


# Preprocess text

## Load stopwords

In [4]:
from pandas.errors import EmptyDataError

try:
  basic_stopwords = list(
    # Handle empty data error
    pd.read_csv('basic_stopwords.txt', header=None).values.flatten()
  )
except EmptyDataError:
  basic_stopwords = []

try:
  domain_stopwords = list(
    pd.read_csv('domain_stopwords.txt', header=None).values.flatten()
  )
except EmptyDataError:
  domain_stopwords = []

In [5]:
def preprocess_text(corpus, text_column='text'):
  cleaned_corpus = corpus.copy()

  # Lowercase
  cleaned_corpus['cleaned_text'] = cleaned_corpus[text_column].str.lower()

  # Lemmatize (by default, lemmatize nouns)
  # Other options:
  #   'v' for verbs
  #   'a' for adjectives
  #   'r' for adverbs
  #   's' for satellites adjectives (adjectives that appear after verbs)
  lemmatizer = WordNetLemmatizer()
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
      lambda text: ' '.join(
        [lemmatizer.lemmatize(word, pos='n') for word in text.split()]
      )
  )

  # Stemmer
  stemmer = PorterStemmer()
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
      lambda text: ' '.join(
        [stemmer.stem(word) for word in text.split()]
      )
  )

  # Remove NLTK stopwords
  en_stopwords_list = stopwords.words('english')
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [
        word for word in text.split() if word not in en_stopwords_list
      ]
    )
  )

  # Remove basic stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in basic_stopwords]
    )
  )

  # Remove domain stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in domain_stopwords]
    )
  )

  # Remove trailing and leading whitespaces
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.strip()

  # Remove non-alphanumeric characters
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\W', ' ', regex=True)

  # Remove numbers
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\d+', ' ', regex=True)

  # Remove emojis using emoji library
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in list(emoji.EMOJI_DATA.keys())]
    )
  )

  return cleaned_corpus['cleaned_text']

In [6]:
corpus['cleaned_text'] = preprocess_text(corpus)

In [7]:
corpus.to_excel('cleaned_corpus.xlsx', index=False)

In [8]:
corpus

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id,cleaned_text
0,DOJ on 'slow' pace of Alice Guo case: 'We cann...,https://www.rappler.com/philippines/doj-respon...,2024-08-28 10:24:17,"MANILA, Philippines – The Department of Justic...",rappler,,,manila philippin depart justic doj defend stra...
1,Lawyer who notarized Alice Guo's counter affid...,https://www.rappler.com/philippines/lawyer-not...,2024-08-28 07:00:00,"CLARK FREEPORT, Philippines – The lawyer who n...",rappler,,,clark freeport philippin lawyer notar counter ...
2,Alice Guo and siblings fled Philippines by boat,https://www.rappler.com/philippines/alice-guo-...,2024-08-27 13:20:33,"MANILA, Philippines – Dismissed Bamban, Tarlac...",rappler,,,manila philippin dismiss bamban tarlac mayor a...
3,"Alice Guo's sister, Porac POGO staff to face S...",https://www.rappler.com/philippines/alice-guo-...,2024-08-27 10:03:09,"MANILA, Philippines – After an embarrassing la...",rappler,,,manila philippin embarrass laps let dismiss ma...
4,"Cassandra Ong, Sheila Guo in Congress custody ...",https://www.rappler.com/video/daily-wrap/augus...,2024-08-26 21:53:00,Here are today’s headlines – the latest news i...,rappler,,,today headlin latest news philippin around wor...
...,...,...,...,...,...,...,...,...
206,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 19:24:46,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,ul l dignified pinapakyuhan nga si bong daza p...
207,"That vp is rude, entitled and arrogant like he...",https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 21:13:32,"That vp is rude, entitled and arrogant like he...",youtube,2.0,UgwBBzeGtqWna6y4KUJ4AaABAg,vp rude entitl arrog like father they r use sp...
208,What country is this? Very interesting.,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 22:16:52,What country is this? Very interesting.,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,countri this veri interesting
209,@@cvoutdoors9859palamunin,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 23:14:26,@@cvoutdoors9859palamunin,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,cvoutdoors palamunin
