# Loading the libraries

In [28]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [44]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Loading the data

In [18]:
# Load data
rappler_docs = pd.read_excel('rappler_corpus.xlsx')
youtube_docs = pd.read_excel('youtube_corpus.xlsx')

Unnamed: 0,title,link,date_published,text,like_count,reply_parent_id
0,Escape of guo is a reflection of country&#39;s...,https://www.youtube.com/watch?v=0v1XHgWyIvU&lc...,2024-08-27T10:40:10Z,Escape of guo is a reflection of country's st...,0,


## Inspecting the data

In [21]:
display(
  rappler_docs.head(1)
)
display(
  youtube_docs.head(1)
)

rappler_docs.info()
youtube_docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           49 non-null     object
 1   link            49 non-null     object
 2   date_published  49 non-null     object
 3   text            49 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            162 non-null    object
 1   link             162 non-null    object
 2   date_published   162 non-null    object
 3   text             162 non-null    object
 4   like_count       162 non-null    int64 
 5   reply_parent_id  18 non-null     object
dtypes: int64(1), object(5)
memory usage: 7.7+ KB


# Preprocess the data

## Convert datetime data type

In [None]:
# Convert date_published as datetime data type
rappler_docs['date_published'] = pd.to_datetime(
  rappler_docs['date_published']
)

youtube_docs['date_published'] = pd.to_datetime(
  youtube_docs['date_published']
)

## Assign source for each corpus

In [27]:
# Combine the two corpus
# Assign source to each corpus
rappler_docs['source'] = 'rappler'
youtube_docs['source'] = 'youtube'

## Combine the corpus into one

In [None]:
# Combine the two corpus
docs = pd.concat([
  rappler_docs, youtube_docs
], ignore_index=True, axis=0)
display(docs.head(1))
display(docs.tail(1))

## Clean corpus

In [45]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')


def clean_docs(docs):
  '''# Copy clean_tweets function from the Exploratory Data Analysis'''
  # Transform into lowercase
  docs['cleaned_text'] = docs['text'].str.lower()

  # Remove usernames, non-alphanumeric characters, and links
  # docs['cleaned_text'] = docs['cleaned_text'].str.replace(r'(@[A-Za-z0-9_]+)|([^A-Za-z0-9_ \t])|(\w+:\/\/\S+)', '')

  # Remove stopwords
  docs['cleaned_text'] = docs['cleaned_text'].apply(
    lambda row: ' '.join([word for word in row.split() if word not in (stopwords)]))

  # Lemmatize verbs
  docs['cleaned_text'] = docs['cleaned_text'].apply(
    lambda row: ' '.join([lemmatizer.lemmatize(x, 'v') for x in row.split()]))

  # Lemmatize adjectives
  docs['cleaned_text'] = docs['cleaned_text'].apply(
    lambda row: ' '.join([lemmatizer.lemmatize(x, 'a') for x in row.split()]))

  # Lemmatize nouns
  docs['cleaned_text'] = docs['cleaned_text'].apply(
    lambda row: ' '.join([lemmatizer.lemmatize(x, 'n') for x in row.split()]))

  # Remove trailing and leading whitespaces
  docs['cleaned_text'] = docs['cleaned_text'].str.replace(r'^\s+|\s+$', '')

  return docs

In [48]:
cleaned_docs = clean_docs(docs)
cleaned_docs.head(1)

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id,cleaned_text
0,DOJ on 'slow' pace of Alice Guo case: 'We cann...,https://www.rappler.com/philippines/doj-respon...,2024-08-28 10:24:17+08:00,"MANILA, Philippines – The Department of Justic...",rappler,,,"manila, philippine – department justice (doj) ..."


### Cleaning emojis

In [41]:
sample_sentence = "hello world @helloWorld 😅"
sample_sentence_2 = "Wait ko si dugong mag salita na JOKE LNG😁☺️<br>Kayu naman naniniwla agad😂"

sample_sentence_2.replace(r'(@[A-Za-z0-9_]+)|([^A-Za-z0-9_ \t])|(\w+:\/\/\S+)', '')

'Wait ko si dugong mag salita na JOKE LNG😁☺️<br>Kayu naman naniniwla agad😂'

# Topic Modeling

In [50]:
# !pip install pyLDAvis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import numpy as np

## Document-Term Matrix

In [52]:
vectorizer = CountVectorizer(
  max_df=0.95,  # terms that appears 95% within the corpus
  min_df=2,  # terms that appear at 2x within the corpus
  stop_words='english'  # ignore english stopwords
)
doc_term_matrix = vectorizer.fit_transform(
  cleaned_docs['cleaned_text']
)

doc_term_matrix.toarray().shape

(211, 2074)

In [53]:
pd.DataFrame(
  doc_term_matrix.toarray()
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2064,2065,2066,2067,2068,2069,2070,2071,2072,2073
0,0,0,0,0,0,0,0,1,1,0,...,0,0,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,0,2,0,0,...,0,0,2,6,0,0,0,0,0,0
2,0,0,0,0,0,1,0,2,1,1,...,0,0,3,1,1,0,0,0,0,0
3,0,2,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## LDA

In [54]:
LDA = LatentDirichletAllocation(
  n_components=5,  # no. of topics
  random_state=42  # random seed for replicability
)
LDA.fit(doc_term_matrix)

In [55]:
len(LDA.components_)

5

## Topic-Term Matrix

In [56]:
topic_term_matrix = LDA.components_
pd.DataFrame(topic_term_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2064,2065,2066,2067,2068,2069,2070,2071,2072,2073
0,0.200003,0.200004,0.200005,0.200002,0.2,0.200001,0.200001,0.200001,0.200003,0.200002,...,0.200001,0.200002,0.200003,0.200013,0.200001,0.200001,0.200001,0.200001,0.200001,0.200002
1,15.49133,0.200042,0.200001,2.200023,0.200059,1.197414,0.2,3.094403,2.005552,3.199222,...,0.470314,0.201674,10.091718,29.323772,0.22875,0.2,0.2,0.530551,3.195316,0.201674
2,1.828132,6.197786,0.200116,0.203274,2.19991,0.202583,0.201038,8.30539,4.71074,0.201561,...,0.200417,10.852795,0.20263,0.20079,4.17122,4.199589,0.200831,0.200461,0.200142,10.852782
3,6.244602,0.200908,1.074056,0.200001,0.2,0.2,2.540368,0.200207,1.249313,1.208523,...,1.929267,6.54191,0.28902,4.843709,0.2,0.2,1.309473,24.867519,0.202716,6.541934
4,7.235933,0.20126,1.325822,2.1967,0.200031,1.200002,5.858593,0.2,2.834392,8.190691,...,0.2,0.203619,2.216629,5.431717,0.20003,0.20041,2.089696,0.201467,0.201825,0.203609


In [63]:
topic1 = topic_term_matrix[0]
topic1

# sort the scores from lowest to highest
# will return all terms
topic1.argsort()

# select top 10 terms within topic1
topic1_term_index = topic1.argsort()[-10:]

# convert back to words/terms
[vectorizer.get_feature_names_out()[index] for index in topic1_term_index]

['ping',
 'kalokohan',
 'quiboloy',
 'dam',
 'pinoy',
 'davao',
 'escape',
 'mga',
 'country',
 'pera']

In [66]:
for topic_number, topic in enumerate(LDA.components_):
  print(f'The top 10 words for topic #{topic_number}')
  print(
    [
      vectorizer.get_feature_names_out()[term_index] for term_index in topic.argsort()[-5:]
    ]
  )
  print("\n")

The top 10 words for topic #0
['davao', 'escape', 'mga', 'country', 'pera']


The top 10 words for topic #1
['say', 'ng', 'ang', 'sa', 'na']


The top 10 words for topic #2
['arrest', 'ong', 'say', 'philippine', 'guo']


The top 10 words for topic #3
['ceza', 'philippine', 'say', 'game', 'pogos']


The top 10 words for topic #4
['say', 'south', 'lucky', 'house', 'roque']




## Document-Topic Matrix

In [67]:
doc_topic_matrix = LDA.transform(doc_term_matrix)
pd.DataFrame(doc_topic_matrix)

Unnamed: 0,0,1,2,3,4
0,0.000480,0.425988,0.572560,0.000485,0.000487
1,0.000512,0.751948,0.246500,0.000518,0.000523
2,0.000286,0.649432,0.349704,0.000289,0.000290
3,0.000462,0.029132,0.780483,0.000473,0.189451
4,0.141633,0.003203,0.848616,0.003261,0.003286
...,...,...,...,...,...
206,0.012581,0.540756,0.012509,0.012521,0.421633
207,0.014601,0.014394,0.014566,0.941997,0.014442
208,0.594876,0.100789,0.102305,0.101387,0.100643
209,0.200000,0.200000,0.200000,0.200000,0.200000


In [70]:
cleaned_docs['Topic'] = doc_topic_matrix.argmax(axis=1)
cleaned_docs[['cleaned_text', 'source', 'Topic']]
cleaned_docs.to_excel('topic_documents.xlsx')

Unnamed: 0,cleaned_text,source,Topic
0,"manila, philippine – department justice (doj) ...",rappler,2
1,"clark freeport, philippine – lawyer notarize c...",rappler,1
2,"manila, philippine – dismiss bamban, tarlac ma...",rappler,1
3,"manila, philippine – embarrass lapse let dismi...",rappler,2
4,today’s headline – late news philippine around...,rappler,2
...,...,...,...
206,ul*l dignified. pinapakyuhan nga si bong daza ...,youtube,1
207,"vp rude, entitle arrogant like father they're ...",youtube,3
208,country this? interesting.,youtube,0
209,@@cvoutdoors9859palamunin,youtube,0


In [77]:
cleaned_docs.query("Topic == 1 and source == 'rappler'")['text']
cleaned_docs.query("Topic == 1 and source == 'youtube'")['text']

50     BAKIT masama bang magsinungaling ? Sino bang t...
53     Grabe nakalagpas sa immigration, ano yan hindi...
56     Mtakas na talaga yan. Ininsulto ng sobra. May ...
59         Cancer na talaga ang lagayan sa be Pilipinas.
60     yan ang napala mo miss\nminasama mo pa si VP S...
                             ...                        
191    Sa na ma ramdan ni Rodrigo dutuerte ngayon kun...
195    Ano?sabi ni Marcos BongBong si Alice marami si...
198              Nasa senate tinu tutor ng abogago😂😂😂😂😂😂
201    Dont trust b.i. they are the source of leaks t...
206    Ul*l dignified. Pinapakyuhan nga si Bong Daza ...
Name: text, Length: 86, dtype: object

## Visualize using PyLDAViz

In [78]:
vocab = vectorizer.get_feature_names_out()
doc_lengths = [
  len(doc) for doc in docs['cleaned_text']
]
term_freq = doc_term_matrix.sum(axis=0)
term_freqs = np.array(term_freq).flatten()
vis_data = pyLDAvis.prepare(topic_term_matrix, doc_topic_matrix, doc_lengths, vocab, term_freqs)

pyLDAvis.display(vis_data)