In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook reads. Packages that are already
# installed are only reported as up to date.
# 'stopwords' backs the stopwords.words('english') call further down.
nltk.download('stopwords')
# NOTE(review): 'omw-1.4' (Open Multilingual Wordnet) is presumably required by
# the WordNet lemmatizer in the installed NLTK version — confirm.
nltk.download('omw-1.4')
# 'wordnet' is the corpus WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Noah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Noah\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Noah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load both corpora; the first CSV column is used as the row index.
rappler_docs = pd.read_csv('oilspill-rappler.csv', index_col=0)
youtube_docs = pd.read_csv('oilspill-comments.csv', index_col=0)

# Give both frames the same treatment: parse the publication date
# (unparseable values become NaT) and record which corpus the row came from.
for frame, origin in ((rappler_docs, 'rappler'), (youtube_docs, 'youtube')):
  frame['date_published'] = pd.to_datetime(frame['date_published'], errors='coerce')
  frame['source'] = origin

# Stack the two corpora into a single frame with a fresh 0..n-1 index.
docs = pd.concat([rappler_docs, youtube_docs], axis=0, ignore_index=True)

In [3]:
# English stopword list from NLTK's stopwords corpus; clean_docs filters against it.
eng_stopwords = stopwords.words('english')
# One WordNet lemmatizer instance, shared by every clean_docs call.
lemmatizer = WordNetLemmatizer()

def clean_docs(docs, stop_words=None, word_lemmatizer=None):
  """Add a normalized ``cleaned_text`` column derived from ``docs['text']``.

  Each text is lowercased; @usernames, links, and every character that is
  not an ASCII letter, digit, underscore, space, or tab are removed;
  stopwords are dropped; and each remaining word is lemmatized as a verb,
  then as an adjective, then as a noun. Words are re-joined with single
  spaces, so the result has no leading or trailing whitespace.

  Parameters
  ----------
  docs : pandas.DataFrame
      Frame with a ``text`` column. It is modified in place (other cells
      read ``cleaned_text`` from the original frame) and also returned.
      Missing ``text`` values are treated as empty strings.
  stop_words : iterable of str, optional
      Words to drop after lowercasing. Defaults to the module-level
      ``eng_stopwords``.
  word_lemmatizer : object with ``lemmatize(word, pos)``, optional
      Defaults to the module-level ``lemmatizer``.

  Returns
  -------
  pandas.DataFrame
      The same ``docs`` object, with ``cleaned_text`` added or overwritten.
  """
  # A set gives constant-time membership checks for the per-word filter.
  stop_set = set(eng_stopwords if stop_words is None else stop_words)
  lem = lemmatizer if word_lemmatizer is None else word_lemmatizer

  def _normalize(line):
    # Drop stopwords first, then run every surviving word through the
    # verb -> adjective -> noun lemmatization chain.
    lemmas = []
    for word in line.split():
      if word in stop_set:
        continue
      for pos in ('v', 'a', 'n'):
        word = lem.lemmatize(word, pos)
      lemmas.append(word)
    return ' '.join(lemmas)

  docs['cleaned_text'] = (
    docs['text']
    .fillna('')   # missing comments would otherwise crash on .split()
    .astype(str)
    .str.lower()
    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats
    # the pattern as a literal string by default, which would remove nothing.
    .str.replace(r'(@[A-Za-z0-9_]+)|([^A-Za-z0-9_ \t])|(\w+:\/\/\S+)', '', regex=True)
    .apply(_normalize)
  )

  return docs

In [4]:
# clean_docs adds `cleaned_text` to the frame it receives and returns that same
# frame, so each cleaned_* name refers to the same object as its *_docs input.
cleaned_rappler, cleaned_youtube = (
    clean_docs(frame) for frame in (rappler_docs, youtube_docs)
)

In [18]:
# NOTE(review): these imports sit mid-notebook; on a fresh kernel this cell
# has to run before any of the modelling cells below.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import pyLDAvis
# NOTE(review): nothing visible in this notebook uses pyLDAvis.gensim, and
# recent pyLDAvis releases ship that adapter as pyLDAvis.gensim_models —
# confirm this import still resolves in the target environment.
import pyLDAvis.gensim 
# Switch pyLDAvis to inline rendering inside the notebook.
pyLDAvis.enable_notebook()

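## Document-term matrix

Configure the stop-word list and the `CountVectorizer` that turn each corpus's cleaned text into a document-term matrix.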

In [23]:
# Extra stopwords layered on top of scikit-learn's English list.
# NOTE(review): these look like Tagalog function words and chat shorthand plus
# terms tied to the story itself (vessel names, places, weekdays, months) —
# confirm the intent with the author before pruning or extending.
# Numeric tokens ('19', '22', '28', '31') and 'thousand' were previously
# commented out and are deliberately not part of the set.
add_stopwords = set("""
    mga yan ang ng po na sa yung lang naman
    ako pa eh si sila mo hindi kayo dyan jan
    mag pag nag ung nga para kung nang nyo di
    kasi wala din ba ka ko nyan ay lng rin
    ano sino bakit yang hahaha dahil nila dapat walang pero
    oil langis spill say baka manila bay ship barko man
    ni wag diyan yun niyo dito tanker philippine sink nmn
    nman july august 000 day mt terranova mv mirola mtkr
    jason bradley monday tuesday wednesday thursday friday saturday sunday kaya
    ito kc vessel pcg coast guard parang cavite bataan mariveles
    cabcaben
""".split())

# CountVectorizer takes the stop words as a list; element order is irrelevant.
stop_words = list(text.ENGLISH_STOP_WORDS | add_stopwords)

vectorizer = CountVectorizer(
    stop_words=stop_words,  # scikit-learn's English list plus the extras above
    min_df=2,  # keep only terms that occur in at least 2 documents
    max_df=0.95,  # drop terms that occur in more than 95% of the documents
)

In [21]:
# Bag-of-words counts for the Rappler corpus (rows = documents, columns = terms).
# fit_transform refits the shared vectorizer, so its vocabulary now belongs to
# this corpus only.
dt_matrix_rappler = vectorizer.fit_transform(
    cleaned_rappler['cleaned_text']
)

LDA = LatentDirichletAllocation(
    n_components=5,
    random_state=100
)
LDA.fit(dt_matrix_rappler)

# components_ holds unnormalized topic-word weights (pseudo-counts).
topic_term_matrix = LDA.components_
doc_topic_matrix = LDA.transform(dt_matrix_rappler)

# Look the vocabulary up once instead of once per printed word.
vocab = vectorizer.get_feature_names_out()

# argsort is ascending, so each printed list runs from the 10th-ranked word
# up to the highest-weighted word of the topic.
for index, topic in enumerate(topic_term_matrix):
  print(f"The top 10 words for topic #{index}: ")
  print(
      [vocab[term_id] for term_id in topic.argsort()[-10:]]
  )

# pyLDAvis.prepare expects topic-term *probabilities*; rows that sum to more
# than 1 pass its validation silently and distort the plot, so normalize here.
topic_term_dists = topic_term_matrix / topic_term_matrix.sum(axis=1, keepdims=True)

# Document length = number of counted tokens per document, taken from the same
# matrix the model was fit on (not the character length of the cleaned string).
doc_lengths = np.asarray(dt_matrix_rappler.sum(axis=1)).ravel()
term_freq = dt_matrix_rappler.sum(axis=0)
term_freqs = np.asarray(term_freq).ravel()
vis_data = pyLDAvis.prepare(topic_term_dists, doc_topic_matrix, doc_lengths, vocab, term_freqs)

pyLDAvis.display(vis_data)

The top 10 words for topic #0: 
['operation', 'justice', 'bureau', 'paihi', 'fish', 'report', 'liter', 'smuggle', 'remulla', 'fuel']
The top 10 words for topic #1: 
['liter', 'operation', 'company', 'cargo', 'harbor', 'star', 'industrial', 'balilo', 'siphon', 'fuel']
The top 10 words for topic #2: 
['cost', 'year', 'leak', 'coastal', 'result', 'environment', 'affect', 'senate', 'project', 'area']
The top 10 words for topic #3: 
['fishery', 'affect', 'consumption', 'week', 'shellfish', 'siphon', 'bfar', 'fisherfolk', 'compensation', 'fish']
The top 10 words for topic #4: 
['corporation', 'salvor', 'fuel', 'harbor', 'carry', 'water', 'liter', 'siphon', 'shogun', 'company']


In [24]:
# Bag-of-words counts for the YouTube comments (rows = documents, columns = terms).
# fit_transform refits the shared vectorizer, so its vocabulary now belongs to
# this corpus only.
dt_matrix_youtube = vectorizer.fit_transform(
    cleaned_youtube['cleaned_text']
)

LDA = LatentDirichletAllocation(
    n_components=5,
    random_state=42
)
LDA.fit(dt_matrix_youtube)

# components_ holds unnormalized topic-word weights (pseudo-counts).
topic_term_matrix = LDA.components_
doc_topic_matrix = LDA.transform(dt_matrix_youtube)

# Look the vocabulary up once instead of once per printed word.
vocab = vectorizer.get_feature_names_out()

# argsort is ascending, so each printed list runs from the 10th-ranked word
# up to the highest-weighted word of the topic.
for index, topic in enumerate(topic_term_matrix):
  print(f"The top 10 words for topic #{index}: ")
  print(
      [vocab[term_id] for term_id in topic.argsort()[-10:]]
  )

# pyLDAvis.prepare expects topic-term *probabilities*; rows that sum to more
# than 1 pass its validation silently and distort the plot, so normalize here.
topic_term_dists = topic_term_matrix / topic_term_matrix.sum(axis=1, keepdims=True)

# Document length = number of counted tokens per document, taken from the same
# matrix the model was fit on (not the character length of the cleaned string).
doc_lengths = np.asarray(dt_matrix_youtube.sum(axis=1)).ravel()
term_freq = dt_matrix_youtube.sum(axis=0)
term_freqs = np.asarray(term_freq).ravel()
vis_data = pyLDAvis.prepare(topic_term_dists, doc_topic_matrix, doc_lengths, vocab, term_freqs)

pyLDAvis.display(vis_data)

The top 10 words for topic #0: 
['owner', 'natin', 'pilipinas', 'kawawa', 'ari', 'sana', 'talaga', 'tao', 'dagat', 'mangingisda']
The top 10 words for topic #1: 
['ngayon', 'damage', 'tsk', 'basura', 'sabotage', 'sabay', 'lumubog', 'sabotahe', 'karma', 'sinadya']
The top 10 words for topic #2: 
['use', 'good', 'pinayagan', 'ari', 'like', 'isda', 'pera', 'make', 'bagyo', 'alam']
The top 10 words for topic #3: 
['nuclear', 'alam', 'talaga', 'lahat', 'country', 'sana', 'miguel', 'plant', 'tao', 'san']
The top 10 words for topic #4: 
['dagat', 'news', 'bansa', 'lahat', 'student', 'gobyerno', 'percent', 'government', 'company', 'china']


In [19]:
# Re-renders whichever `vis_data` was built most recently; when the cells above
# run top to bottom, that is the YouTube visualization.
pyLDAvis.display(vis_data)