# Dataset 1: articles_dataset.csv

#### a. Import required libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
sys.path.append("../")

import os
from collections import Counter
from time import time

import gensim
import numpy as np
import pandas as pd
import re
import string

from ds_utils.clustering import vectorize, mbkmeans_clusters
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

from gensim.models import Word2Vec
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from gensim.models.fasttext import FastText

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### b. Data Preprocessing and Text Preprocessing

In [None]:
import pandas as pd
df = pd.read_csv("./drive/MyDrive/STBI/Proyek/Dataset/articles_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,0,reuters,Reuters,Reuters Editorial,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,https://www.reuters.com/article/us-tesla-crash...,https://s4.reutersmedia.net/resources/r/?m=02&...,2019-09-03T16:22:20Z,WASHINGTON (Reuters) - The National Transporta...,0.0,0.0,0.0,2528.0,0.0
1,1,the-irish-times,The Irish Times,Eoin Burke-Kennedy,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,https://www.irishtimes.com/business/economy/un...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T10:32:28Z,The States jobless rate fell to 5.2 per cent l...,0.0,6.0,10.0,2.0,0.0
2,2,the-irish-times,The Irish Times,Deirdre McQuillan,"Louise Kennedy AW2019: Long coats, sparkling t...",Autumn-winter collection features designer’s g...,https://www.irishtimes.com/\t\t\t\t\t\t\t/life...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T14:40:00Z,Louise Kennedy is showing off her autumn-winte...,1.0,,,,
3,3,al-jazeera-english,Al Jazeera English,Al Jazeera,North Korean footballer Han joins Italian gian...,Han is the first North Korean player in the Se...,https://www.aljazeera.com/news/2019/09/north-k...,https://www.aljazeera.com/mritems/Images/2019/...,2019-09-03T17:25:39Z,"Han Kwang Song, the first North Korean footbal...",0.0,0.0,0.0,7.0,0.0
4,4,bbc-news,BBC News,BBC News,UK government lawyer says proroguing parliamen...,"The UK government's lawyer, David Johnston arg...",https://www.bbc.co.uk/news/av/uk-scotland-4956...,https://ichef.bbci.co.uk/news/1024/branded_new...,2019-09-03T14:39:21Z,,0.0,0.0,0.0,0.0,0.0


In [None]:
df.shape

(10437, 15)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10437 entries, 0 to 10436
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       10437 non-null  int64  
 1   source_id                        10437 non-null  object 
 2   source_name                      10437 non-null  object 
 3   author                           9417 non-null   object 
 4   title                            10435 non-null  object 
 5   description                      10413 non-null  object 
 6   url                              10436 non-null  object 
 7   url_to_image                     9781 non-null   object 
 8   published_at                     10436 non-null  object 
 9   content                          9145 non-null   object 
 10  top_article                      10435 non-null  float64
 11  engagement_reaction_count        10319 non-null  float64
 12  engagement_comment

In [None]:
# drop atribut 
df = df.drop(['Unnamed: 0', 'source_id', 'title', 'description', 'source_name', 'author', 'url', 'url_to_image', 'published_at', 'top_article', 'engagement_reaction_count', 'engagement_comment_count', 'engagement_share_count', 'engagement_comment_plugin_count'], axis = 1)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10437 entries, 0 to 10436
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  9145 non-null   object
dtypes: object(1)
memory usage: 81.7+ KB


In [None]:
# Checking duplicate value
print(str(df.describe(include=object)))

                                                  content
count                                                9145
unique                                               8385
top     Chat with us in Facebook Messenger. Find out w...
freq                                                  125


In [None]:
# remove duplicate value (text) in contenct attribute
df.drop_duplicates(subset=['content'], keep='last')

Unnamed: 0,content
1,The States jobless rate fell to 5.2 per cent l...
2,Louise Kennedy is showing off her autumn-winte...
3,"Han Kwang Song, the first North Korean footbal..."
5,"""This Tender Land: a Novel"" (Atria Books), by ..."
6,LONDON (Reuters) - The European Union is waiti...
...,...
10429,For his latest installation of the SCAR Projec...
10431,There are numerous bargains waiting to be pick...
10432,Growth in the U.S. economys vast services sect...
10433,ZURICH/HONG KONG (Reuters) - The announcement ...


In [None]:
# Checking null value in attribute 
df.isna().sum()

content    1292
dtype: int64

In [None]:
#drop null value
df = df.dropna()

In [None]:
df

Unnamed: 0,content
0,WASHINGTON (Reuters) - The National Transporta...
1,The States jobless rate fell to 5.2 per cent l...
2,Louise Kennedy is showing off her autumn-winte...
3,"Han Kwang Song, the first North Korean footbal..."
5,"""This Tender Land: a Novel"" (Atria Books), by ..."
...,...
10428,"Just last week, the Disney Magic arrived to Ne..."
10429,For his latest installation of the SCAR Projec...
10431,There are numerous bargains waiting to be pick...
10432,Growth in the U.S. economys vast services sect...


In [None]:
# remove number
def remove_number(text):
    return  re.sub(r"\[0-9]+", "", str(text))
 
df = df['content'].apply(remove_number)

In [None]:
# remove punctuation
def punctuation(txt):
  return re.sub(r"[^\w\s]","", str(txt))

df = df.apply(punctuation)

In [None]:
import nltk
nltk.download('punkt')
# tokenization
def word_tokenize_wrapper(text):
  return word_tokenize(text)
df = df.apply(word_tokenize_wrapper)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#stopword removal
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw_nltk = stopwords.words('english')
print(sw_nltk)
def stopword(text):
  words = [word for word in text if word.lower() not in sw_nltk]
  return words
df = df.apply(stopword)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bo

In [None]:
#normalization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
def lemma(text):
  lemmatizer = WordNetLemmatizer()
  Output= [lemmatizer.lemmatize(words_sent) for words_sent in text]
  return Output
df = df.apply(lemma)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


#### D. Pemodelan dengan Word Embedding

##### 1. FastText

In [None]:
# vocabulary
docs = df.values
tokenized_docs = df.values
vocab = Counter()
for token in tokenized_docs:
    vocab.update(token)

len(vocab)

35843

In [None]:
vocab.most_common(10)

[('char', 8773),
 ('said', 1712),
 ('Reuters', 1309),
 ('year', 1099),
 ('US', 1090),
 ('new', 824),
 ('say', 734),
 ('Thursday', 711),
 ('President', 702),
 ('one', 646)]

In [None]:
model2 = FastText(sentences=tokenized_docs, size = 100, workers=1, seed=42)

In [None]:
# test model
model2.wv.most_similar("moon", topn=5)

[('typhoon', 0.9996442198753357),
 ('Typhoon', 0.9995806813240051),
 ('Pokémon', 0.9992265105247498),
 ('Gordon', 0.9989269971847534),
 ('Solomon', 0.9987795352935791)]

In [None]:
vectorized_docs = vectorize(df, model = model2, strategy="average")
len(vectorized_docs), len(vectorized_docs[0])

(9145, 100)

#### E. Generate and analyze clusters

In [None]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False):
    """Generate clusters.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [None]:
clustering, cluster_labels = mbkmeans_clusters(X=vectorized_docs, k=50, print_silhouette_values=True)
df_clusters = pd.DataFrame({
    "text": docs,
    "tokens": [" ".join(text) for text in tokenized_docs],
    "cluster": cluster_labels
})

For n_clusters = 50
Silhouette coefficient: 0.14
Inertia:572.8318026070413
Silhouette values:
    Cluster 2: Size:125 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 3: Size:89 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 31: Size:26 | Avg:0.95 | Min:0.34 | Max: 0.98
    Cluster 11: Size:69 | Avg:0.32 | Min:-0.05 | Max: 0.48
    Cluster 43: Size:86 | Avg:0.27 | Min:0.01 | Max: 0.46
    Cluster 30: Size:133 | Avg:0.23 | Min:-0.12 | Max: 0.46
    Cluster 26: Size:161 | Avg:0.22 | Min:-0.03 | Max: 0.43
    Cluster 25: Size:54 | Avg:0.20 | Min:-0.07 | Max: 0.50
    Cluster 12: Size:135 | Avg:0.20 | Min:-0.04 | Max: 0.40
    Cluster 4: Size:179 | Avg:0.19 | Min:-0.04 | Max: 0.39
    Cluster 6: Size:166 | Avg:0.18 | Min:-0.07 | Max: 0.42
    Cluster 13: Size:141 | Avg:0.16 | Min:-0.07 | Max: 0.37
    Cluster 23: Size:361 | Avg:0.15 | Min:0.01 | Max: 0.34
    Cluster 15: Size:309 | Avg:0.15 | Min:-0.02 | Max: 0.36
    Cluster 19: Size:127 | Avg:0.15 | Min:-0.07 | Max: 0.37
    Cluster 33: Size

In [None]:
print("Top terms per cluster (based on centroids):")
for i in range(50):
    tokens_per_cluster = ""
    most_representative = model2.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=5)
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: performance closely overturn ORourke Rhode 
Cluster 1: try aft due digital Kickoff 
Cluster 2: whats Find Messenger Facebooks Facebook 
Cluster 3: bulletin Jews bullet BBC news 
Cluster 4: prestigious Door aide denial Ivanka 
Cluster 5: Cyril caucus cash carbon emerge 
Cluster 6: Brazils spokesperson sponsor Varadkar supreme 
Cluster 7: chemical responsible trajectory effect draw 
Cluster 8: migrant follow Hollywood semiautonomous jewelry 
Cluster 9: Chile heat pace space explore 
Cluster 10: safely parallel attitude outdoor malicious 
Cluster 11: democratic impeach forum NOAA admit 
Cluster 12: hedge Feige cartridge edge capacity 
Cluster 13: tennis terrorism exporter Brett culprit 
Cluster 14: recovery record earthquake perfect read 
Cluster 15: controversial Sudanese entrepreneur responsibility architect 
Cluster 16: shrank lockdown smart barely enough 
Cluster 17: Rhode simultaneously draft Powell alcohol 
Cluster 18: displayed

In [None]:

test_cluster = 48
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
for d in most_representative_docs[:20]:
    print(docs[d])
    print("-------------")

['Two', 'highlight', 'attending', 'Mickeys', 'NotSoScary', 'Halloween', 'Party', 'family', 'always', 'grabbing', 'sidewalk', 'seat', 'Mickeys', 'BooToYou', 'Halloween', 'Parade', 'watching', 'party', 'exclusive', 'firework', 'show', 'year', 'spooky', 'new', 'eleme', '1196', 'char']
-------------
['Loog', 'Guitars', '20', 'year', 'ago', 'ambition', 'learn', 'play', 'guitar', 'petered', 'somewhere', 'learning', 'E', 'minor', 'G', 'C', 'mean', 'could', 'theory', 'play', 'number', 'pop', 'song', 'James', 'Blunt', 'U2', 'spring', 'mind', 'use', 'four', '381', 'char']
-------------
['PARISGoogle', 'paying', '1', 'billion', 'fine', 'back', 'tax', 'settle', 'pair', 'tax', 'dispute', 'France', 'faced', 'year', 'investigation', 'whether', 'properly', 'declared', 'activity', 'country', 'Alphabet', 'Inc', 'GOOGL', '13', '772', 'char']
-------------
['aware', 'ruleI', 'dont', 'agree', 'Crazy', 'Unfortunately', 'rule', 'applied', 'universally', 'many', 'athlete', 'missed', '3', 'test', '12', 'month'

In [None]:
docs

array([list(['WASHINGTON', 'Reuters', 'National', 'Transportation', 'Safety', 'Board', 'said', 'Tuesday', 'Tesla', 'Model', 'Autopilot', 'mode', 'struck', 'fire', 'truck', 'Culver', 'City', 'California', 'one', 'series', 'crash', 'board', 'investigating', 'involving', 'Teslas', 'driver', 'assistance', '478', 'char']),
       list(['States', 'jobless', 'rate', 'fell', '52', 'per', 'cent', 'last', 'month', 'according', 'latest', 'official', 'figure', 'higher', 'previously', 'reported', 'account', 'upward', 'revision', 'Central', 'Statistics', 'Office', 'CSO', 'one', 'several', 'last', 'two', 'year', 'Nonethe', '800', 'char']),
       list(['Louise', 'Kennedy', 'showing', 'autumnwinter', 'collection', 'Dublin', 'tonight', 'show', 'start', 'glittering', 'new', 'take', 'Black', 'Watch', 'tartan', 'Sixteen', 'model', 'present', '60', 'ensemble', '200', 'guest', 'including', 'designer', 'close', 'frie', '2156', 'char']),
       ...,
       list(['numerous', 'bargain', 'waiting', 'picked', 'st