# IMPORTING PACKAGES / LIBRARIES

In [None]:
import re
import nltk
import pkg_resources
import pickle 
import hdbscan
import pandas as pd
import umap.umap_ as umap
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import string

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, util
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from symspellpy import SymSpell, Verbosity
from nltk.tokenize import word_tokenize
from language_detector import detect_language
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from collections import Counter
from stop_words import get_stop_words

# DEFINING FUNCTIONS

In [None]:
def remove_punctuations(txt, punct=string.punctuation):
    """
    Return ``txt`` with every character that occurs in ``punct`` dropped.

    ``punct`` defaults to ``string.punctuation``; the remaining characters
    are joined back together in their original order.
    """
    kept_chars = (char for char in txt if char not in punct)
    return "".join(kept_chars)

_CUSTOM_STOPWORDS_CACHE = {}


def _load_stopword_set(custom_path="custom_stopwords.txt"):
    """
    Return the NLTK English stopwords merged with the words listed (one per
    line) in ``custom_path``. The set is built once per path and then reused.
    """
    if custom_path not in _CUSTOM_STOPWORDS_CACHE:
        # The context manager closes the file; the previous inline open()
        # left the handle to be reclaimed by the garbage collector.
        with open(custom_path, "r", encoding="utf-8") as stopword_file:
            custom_words = {line.strip() for line in stopword_file}
        custom_words.discard("")
        _CUSTOM_STOPWORDS_CACHE[custom_path] = (
            set(stopwords.words("english")) | custom_words
        )
    return _CUSTOM_STOPWORDS_CACHE[custom_path]


def word_preprocessing(raw_text):
    """
    Normalize a text and remove stopwords.

    Steps: line breaks become spaces and apostrophes are removed, punctuation
    is stripped with ``remove_punctuations``, every remaining character
    outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a space, the text is
    lower-cased and split on whitespace, and words found in the stopword set
    (NLTK English + ``custom_stopwords.txt``) are dropped.

    Note that digits are kept; only stopwords and non-alphanumeric characters
    are removed.

    Parameters:
        raw_text (str): the text to clean.

    Returns:
        str: the remaining words joined by single spaces (may be empty).

    Raises:
        FileNotFoundError: if ``custom_stopwords.txt`` cannot be opened.
    """
    raw_text = raw_text.replace("\n", " ").replace("\r", " ").replace("'", "")
    raw_text = remove_punctuations(raw_text)
    letters_only_text = re.sub(r"[^a-zA-Z0-9]", " ", raw_text)
    words = letters_only_text.lower().split()
    # Loaded once and cached: this function is called once per document.
    stopword_set = _load_stopword_set()
    cleaned_words = [w for w in words if w not in stopword_set]
    return " ".join(cleaned_words)

def save_as_pckl(obj, name):
    """
    Pickle ``obj`` into the file at path ``name``.

    The file is opened in binary write mode, so any existing content at that
    path is replaced.
    """
    with open(name, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file)

def load_pickle(name):
    """
    Load and return the object pickled in the file at path ``name``.

    Parameters:
        name: path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open(name, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def tSNE_dim_reductor(sentence_embeddings, labels=None, sentences=None):
    """
    Project embeddings to two dimensions with t-SNE and return a DataFrame.

    Parameters:
        sentence_embeddings: data passed to ``TSNE(n_components=2).fit_transform``.
        labels: optional values stored in a ``label`` column.
        sentences: optional values stored in a ``text`` column; skipped when
            ``None`` or empty.

    Returns:
        pandas.DataFrame with columns ``x`` and ``y`` plus ``label`` / ``text``
        when the corresponding argument is supplied.
    """
    X_embedded = TSNE(n_components=2).fit_transform(sentence_embeddings)
    df_embeddings = pd.DataFrame(X_embedded).rename(columns={0: 'x', 1: 'y'})
    # Identity test: `labels != None` is evaluated element-wise for arrays and
    # Series, which makes the `if` raise "truth value is ambiguous".
    if labels is not None:
        df_embeddings = df_embeddings.assign(label=labels)
    # Guard against the default None, which has no `.empty` attribute.
    if sentences is not None and len(sentences) > 0:
        df_embeddings = df_embeddings.assign(text=sentences)
    return df_embeddings

def umap_dim_reductor(sentence_embeddings, n_neighbors):
    """
    Reduce ``sentence_embeddings`` to two components with UMAP.

    The reducer is configured with the given ``n_neighbors``, ``min_dist=0.0``
    and the cosine metric; the result of ``fit_transform`` is returned.
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=2,
        min_dist=0.0,
        metric='cosine',
    )
    return reducer.fit_transform(sentence_embeddings)


def hdbscan_clusterer(dim_reduced_embeddings, min_cluster_size):
    """
    Fit HDBSCAN on ``dim_reduced_embeddings`` and return the result of ``fit``.

    The estimator uses the given ``min_cluster_size``, the euclidean metric
    and the 'eom' cluster selection method.
    """
    settings = {
        "min_cluster_size": min_cluster_size,
        "metric": 'euclidean',
        "cluster_selection_method": 'eom',
    }
    estimator = hdbscan.HDBSCAN(**settings)
    return estimator.fit(dim_reduced_embeddings)

def visualize_raw_df(dataframe, name_title):
    """
    Show a Plotly scatter plot of the ``x`` / ``y`` columns of ``dataframe``.

    The ``text`` column is included in the hover data and ``name_title`` is
    used as the figure title. Nothing is returned.
    """
    scatter_options = dict(x='x', y='y', hover_data=['text'], title=name_title)
    px.scatter(dataframe, **scatter_options).show()
    
def visualize_clustered_df(dataframe, name_title):
    """
    Show a Plotly scatter plot of clustered points.

    Points are placed by the ``x`` / ``y`` columns; the ``labels`` column
    drives both marker colour and marker symbol, and ``text`` is included in
    the hover data. The colour scale is hidden. Nothing is returned.

    Parameters:
        dataframe: table with ``x``, ``y``, ``labels`` and ``text`` columns.
        name_title: title of the figure.
    """
    fig = px.scatter(
        dataframe,
        x='x',
        y='y',
        color='labels',
        symbol='labels',
        hover_data=['text'],
        title=name_title,
        # Display name for the "labels" column; the dict literal previously
        # listed this key twice, which is equivalent to a single entry.
        labels={"labels": "Label"},
    )
    fig.update_coloraxes(showscale=False)
    fig.show()

def get_number_of_clusters(clustered_df):
    """
    Count how many items carry each cluster label.

    Parameters:
        clustered_df: iterable of cluster labels; ``-1`` is reported as
            "Outliers", any other value ``v`` as ``"Label_v"``.

    Returns:
        tuple ``(counts, clusters)`` where ``counts`` is a dict mapping label
        name to number of items, ordered from most to least frequent, and
        ``clusters`` is the number of distinct non-outlier labels.
    """
    counts = Counter(
        "Outliers" if value == -1 else f"Label_{value}" for value in clustered_df
    )
    _labels = dict(counts.most_common())
    # Count only real clusters. Subtracting 1 unconditionally was off by one
    # whenever no outlier label was present (and gave -1 for empty input).
    clusters = sum(1 for name in _labels if name != "Outliers")
    return _labels, clusters


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """
    Compute a TF-IDF style score for every (term, document) pair.

    A ``CountVectorizer`` (English stop words, given ``ngram_range``) is fitted
    on ``documents``. Each term count is divided by the total count of its
    document, then multiplied by ``log(m / total count of the term over all
    documents)``.

    Returns:
        tuple ``(tf_idf, count)``: the score array with one row per term and
        one column per document, and the fitted vectorizer.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    term_counts = count.transform(documents).toarray()
    tokens_per_document = term_counts.sum(axis=1)
    occurrences_per_term = term_counts.sum(axis=0)
    # Transposed so that rows are terms and columns are documents.
    tf = term_counts.T / tokens_per_document
    idf = np.log(m / occurrences_per_term).reshape(-1, 1)
    return tf * idf, count

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """
    Pick the ``n`` highest-scoring words for every topic.

    Parameters:
        tf_idf: 2-D score array with one row per word and one column per topic.
        count: fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` (or the legacy ``get_feature_names()``).
        docs_per_topic: object whose ``Topic`` attribute lists the topic
            labels in the same order as the columns of ``tf_idf``.
        n: number of words to keep per topic.

    Returns:
        dict mapping each topic label to a list of ``(word, score)`` tuples,
        highest score first.
    """
    # get_feature_names() is absent from recent scikit-learn releases; prefer
    # the replacement and fall back for older installations.
    if hasattr(count, "get_feature_names_out"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    # argsort is ascending, so the last n columns hold the best-scoring words.
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {
        label: [(str(words[j]), tf_idf_transposed[i][j]) for j in indices[i][::-1]]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """
    Return a table with the number of ``text`` entries per ``Topic``.

    The result has the columns ``Topic`` and ``Size`` and is sorted by
    ``Size`` in descending order.
    """
    texts_per_topic = df.groupby(['Topic'])['text'].count()
    size_table = texts_per_topic.reset_index().rename(columns={"text": "Size"})
    return size_table.sort_values(by="Size", ascending=False)

def generate_word_cloud(top_n_words, topic, i):
    """
    Draw a word cloud for one topic on a new matplotlib figure.

    ``top_n_words[topic]`` must be an iterable of ``(word, weight)`` pairs;
    the weights are passed to ``WordCloud.fit_words``. The figure title is
    "Outliers" for topic ``-1`` and "Label: <topic>" otherwise. The ``i``
    argument is accepted but not used.
    """
    frequencies = dict(top_n_words[topic])
    cloud_options = dict(
        width=800,
        height=560,
        background_color='black',
        collocations=False,
        min_font_size=10,
    )
    wordcloud = WordCloud(**cloud_options).fit_words(frequencies)
    if topic == -1:
        title = "Outliers"
    else:
        title = f"Label: {topic}"
    fig = plt.figure(figsize=(4, 2.8), facecolor=None)
    fig.suptitle(title, verticalalignment="baseline")
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

# LOADING DATASET

In [None]:
# Read the raw news-article CSV into `df`, then report its shape and
# display the first ten rows.
df = pd.read_csv("tagalog_datasets/tagalog_newspapers/raw_news_articles_136k.csv")
print(f"DATASET SHAPE: {df.shape}")
df.head(10)

# PREPROCESSING

Remove the unnecessary `Unnamed: 0` column and any rows with empty values.

In [None]:
# Drop the 'Unnamed: 0' column in place.
# NOTE(review): presumably a leftover index column from an earlier to_csv — confirm.
df.drop('Unnamed: 0', inplace=True, axis=1)
# Remove every row that contains a missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)
print(f"DATASET SHAPE: {df.shape}")
df.head()

Remove punctuation and stopwords, then write the result to a new column.

In [None]:
# Apply word_preprocessing to every entry of the Text column and store the
# cleaned strings in a new "Processed" column.
preprocessed_sentences = [word_preprocessing(text) for text in df.Text]
df["Processed"] = preprocessed_sentences
print(f"DATASET SHAPE: {df.shape}")
df.head()

Remove duplicate rows and rows whose processed text is an empty string.

In [None]:
# drop_duplicates() returns a new DataFrame; without the assignment the
# duplicates were never actually removed.
df = df.drop_duplicates()
# Keep only rows whose processed text is not the empty string.
df = df[df.Processed != ""]
print(f"DATASET SHAPE: {df.shape}")
df.head()

Reset index of the cleaned dataframe.

In [None]:
# Renumber the rows from 0 after filtering. drop=True discards the old index
# directly instead of materialising it as an 'index' column and deleting it.
df = df.reset_index(drop=True)
print(f"DATASET SHAPE: {df.shape}")
df.head()

Saving dataframe to csv file for future usage.

In [None]:
# df.to_csv("tagalog_datasets/tagalog_newspapers/preprocessed_news_articles_X.csv")

Loading dataframe from csv file.

In [None]:
# Replace `df` with the previously saved preprocessed dataset and drop the
# 'Unnamed: 0' column that comes with the CSV.
df = pd.read_csv("tagalog_datasets/tagalog_newspapers/preprocessed_news_articles_136k.csv")
df.drop('Unnamed: 0', inplace=True, axis=1)
print(f"DATASET SHAPE: {df.shape}")
df.head()

Create corpus from dataframe.

In [None]:
# raw_corpus: the original article text; corpus: the cleaned text produced by
# the preprocessing step. Both are columns of `df`.
raw_corpus = df.Text
corpus = df.Processed
corpus

# EXTRACTING SENTENCE EMBEDDINGS

DOWNLOADING PRETRAINED MODEL:

* Model Name: roberta-tagalog-base
* Source: https://huggingface.co/jcblaise/roberta-tagalog-base
* By: Jan Christian Blaise Cruz - https://blaisecruz.com

In [None]:
# pretrained_model_name = "jcblaise/roberta-tagalog-base"
# model = SentenceTransformer(pretrained_model_name)
# save_as_pckl(model, "model/roberta-tagalog-model.pkl")

Loading model from pickle file.

In [None]:
# Load the pickled model object from disk.
# NOTE(review): the commented-out download cell suggests this is a
# SentenceTransformer built from "jcblaise/roberta-tagalog-base" — confirm.
model = load_pickle("model/roberta-tagalog-model.pkl")

Extracting the corpus sentence embeddings using the pretrained model and saving them to a pickle file for future usage.

In [None]:
# sentence_embeddings = model.encode(corpus, show_progress_bar=True)
# save_as_pckl(sentence_embeddings, "tagalog_datasets/tagalog_newspapers/sentence_embeddings_preprocessed_136k.pkl")

Loading the sentence embeddings from pickle file.

In [None]:
# Load the precomputed sentence embeddings from the pickle file and print
# their shape and contents.
sentence_embeddings = load_pickle("tagalog_datasets/tagalog_newspapers/sentence_embeddings_preprocessed_136k.pkl")
print(f"Sentence Emeddings Shape: {sentence_embeddings.shape}")
print(f"Embeddings Array: {sentence_embeddings}")

# DIMENSIONALITY REDUCTION

Reducing dimensions using Uniform Manifold Approximation and Projection (UMAP).

In [None]:
# Project the sentence embeddings to two dimensions with UMAP (30 neighbours).
umap_embeddings = umap_dim_reductor(sentence_embeddings, n_neighbors=30)
print(f"Sentence Emeddings Shape (reduced dimensions): {umap_embeddings.shape}")

Creating a DataFrame from the dimension-reduced sentence embeddings.

In [None]:
# Wrap the 2-D coordinates in a DataFrame and attach the raw article text so
# it can be shown on hover.
umap_df = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
# NOTE(review): raw_corpus is a Series, so this assignment presumably aligns
# on index; assumes `df` has a default 0..n-1 index — confirm.
umap_df['text'] = raw_corpus
umap_df.head()

# RAW DATA VISUALIZATION

Visualize unclustered sentence embeddings.

In [None]:
# Scatter plot of the reduced embeddings before any clustering.
visualize_raw_df(umap_df, "Tagalog News Articles")

# CLUSTERING USING HDBSCAN

Create clusters using Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN).

In [None]:
# Cluster the 2-D UMAP coordinates (minimum cluster size 450) and collect the
# coordinates, raw text and assigned label of every article in one table.
clusterer = hdbscan_clusterer(umap_embeddings, min_cluster_size=450)
clustered_result = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
clustered_result['text'] = raw_corpus
clustered_result['labels'] = clusterer.labels_
clustered_result.head()

# CLUSTERED DATA VISUALIZATION

In [None]:
# Summarise the clustering: number of clusters and items per label, then plot
# every point (label -1, reported as "Outliers", included).
labels, clusters =  get_number_of_clusters(clustered_result.labels)
print(f"\nThere are a total of '{clusters}' clusters generated.\n")
print('Setence Distribution per Clusters:\n')
print(labels)
visualize_clustered_df(clustered_result, "Tagalog News Articles (clustered with outliers)")

In [None]:
# Rows labelled -1 are treated as outliers; plot them on their own.
outliers = clustered_result[clustered_result.labels == -1]
visualize_clustered_df(outliers, "Tagalog News Articles (outliers)")

# REMOVING OUTLIERS

In [None]:
# Keep only the rows that were assigned to a cluster (label other than -1)
# and plot them.
without_outliers = clustered_result[clustered_result.labels != -1]
visualize_clustered_df(without_outliers, "Tagalog News Paper (clustered without outliers)")

# TF-IDF OF TOP WORDS

In [None]:
# Same layout as clustered_result, but the text column holds the preprocessed
# corpus instead of the raw articles.
_clustered_result = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
_clustered_result['text'] = corpus
_clustered_result['labels'] = clusterer.labels_

In [None]:
# Build the per-document table: processed text, its cluster label as "Topic",
# and a running document id.
docs_df = pd.DataFrame(_clustered_result, columns=["text"])
docs_df['Topic'] = clusterer.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_df.head()

In [None]:
# Concatenate all texts of a topic into one space-separated string, giving
# one row per topic.
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'text': ' '.join})
docs_per_topic.head()

In [None]:
# Score the words of each topic document; m is the number of rows in
# clustered_result.
tf_idf, count = c_tf_idf(docs_per_topic.text.values, m=len(clustered_result))

In [None]:
# Top 50 scored words per topic, plus the number of documents per topic
# (largest topic first).
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=50)
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes

In [None]:
# Draw one word cloud per topic, in the order of topic_sizes. enumerate
# supplies the running position passed as the third argument.
for position, topic in enumerate(topic_sizes.Topic):
    generate_word_cloud(top_n_words, topic, position)

# CLUSTERING THE OUTLIERS

In [None]:
# Copy the text of the outlier rows into a fresh single-column DataFrame.
# Going through list() discards the original row index. The text here is the
# raw article text, since clustered_result['text'] was filled from raw_corpus.
outliers_df = pd.DataFrame(list(outliers.text),  columns=['text'])
outliers_df

In [None]:
# Compute embeddings for the outlier texts with the loaded model.
# NOTE(review): these are the raw texts, while the file name of the
# first-stage embeddings suggests they came from the preprocessed corpus —
# confirm the mismatch is intended.
outliers_embeddings = model.encode(outliers_df.text)

In [None]:
# Display the shape of the outlier embeddings.
outliers_embeddings.shape

In [None]:
# Reduce the outlier embeddings to 2-D with UMAP (20 neighbours) and attach
# the outlier texts to the coordinates.
umap_outlier_embeddings = umap_dim_reductor(outliers_embeddings, n_neighbors=20)
umap_outlier_df = pd.DataFrame(umap_outlier_embeddings, columns=['x', 'y'])
umap_outlier_df['text'] = outliers_df.text

In [None]:
# Scatter plot of the reduced outlier embeddings before re-clustering.
visualize_raw_df(umap_outlier_df, "Outliers Raw")

In [None]:
# Cluster the outliers on their own with a smaller minimum cluster size (30).
# NOTE(review): this rebinds the global `clusterer` used by the earlier
# TF-IDF cells; re-running those cells afterwards would pick up the outlier
# labels instead of the first-stage ones.
clusterer = hdbscan_clusterer(umap_outlier_embeddings, min_cluster_size=30)
clustered_outliers_result = pd.DataFrame(umap_outlier_embeddings, columns=['x', 'y'])
clustered_outliers_result['text'] = outliers_df.text
clustered_outliers_result['labels'] = clusterer.labels_
clustered_outliers_result.head()

In [None]:
# Summarise and plot the re-clustering of the outliers.
o_labels, o_clusters = get_number_of_clusters(clustered_outliers_result.labels)
# Report the count computed for the outliers (o_clusters); the first-stage
# `clusters` value was printed here by mistake.
print(f"\nClusters: {o_clusters}\n")
print('Setence Distribution per Clusters:\n')
print(o_labels)
visualize_clustered_df(clustered_outliers_result, "Outliers Clustered")