In [1]:
import numpy as np
import umap

from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from dataset.dataset import Dataset
from sklearn.preprocessing import StandardScaler
from bertopic import BERTopic
from sklearn.ensemble import RandomForestClassifier
from constants import CLEANED_DATASET_PATH

In [2]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[10000, 4232, 4232])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


## Baseline Bertopic

In [56]:
#Create Model
s_bert = BERTopic(calculate_probabilities=True) #true for soft clustering multiple topics per documents, false for one topic per document

In [57]:
#train model on X
topics, probs = s_bert.fit_transform(X_train) 

In [58]:
datapoints, features = np.shape(probs)
print("Number of rows: ", datapoints)
print("Number of Features(topics): ", features)

Number of rows:  10000
Number of Features(topics):  158


In [59]:
s_bert.get_topic_info() #Hard clustering name = topics, representation = keywords, representative docs = docs with that topic

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3579,-1_binay_poe_mo_si,"[binay, poe, mo, si, duterte, kay, mar, roxas,...","[poe duterte go grace poe, rt kay binay, si bi..."
1,0,817,0_mar_roxas_si_pag,"[mar, roxas, si, pag, matuwid, nanalo, daang, ...","[wag lang si mar roxas, wag lang si mar roxas,..."
2,1,337,1_vp_vice_mayor_presidente,"[vp, vice, mayor, presidente, president, jejom...",[puro makati pinagmamalaki binay nga yung naga...
3,2,242,2_poe_grace_si_manalo,"[poe, grace, si, manalo, duterte, lang, sen, w...",[sec mar mag give way kay sen grace kasi mas c...
4,3,162,3_vote_mar_presidential_need,"[vote, mar, presidential, need, election, win,...","[mar roxas need ofw vote, mar roxas need ofw v..."
...,...,...,...,...,...
154,153,11,153_dutertetilltheend_uneducated_ider_passbooks,"[dutertetilltheend, uneducated, ider, passbook...","[mar roxas dutertetilltheend, mar roxas uneduc..."
155,154,11,154_tvc_fan_picture_yupangco,"[tvc, fan, picture, yupangco, masabing, ehe, d...",[kairita yung tvc binay nang damay ibang tao e...
156,155,11,155_duterte2016_paid_resolve_columns,"[duterte2016, paid, resolve, columns, postives...","[duterte2016, duterte2016, duterte2016]"
157,156,10,156_political_ad_260m_nabwiset,"[political, ad, 260m, nabwiset, galang, 300m, ...",[ba talaga ititigil ung political ad binay maw...


In [60]:
#s_bert.get_document_info(X_train) #Info on each document

In [61]:
topics_t, probs_t = s_bert.transform(X_test) #Fitting test set into different topics


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [62]:
#^Sparse efficiency warning

In [63]:
datapoints_t, features_t = np.shape(probs_t) #should have same features with train set
print("Number of rows: ", datapoints_t)
print("Number of Features(topics): ", features_t)

Number of rows:  4232
Number of Features(topics):  158


In [64]:
# Create and fit the pipeline
rf_pipeline = Pipeline([
    ("rf", RandomForestClassifier(n_estimators=511, random_state=0))
])

# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Best Bert-RF Model Accuracy: 0.6068052930056711
Best Bert-RF Model F1-Score: 0.5377777777777777


## Experiment 1: Multilanguage Bert

In [5]:
#Create Model
sm_bert = BERTopic(calculate_probabilities=True, language = "multilingual") #true for soft clustering multiple topics per documents, false for one topic per document

In [6]:
print("Fitting X_train into Bert")
topics, probs = sm_bert.fit_transform(X_train)
print("X_train Shape: ", np.shape(probs))

print("Using model to transform X_test")
topics_t, probs_t = sm_bert.transform(X_test) #Fitting test set into different topics
print("X_test Shape: ", np.shape(probs_t))

# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Fitting X_train into Bert
X_train Shape:  (10000, 172)
Using model to transform X_test


  self._set_arrayXarray(i, j, x)


X_test Shape:  (4232, 172)


NameError: name 'rf_pipeline' is not defined

In [8]:
# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Best Bert-RF Model Accuracy: 0.6198015122873346
Best Bert-RF Model F1-Score: 0.5984527077614175


## Experiment 2: Bert Reduced Outliers

In [None]:
## ber

In [None]:
#Create Model
sm_bert = BERTopic(calculate_probabilities=True, language = "multilingual") #true for soft clustering multiple topics per documents, false for one topic per document

In [None]:
print("Fitting X_train into Bert")
topics, probs = sm_bert.fit_transform(X_train)
print("X_train Shape: ", np.shape(probs))

print("Using model to transform X_test")
topics_t, probs_t = sm_bert.transform(X_test) #Fitting test set into different topics
print("X_test Shape: ", np.shape(probs_t))

# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

# Build your own Bert

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
)

In [None]:
# Experiment 3