In [11]:
import numpy as np
import umap

from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from dataset.dataset import Dataset
from sklearn.preprocessing import StandardScaler
from bertopic import BERTopic
from sklearn.ensemble import RandomForestClassifier
from constants import CLEANED_DATASET_PATH

In [12]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[8674, 3859, 3976])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

print("Shape X_train",np.shape(X_train))
print("Shape X_test",np.shape(X_test))
print("Shape X_val",np.shape(X_val))

Data loaded from dataset/cleaned_dataset.pkl
Shape X_train (8674,)
Shape X_test (3976,)
Shape X_val (3859,)


In [27]:
print(len(np.unique(np.concatenate((X_train,X_test,X_val)))))
print(len(np.unique(X_val)) + len(np.unique(X_test)) + len(np.unique(X_train)))
print(type(X_train))

16509
16509
<class 'list'>


## Baseline Bertopic

In [25]:
#Create Model
s_bert = BERTopic(calculate_probabilities=True) #true for soft clustering multiple topics per documents, false for one topic per document

In [28]:
#train model on X
topics, probs = s_bert.fit_transform(X_train) 

In [29]:
datapoints, features = np.shape(probs)
print("Number of rows: ", datapoints)
print("Number of Features(topics): ", features)

Number of rows:  8674
Number of Features(topics):  106


In [None]:
s_bert.get_topic_info() #Hard clustering name = topics, representation = keywords, representative docs = docs with that topic

In [None]:
#s_bert.get_document_info(X_train) #Info on each document

In [30]:
topics_t, probs_t = s_bert.transform(X_test) #Fitting test set into different topics

In [None]:
#^Sparse efficiency warning

In [31]:
datapoints_t, features_t = np.shape(probs_t) #should have same features with train set
print("Number of rows: ", datapoints_t)
print("Number of Features(topics): ", features_t)

Number of rows:  3976
Number of Features(topics):  106


In [32]:
# Create and fit the pipeline
rf_pipeline = Pipeline([
    ("rf", RandomForestClassifier(n_estimators=511, random_state=0))
])

# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Best Bert-RF Model Accuracy: 0.5995975855130785
Best Bert-RF Model F1-Score: 0.5743315508021389


## Experiment 1: Multilanguage Bert

In [33]:
#Create Model
sm_bert = BERTopic(calculate_probabilities=True, language = "multilingual") #true for soft clustering multiple topics per documents, false for one topic per document

In [34]:
print("Fitting X_train into Bert")
topics, probs = sm_bert.fit_transform(X_train)
print("X_train Shape: ", np.shape(probs))

print("Using model to transform X_test")
topics_t, probs_t = sm_bert.transform(X_test) #Fitting test set into different topics
print("X_test Shape: ", np.shape(probs_t))

# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Fitting X_train into Bert
X_train Shape:  (8674, 115)
Using model to transform X_test
X_test Shape:  (3976, 115)
Best Bert-RF Model Accuracy: 0.5995975855130785
Best Bert-RF Model F1-Score: 0.5708894878706199


In [35]:
# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Best Bert-RF Model Accuracy: 0.5995975855130785
Best Bert-RF Model F1-Score: 0.5708894878706199


## Experiment 2: Bert Reduced Outliers

In [36]:
## ber

In [37]:
#Create Model
sm_bert = BERTopic(calculate_probabilities=True, language = "multilingual") #true for soft clustering multiple topics per documents, false for one topic per document

In [38]:
print("Fitting X_train into Bert")
topics, probs = sm_bert.fit_transform(X_train)
print("X_train Shape: ", np.shape(probs))

print("Using model to transform X_test")
topics_t, probs_t = sm_bert.transform(X_test) #Fitting test set into different topics
print("X_test Shape: ", np.shape(probs_t))

# Fit the pipeline
rf_pipeline.fit(probs, Y_train)

# Evaluate the pipeline
Y_pred = rf_pipeline.predict(probs_t)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Bert-RF Model Accuracy: {accuracy}")
print(f"Best Bert-RF Model F1-Score: {f1}")

Fitting X_train into Bert
X_train Shape:  (8674, 128)
Using model to transform X_test
X_test Shape:  (3976, 128)
Best Bert-RF Model Accuracy: 0.5870221327967807
Best Bert-RF Model F1-Score: 0.5484048404840484


# Build your own Bert

In [39]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
)

In [40]:
# Experiment 3