<a href="https://colab.research.google.com/github/kiarashgh98/Topic-modeling-/blob/main/3_Best_dim_reduc_and_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports and Downloads 📢**

##### Mount Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files stored on Drive can be read
# through the normal filesystem under this directory.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##### Import Packages

In [None]:
!pip uninstall umap
!pip install umap-learn



In [None]:
# Install sentence-transformers, which provides the SentenceTransformer class
# imported in the next cell.
!pip install sentence_transformers



In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
from collections import Counter
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from umap.umap_ import UMAP

  from tqdm.autonotebook import tqdm, trange


# **Dataset 🗂️**

### Open CSV File

In [None]:
# Load the preprocessed dataset from Google Drive.
org_df = pd.read_csv('/content/drive/MyDrive/New Codes/data_after_preprocessing.csv')

# Work on an independent copy: a plain `df = org_df` would only create a second
# name for the same object, so any in-place change to `df` would also modify
# `org_df`.
df = org_df.copy()

In [None]:
# Column of the dataframe that is embedded and vectorized in the cells below.
TEXT_COLUMN = 'cleaned_text'
data = df[TEXT_COLUMN]

# **Find the optimal dimensionality-reduction method and topic model using the silhouette score 🧠**

In [None]:
def sil_score(di_reduc_embeddings, name, num_clusters = 14):
    """Cluster reduced embeddings with KMeans and report the silhouette score.

    Args:
        di_reduc_embeddings: Feature matrix (one row per document) that is
            passed to both ``KMeans.fit_predict`` and ``silhouette_score``.
        name: Label identifying the model / reduction combination; used only
            in the printed report line.
        num_clusters: Number of KMeans clusters (default 14).

    Returns:
        The mean silhouette score of the KMeans labelling. The same value is
        also printed, so existing calls that ignore the return value keep
        producing the same report line.
    """
    # Fixed random_state keeps the clustering repeatable between runs.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)

    # Fit the clustering and obtain one cluster label per row.
    cluster_labels = kmeans.fit_predict(di_reduc_embeddings)

    # Score the labelling in the same space that was clustered.
    silhouette_avg = silhouette_score(di_reduc_embeddings, cluster_labels)
    print(f"{name} : silhouette_score = {silhouette_avg}")

    # Return the score so callers can collect and compare results in code
    # instead of reading them back from printed output.
    return silhouette_avg


In [None]:
# SentenceTransformer.encode is documented for a string or a list of strings.
# Converting the pandas Series to a plain list once keeps the encoders
# independent of the DataFrame index.
sentences = data.tolist()

# Encode data using distilbert
model_distilbert = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings_distilbert = model_distilbert.encode(sentences, show_progress_bar=True)

# Encode data using scibert
model_scilbert = SentenceTransformer('allenai/scibert_scivocab_uncased')
embeddings_scilbert = model_scilbert.encode(sentences, show_progress_bar=True)

# Encode data using bert
model_bert = SentenceTransformer('bert-base-uncased')
embeddings_bert = model_bert.encode(sentences, show_progress_bar=True)

# Build the TF-IDF document-term matrix; the LDA model itself is fitted on
# this matrix in a later cell.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(sentences)



Batches:   0%|          | 0/529 [00:00<?, ?it/s]



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Batches:   0%|          | 0/529 [00:00<?, ?it/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Batches:   0%|          | 0/529 [00:00<?, ?it/s]

In [None]:
# Fit LDA on the TF-IDF matrix and keep the transformed output as the
# "embedding" of each document for this model.
# NOTE(review): max_iter=1 is well below scikit-learn's default of 10, so the
# model may be under-trained — confirm this is intentional.
lda_params = dict(
    n_components=14,
    learning_method='online',
    random_state=42,
    max_iter=1,
)
lda_model = LatentDirichletAllocation(**lda_params)
lda_embeddings = lda_model.fit_transform(tfidf)

In [None]:
# UMAP embeddings
# UMAP is stochastic; a fixed random_state makes the 2-D projections (and
# therefore the silhouette scores below) repeatable across runs, matching the
# seed already used for KMeans and LDA in this notebook.
umap_params = dict(n_neighbors=15, n_components=2, metric='cosine', random_state=42)

umap_embeddings_distilbert = UMAP(**umap_params).fit_transform(embeddings_distilbert)
sil_score(umap_embeddings_distilbert, 'distilbert/umap')

umap_embeddings_scilbert = UMAP(**umap_params).fit_transform(embeddings_scilbert)
sil_score(umap_embeddings_scilbert, 'scilbert/umap')

umap_embeddings_bert = UMAP(**umap_params).fit_transform(embeddings_bert)
sil_score(umap_embeddings_bert, 'bert/umap')

umap_embeddings_lda = UMAP(**umap_params).fit_transform(lda_embeddings)
sil_score(umap_embeddings_lda, 'lda/umap')



distilbert/umap : silhouette_score = 0.42888474464416504




scilbert/umap : silhouette_score = 0.3913472890853882




bert/umap : silhouette_score = 0.3567882478237152




lda/umap : silhouette_score = 0.4804850220680237


In [None]:
# t-SNE embeddings
# t-SNE is stochastic; a fixed random_state makes the 2-D projections (and
# therefore the silhouette scores below) repeatable across runs, matching the
# seed already used for KMeans and LDA in this notebook.
tsne_params = dict(n_components=2, metric='cosine', random_state=42)

t_sne_embeddings_distilbert = TSNE(**tsne_params).fit_transform(embeddings_distilbert)
sil_score(t_sne_embeddings_distilbert, 'distilbert/t-sne')

t_sne_embeddings_scilbert = TSNE(**tsne_params).fit_transform(embeddings_scilbert)
sil_score(t_sne_embeddings_scilbert, 'scilbert/t-sne')

t_sne_embeddings_bert = TSNE(**tsne_params).fit_transform(embeddings_bert)
sil_score(t_sne_embeddings_bert, 'bert/t-sne')

t_sne_embeddings_lda = TSNE(**tsne_params).fit_transform(lda_embeddings)
sil_score(t_sne_embeddings_lda, 'lda/t-sne')



distilbert/t-sne : silhouette_score = 0.3735552728176117




scilbert/t-sne : silhouette_score = 0.35232967138290405




bert/t-sne : silhouette_score = 0.35222572088241577




lda/t-sne : silhouette_score = 0.3926728069782257


In [None]:
# PCA embeddings
# With the default svd_solver='auto', scikit-learn may choose the randomized
# SVD solver for inputs of this size, so the seed is fixed to keep the
# projections repeatable and consistent with the other cells.
pca_params = dict(n_components=2, random_state=42)

pca_embeddings_distilbert = PCA(**pca_params).fit_transform(embeddings_distilbert)
sil_score(pca_embeddings_distilbert, 'distilbert/pca')

pca_embeddings_scilbert = PCA(**pca_params).fit_transform(embeddings_scilbert)
sil_score(pca_embeddings_scilbert, 'scilbert/pca')

pca_embeddings_bert = PCA(**pca_params).fit_transform(embeddings_bert)
sil_score(pca_embeddings_bert, 'bert/pca')

pca_embeddings_lda = PCA(**pca_params).fit_transform(lda_embeddings)
sil_score(pca_embeddings_lda, 'lda/pca')



distilbert/pca : silhouette_score = 0.3200520873069763




scilbert/pca : silhouette_score = 0.31985047459602356




bert/pca : silhouette_score = 0.3174653947353363




lda/pca : silhouette_score = 0.5610751804829914
