<a href="https://colab.research.google.com/github/ibrahimreyad/TransClusModel/blob/main/TransClusModel-ver_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import DBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer

In [5]:

# Load Dataset
def load_data():
    """Fetch the complete 20 Newsgroups corpus.

    Both the train and test splits are loaded (``subset='all'``), and the
    headers, footers and quoted replies are stripped from every post.

    Returns:
        tuple: ``(texts, targets)`` -- the ``data`` and ``target`` attributes
        of the fetched dataset.
    """
    stripped_parts = ('headers', 'footers', 'quotes')
    newsgroups = fetch_20newsgroups(subset='all', remove=stripped_parts)
    texts, targets = newsgroups.data, newsgroups.target
    return texts, targets

# Preprocess the text (basic example)
def preprocess_text(documents):
    """Vectorize raw documents into a TF-IDF document-term matrix.

    The vectorizer is configured with English stop words, ``max_df=0.5`` and
    ``min_df=2``. It is fitted on ``documents`` and discarded afterwards, so
    only the transformed matrix is available to the caller.

    Args:
        documents: Iterable of raw text strings.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    tfidf_options = {'stop_words': 'english', 'max_df': 0.5, 'min_df': 2}
    tfidf = TfidfVectorizer(**tfidf_options)
    return tfidf.fit_transform(documents)

# Apply Embedding Model
def embed_text(documents):
    """Encode documents with the ``all-MiniLM-L6-v2`` sentence-transformer.

    The model is instantiated on every call and a progress bar is displayed
    while the documents are being encoded.

    Args:
        documents: The texts to embed.

    Returns:
        The embeddings returned by ``SentenceTransformer.encode``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(documents, show_progress_bar=True)

# Apply Dimensionality Reduction (UMAP)
def reduce_dimensionality(embeddings, n_neighbors=15, n_components=10,
                          metric='cosine', random_state=42):
    """Project embeddings into a lower-dimensional space with UMAP.

    UMAP is stochastic; without a fixed seed the projection (and therefore
    every downstream cluster assignment) changes from run to run. The seed
    defaults to 42, matching the seed used for the LDA model in this file.

    Args:
        embeddings: Array-like of document embeddings to reduce.
        n_neighbors: Size of the local neighbourhood UMAP considers.
        n_components: Number of output dimensions.
        metric: Distance metric used in the input space.
        random_state: Seed forwarded to UMAP. Pass ``None`` to restore the
            original unseeded behaviour (UMAP may then run in parallel).

    Returns:
        The reduced embeddings returned by ``UMAP.fit_transform``.
    """
    reducer = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric=metric,
        random_state=random_state,
    )
    return reducer.fit_transform(embeddings)

# Cluster with DBSCAN
def cluster_text(umap_embeddings, eps=0.5, min_samples=10):
    """Cluster reduced embeddings with DBSCAN.

    The defaults reproduce the previously hard-coded configuration; exposing
    them lets callers tune the density threshold without editing this
    function.

    Args:
        umap_embeddings: Array-like of points to cluster.
        eps: Maximum distance between two samples for them to be considered
            neighbours.
        min_samples: Number of neighbours required for a point to be treated
            as a core point.

    Returns:
        One cluster label per input row, as returned by
        ``DBSCAN.fit_predict``. Points DBSCAN treats as noise are labelled -1.
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    return clusterer.fit_predict(umap_embeddings)

# Topic Modeling with LDA
def topic_modeling(tfidf_matrix, n_topics=10):
    """Fit a Latent Dirichlet Allocation model and return the fitted estimator.

    The model is seeded with ``random_state=42`` so repeated fits on the same
    matrix are reproducible.

    NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
    weights -- confirm that a TF-IDF input is intended here.

    Args:
        tfidf_matrix: Document-term matrix to fit on.
        n_topics: Number of topics (LDA components) to learn.

    Returns:
        The fitted ``LatentDirichletAllocation`` instance.
    """
    lda_params = {'n_components': n_topics, 'random_state': 42}
    topic_model = LatentDirichletAllocation(**lda_params)
    topic_model.fit(tfidf_matrix)
    return topic_model

# Main function to run the model
def main():
    """Run the whole pipeline end to end and print a short summary.

    Stages, in order: load the corpus, build the TF-IDF matrix, embed the
    documents, reduce the embeddings, cluster the reduced points, and fit the
    topic model on the TF-IDF matrix. A progress message is printed before
    each stage. At the end the distinct cluster labels and the raw
    ``components_`` array of the fitted LDA model are printed.
    """
    print("Loading data...")
    # The dataset targets are returned by load_data() but not used below.
    corpus, _targets = load_data()

    print("Preprocessing text...")
    term_matrix = preprocess_text(corpus)

    print("Embedding text...")
    doc_vectors = embed_text(corpus)

    print("Reducing dimensionality...")
    reduced_vectors = reduce_dimensionality(doc_vectors)

    print("Clustering text...")
    cluster_ids = cluster_text(reduced_vectors)

    print("Performing topic modeling...")
    fitted_lda = topic_modeling(term_matrix)

    # Summary output
    print("Clusters:", np.unique(cluster_ids))
    print("LDA Topics:", fitted_lda.components_)

# Run the pipeline when this code is executed directly. Inside a notebook cell
# __name__ is "__main__", so main() runs as soon as the cell is executed.
if __name__ == "__main__":
    main()


Loading data...
Preprocessing text...
Embedding text...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/589 [00:00<?, ?it/s]

Reducing dimensionality...
Clustering text...
Performing topic modeling...
Clusters: [-1  0  1  2  3  4  5  6  7  8  9]
LDA Topics: [[ 0.10000613  0.10000436  0.1000001  ...  0.10000001  0.10002491
   0.1       ]
 [26.78691046  1.92471862  0.47715606 ...  0.10001423  1.3963724
   0.10000423]
 [ 0.10000225  0.10000345  0.10000005 ...  0.10000001  0.1
   0.1       ]
 ...
 [ 0.10000244  0.10000118  0.10000005 ...  0.10000001  0.10000215
   0.1       ]
 [ 0.1000079   0.10001142  0.10000004 ...  0.10459245  0.10000002
   0.3491185 ]
 [ 1.6612818   0.10017888  0.10000007 ...  0.10000001  0.1000041
   0.52478298]]
