In [14]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np

'''
Variables: 
---------

corpus : list of documents
embeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension) 
red_emd : reduced embeddings matrix using dimentionality reduction
k : number of clusters
labels : documents labels
pred : list of clustering predicted clusters 

''';

In [15]:
from umap import UMAP
import numpy as np

def dim_red(mat, p, method):
    '''
    Perform dimensionality reduction using UMAP

    Input:
    -----
        mat : NxM list 
            Embeddings matrix (N samples, M dimensions)
        p : number of dimensions to keep 
        method : str
            Method name, should be 'UMAP'
    Output:
    ------
        red_mat : NxP list such that p << m
            Reduced matrix using UMAP
    '''
    if method == 'UMAP':
        # Perform UMAP using umap-learn library
        umap_model = UMAP(n_components=p)
        red_mat = umap_model.fit_transform(mat)
        
    else:
        raise Exception("Please select 'UMAP' as the method.")
    
    return red_mat

ImportError: cannot import name 'UMAP' from 'umap' (C:\Users\ABIR°S ELITEBOOK\AppData\Roaming\Python\Python311\site-packages\umap\__init__.py)

In [None]:
def dim_red(mat, p):
    '''
    Perform dimensionality reduction

    Input:
    -----
        mat : NxM list 
        p : number of dimensions to keep 
    Output:
    ------
        red_mat : NxP list such that p<<m
    '''
    
    red_mat = mat[:,:p]
    
    return red_mat

In [None]:
def clust(mat, k):
    '''
    Perform clustering

    Input:
    -----
        mat : input list 
        k : number of cluster
    Output:
    ------
        pred : list of predicted labels
    '''
    
    pred = np.random.randint(k, size=len(corpus))
    
    return pred

In [5]:
# import data
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]
k = len(set(labels))

# embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

# perform dimentionality reduction
red_emb = dim_red(embeddings, 20)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results
nmi_score = normalized_mutual_info_score(pred,labels)
ari_score = adjusted_rand_score(pred,labels)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')


.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

NameError: name 'dim_red' is not defined

In [7]:
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np

# Chargement des données
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]

# Embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

# ACP
def acp_approach(embeddings, num_dimensions):
    pca = PCA(n_components=num_dimensions)
    red_emb = pca.fit_transform(embeddings)
    return red_emb

# Clustering
def perform_clustering(mat, k):
    pred = np.random.randint(k, size=len(corpus))
    return pred

# Utilisation de l'approche ACP
num_dimensions_acp = 20
red_emb_acp = acp_approach(embeddings, num_dimensions_acp)

# Clustering
pred_acp = perform_clustering(red_emb_acp, len(set(labels)))

# Évaluation des résultats
nmi_score_acp = normalized_mutual_info_score(pred_acp, labels)
ari_score_acp = adjusted_rand_score(pred_acp, labels)

# Affichage des résultats
print(f'ACP - NMI: {nmi_score_acp:.2f}, ARI: {ari_score_acp:.2f}')

ACP - NMI: 0.03, ARI: 0.00


In [1]:
from umap import UMAP
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
import numpy as np

def umap_approach(embeddings, num_dimensions):
    umap_model = UMAP(n_components=num_dimensions)
    red_emb = umap_model.fit_transform(embeddings)
    return red_emb

def main():
    ng20 = fetch_20newsgroups(subset='test')
    corpus = ng20.data[:2000]
    labels = ng20.target[:2000]

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings = model.encode(corpus)

    num_dimensions_umap = 20
    red_emb_umap = umap_approach(embeddings, num_dimensions_umap)

    pred_umap = perform_clustering(red_emb_umap, len(set(labels)))

    nmi_score_umap, ari_score_umap = evaluate_clustering(pred_umap, labels)

    print(f'UMAP - NMI: {nmi_score_umap:.2f}, ARI: {ari_score_umap:.2f}')

if _name_ == "_main_":
    main()

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()





NameError: name '_name_' is not defined

In [10]:
from umap import UMAP
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

from sentence_transformers import SentenceTransformer
import numpy as np

def perform_clustering(embeddings, num_clusters):
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
    pred = kmeans_model.fit_predict(embeddings)
    return pred

def evaluate_clustering(pred, true_labels):
    nmi_score = normalized_mutual_info_score(true_labels, pred)
    ari_score = adjusted_rand_score(true_labels, pred)
    return nmi_score, ari_score

def umap_approach(embeddings, num_dimensions):
    umap_model = UMAP(n_components=num_dimensions)
    red_emb = umap_model.fit_transform(embeddings)
    return red_emb

def main():
    ng20 = fetch_20newsgroups(subset='test')
    corpus = ng20.data[:2000]
    labels = ng20.target[:2000]

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings = model.encode(corpus)

    num_dimensions_umap = 20
    red_emb_umap = umap_approach(embeddings, num_dimensions_umap)

    pred_umap = perform_clustering(red_emb_umap, len(set(labels)))

    nmi_score_umap, ari_score_umap = evaluate_clustering(pred_umap, labels)

    print(f'UMAP - NMI: {nmi_score_umap:.2f}, ARI: {ari_score_umap:.2f}')

if __name__ == "__main__":
    main()

SyntaxError: invalid syntax (529843098.py, line 1)

In [3]:
from umap.umap_ import UMAP
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer

# Chargement des données
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]

# Embedding
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)

# UMAP
def umap_approach(embeddings, num_dimensions):
    umap_model = UMAP(n_components=num_dimensions)
    red_emb = umap_model.fit_transform(embeddings)
    return red_emb

# Clustering
def perform_clustering(embeddings, num_clusters):
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
    pred = kmeans_model.fit_predict(embeddings)
    return pred

# Utilisation de l'approche UMAP
num_dimensions_umap = 20
red_emb_umap = umap_approach(embeddings, num_dimensions_umap)

# Clustering
pred_umap = perform_clustering(red_emb_umap, len(set(labels)))

# Évaluation des résultats
nmi_score_umap = normalized_mutual_info_score(labels, pred_umap)
ari_score_umap = adjusted_rand_score(labels, pred_umap)

# Affichage des résultats
print(f'UMAP - NMI: {nmi_score_umap:.2f}, ARI: {ari_score_umap:.2f}')



  super()._check_params_vs_input(X, default_n_init=10)


UMAP - NMI: 0.48, ARI: 0.30
