[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/juanhuguet/intro_to_nlp/blob/main/notebooks/06-text-clustering-with-embeddings.ipynb)

In [None]:
import warnings

In [None]:
# Silence all warnings raised after this cell runs (no category or module filter is given).
warnings.filterwarnings("ignore")

In [None]:
import subprocess
import sys

def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    The command is run as ``<sys.executable> -m pip install <package>`` so the
    package lands in the same environment as the running kernel.
    ``subprocess.check_call`` raises ``CalledProcessError`` if pip exits with a
    non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)
    
# Import `load_dataset`, installing the `datasets` package first if it is missing.
try:
    from datasets import load_dataset
except ImportError:
    # Only a failed import should trigger an install; a bare `except:` would
    # also catch KeyboardInterrupt and unrelated errors.
    install("datasets")
    from datasets import load_dataset

In [None]:
import pandas as pd

### Load the dataset

In [None]:
# Load the "yelp_review_full" dataset by name; its "train" split is sampled below.
dataset = load_dataset("yelp_review_full")

### Convert a random sample of 1000 reviews to a pandas DataFrame

In [None]:
# Shuffle the train split with a fixed seed (42) so the sample is reproducible,
# then keep the first 1000 rows as a pandas DataFrame. The `...` placeholder
# passed `Ellipsis` to `range`, which raises a TypeError.
df = pd.DataFrame(dataset["train"].shuffle(seed=42).select(range(1000)))

## Embed sentences

In [None]:
# Import `SentenceTransformer`, installing `sentence-transformers` first if it is missing.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    # Only a failed import should trigger an install; a bare `except:` would
    # also catch KeyboardInterrupt and unrelated errors.
    install("sentence-transformers")
    from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained "distiluse-base-multilingual-cased-v2" model.
# `device=None` lets sentence-transformers choose the device itself (a GPU when
# one is available, otherwise the CPU); the previous "..." placeholder is not a
# valid device string. Pass "cpu" or "cuda" explicitly to force a device.
model = SentenceTransformer("distiluse-base-multilingual-cased-v2", device=None)

In [None]:
# Encode a single string to inspect what the model returns.
# NOTE(review): "..." looks like a placeholder — TODO replace it with a real sentence.
sentence_embedding = model.encode("...")

In [None]:
# Display the shape of the embedding returned for the single input above.
sentence_embedding.shape

## Get sentence embeddings for all texts

In [None]:
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` (used below) is available.
tqdm.pandas()

In [None]:
## use the encode method of sentence transformers
# Encode each text individually while showing a tqdm progress bar.
# `progress_apply` needs the function to apply; calling it with no argument
# raises a TypeError.
df["embedding"] = df["text"].progress_apply(model.encode)

## Let's reduce the dimensionality and visualize the data in 2d

In [None]:
# Import `umap`, installing the `umap-learn` package first if it is missing.
try:
    import umap
except ImportError:
    # Only a failed import should trigger an install; a bare `except:` would
    # also catch KeyboardInterrupt and unrelated errors.
    install("umap-learn")
    import umap

In [None]:
# reduce it to 2 dimensions

def reduce_dimensions(embeddings,
                      n_components=2,
                      n_neighbors=8,
                      random_state=42):
    """Project embeddings to a lower-dimensional space with UMAP.

    Parameters
    ----------
    embeddings : pandas.DataFrame or array-like
        One row per item, one column per embedding dimension.
    n_components : int, default 2
        Number of output dimensions (2 for a flat scatter plot).
    n_neighbors : int, default 8
        Forwarded to ``umap.UMAP``.
    random_state : int, default 42
        Seed forwarded to ``umap.UMAP``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with integer column labels. If ``embeddings``
        is a DataFrame its index is reused; otherwise a default index is used.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        metric='cosine',
                        n_epochs=10,
                        random_state=random_state)
    reduced = reducer.fit_transform(embeddings)

    # Keep the caller's row labels when there are any, so the result can be
    # aligned with the source frame. Plain arrays have no pandas index.
    row_index = embeddings.index if isinstance(embeddings, pd.DataFrame) else None

    # The local name deliberately avoids `df`, which is a notebook-level variable.
    return pd.DataFrame(reduced, index=row_index)

In [None]:
# Explode the embeddings: one column per embedding dimension, same index as `df`.
# Building the frame once from the list of vectors avoids creating a separate
# Series for every row, which is what `.apply(pd.Series)` does.
embeddings = pd.DataFrame(df["embedding"].tolist(), index=df.index)

In [None]:
# Project the embedding matrix with `reduce_dimensions`, using its default arguments.
embeddings_2d = reduce_dimensions(embeddings)

In [None]:
# Display the reduced coordinates.
embeddings_2d

### Let's visualize them...

In [None]:
import seaborn as sns

In [None]:
# Plot the projection, with columns 0 and 1 as the axes. The frame is passed as
# `data=` because only seaborn releases whose first positional parameter is
# `data` accept it positionally.
sns.scatterplot(data=embeddings_2d, x=0, y=1)

Nice, let's see if we can cluster them...

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Agglomerative clustering into 10 clusters; every other parameter keeps its library default.
clst = AgglomerativeClustering(n_clusters=10)

In [None]:
# Fit on the full-dimensional `embeddings` (not the 2-D projection) and get the predicted labels.
clusters = clst.fit_predict(embeddings)

## Let's assign the clusters to the visualization

In [None]:
# Same projection as before, with each point coloured by its cluster label
# (`c` is forwarded to matplotlib's scatter). The frame is passed as `data=`
# because only seaborn releases whose first positional parameter is `data`
# accept it positionally.
sns.scatterplot(data=embeddings_2d, x=0, y=1, c=clusters)

## Now, let's review the contents of the clusters...

In [None]:
# Store the predicted cluster labels as a new "cluster" column on `df`.
df["cluster"] = clusters

In [None]:
# Display the rows ordered by cluster label; the result is not assigned, so `df` is unchanged.
df.sort_values(by="cluster")

In [None]:
# Show the text of every row assigned to cluster 9.
# NOTE(review): presumably the last label, given n_clusters=10 — confirm labels run 0-9.
df.loc[df["cluster"] == 9, "text"]