In [4]:
!pip install transformers sentence-transformers > /dev/null

## K-Means Clustering

 K-Means requires that the number of clusters is specified beforehand.

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [None]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model; the same instance is
# used for every encode() call in this notebook.
embed_minilm = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
# Small demo corpus: short sentences that fall into a handful of themes.
data_corp = [
    # eating
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    # carrying a baby
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    # riding a horse
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    # playing drums
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    # cheetah hunting
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]

In [5]:
# Turn every sentence of the demo corpus into an embedding vector.
data_embed = embed_minilm.encode(data_corp)

In [6]:
# Starting the clustering process.
# K-Means needs the number of clusters to be chosen up front.

num_cluster = 5

# Fix the seed: K-Means starts from randomly chosen centroids, so without
# random_state the label numbering (and possibly the grouping) can differ
# from one run of the notebook to the next.
kmeans_model = KMeans(n_clusters=num_cluster, random_state=42)

In [7]:
# Fit K-Means on the sentence embeddings.
kmeans_model.fit(data_embed)



In [8]:
# Cluster label assigned to each sentence, in the same order as data_corp.
clusters = kmeans_model.labels_

In [9]:
# Display the K-Means label of every sentence.
clusters

array([0, 0, 0, 1, 1, 3, 3, 4, 4, 2, 2], dtype=int32)

In [24]:
import pandas as pd

def make_df(cluster_list, text_list):
  """Tabulate cluster labels next to the texts they were assigned to.

  Args:
    cluster_list: Sequence of cluster labels, one per text.
    text_list: Indexable collection of texts; ``text_list[i]`` is paired with
      the i-th label of ``cluster_list``.

  Returns:
    A pandas DataFrame with the columns ``cluster`` and ``sentence`` and one
    row per entry of ``cluster_list``. The two columns are present even when
    ``cluster_list`` is empty.

  Raises:
    IndexError: If ``text_list`` has fewer entries than ``cluster_list``.
  """
  records = [{'cluster': label, 'sentence': text_list[position]}
             for position, label in enumerate(cluster_list)]

  # Naming the columns explicitly keeps the schema stable for empty input,
  # where pandas would otherwise return a frame without any columns.
  return pd.DataFrame(records, columns=['cluster', 'sentence'])

In [10]:
# One record per sentence: its K-Means label together with the sentence at
# the same position in data_corp.
clustered_dataset = [
  {'cluster': label, 'sentence': data_corp[position]}
  for position, label in enumerate(clusters)
]

In [11]:
import pandas as pd

# Despite its name, cluster_dict is a DataFrame built from the list of
# {'cluster', 'sentence'} records.
cluster_dict = pd.DataFrame(clustered_dataset)

# Display the resulting table.
cluster_dict

Unnamed: 0,cluster,sentence
0,0,A man is eating food.
1,0,A man is eating a piece of bread.
2,0,A man is eating pasta.
3,1,The girl is carrying a baby.
4,1,The baby is carried by the woman
5,3,A man is riding a horse.
6,3,A man is riding a white horse on an enclosed g...
7,4,A monkey is playing drums.
8,4,Someone in a gorilla costume is playing a set ...
9,2,A cheetah is running behind its prey.


## Agglomerative Clustering

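 Agglomerative clustering takes a distance threshold instead of a fixed number of clusters: clusters whose distance is below that threshold are merged. This algorithm can be useful if the number of clusters is unknown.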

In [13]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [18]:
# L2 norm of every embedding row (axis=1, kept as a column vector).
# The recorded values are all ~1, i.e. the embeddings are already close to unit length.
np.linalg.norm(data_embed, axis=1, keepdims=True)

array([[0.99999994],
       [0.99999994],
       [1.        ],
       [1.        ],
       [1.        ],
       [0.9999999 ],
       [1.        ],
       [1.        ],
       [1.        ],
       [1.        ],
       [0.99999994]], dtype=float32)

In [19]:
# Scale every embedding row to unit L2 norm by dividing it by its own norm.
normalized_embed = data_embed / np.linalg.norm(data_embed, axis=1, keepdims=True)

In [20]:
# No fixed cluster count is given (n_clusters=None); the distance threshold
# controls where merging stops instead.
agglomerate_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.5,
)

In [21]:
# Fit the agglomerative model on the unit-normalised embeddings.
agglomerate_model.fit(normalized_embed)

In [22]:
# Cluster label per sentence from the fitted agglomerative model.
agglo_clusters = agglomerate_model.labels_

In [23]:
# Display the agglomerative label of every sentence.
agglo_clusters

array([0, 0, 0, 4, 4, 1, 1, 2, 2, 3, 3])

In [25]:
# Reuse make_df to tabulate the agglomerative labels next to their sentences,
# then display the table as the cell's output.
agglo_df = make_df(agglo_clusters, data_corp)
agglo_df

Unnamed: 0,cluster,sentence
0,0,A man is eating food.
1,0,A man is eating a piece of bread.
2,0,A man is eating pasta.
3,4,The girl is carrying a baby.
4,4,The baby is carried by the woman
5,1,A man is riding a horse.
6,1,A man is riding a white horse on an enclosed g...
7,2,A monkey is playing drums.
8,2,Someone in a gorilla costume is playing a set ...
9,3,A cheetah is running behind its prey.


## Fast Clustering for Massive Dataset

In [3]:
# Download location of the Quora duplicate-questions TSV and the local file it is saved to.
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 50000  # Cap on the number of unique questions collected from the file

In [5]:
from sentence_transformers import util
import os

# Fetch the dataset only when there is no local copy yet.
already_downloaded = os.path.exists(dataset_path)
if not already_downloaded:
    print("Download dataset")
    util.http_get(url, dataset_path)

In [6]:
import csv

# Collect unique questions from the file, never more than max_corpus_size.

corpus_sentences = set()

with open(dataset_path, encoding='utf8') as fIn:

    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        if len(corpus_sentences) >= max_corpus_size:
            break
        # Check the cap before every single add: each row carries two
        # questions, so adding both unconditionally could overshoot the
        # limit by one.
        for question in (row['question1'], row['question2']):
            if len(corpus_sentences) < max_corpus_size:
                corpus_sentences.add(question)

In [7]:
# Convert the set to a list so sentences can be looked up by position later.
# NOTE(review): a set has no defined order, so the positions (used below as
# sentence_id) are not guaranteed to be the same on every run.
corpus_sentences = list(corpus_sentences)

In [32]:
# Embed the whole Quora corpus in batches of 64 with a progress bar.
# convert_to_tensor=True presumably returns a tensor rather than a NumPy
# array — confirm against the sentence-transformers documentation.
quora_embed = embed_minilm.encode(corpus_sentences,
                                  batch_size=64,
                                  show_progress_bar=True,
                                  convert_to_tensor=True)

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

In [33]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Group the embeddings into communities using a similarity threshold of 0.75
# and a minimum community size of 25.
clusters_community = util.community_detection(quora_embed,
                                              min_community_size=25,
                                              threshold=0.75)

complete = time.perf_counter()

# Report the duration with its unit.
print(f"Clustering completed in {complete - start:.2f} seconds")

Clustering completed in 51.11092782020569 time


In [36]:
# Report how many communities were detected.
print(f"There are {len(clusters_community)} clusters")

There are 61 clusters


In [40]:
# Flatten the detected communities into one record per member: the community
# number, the member's index into corpus_sentences, and the sentence itself.
quora_clustered = [
  {"cluster": community_id,
   "sentence_id": sentence_id,
   "sentence": corpus_sentences[sentence_id]}
  for community_id, members in enumerate(clusters_community)
  for sentence_id in members
]

clustered_df = pd.DataFrame(quora_clustered)

In [41]:
# Display the clustered Quora questions.
clustered_df

Unnamed: 0,cluster,sentence_id,sentence
0,0,859,How can I improve my English speaking ability?
1,0,1010,How can I learn to speak English fluently?
2,0,1115,How do I improve my overall native English lan...
3,0,1232,How can I learn to speak English?
4,0,2192,How can I improve my english language skills? ...
...,...,...,...
2279,60,45292,What is the step by step guide to invest in sh...
2280,60,45399,How do I earn money from the stock market?
2281,60,47162,What all does somebody need to know to start i...
2282,60,47419,How do I make a profit in the stock market?


In [42]:
!pip install bertopic > /dev/null

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load every split of the 20 Newsgroups dataset with headers, footers and
# quoted replies removed, and keep only the raw document texts.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
docs = newsgroups['data']

In [None]:
# Number of documents loaded.
len(docs)

In [8]:
from bertopic import BERTopic

# Build a BERTopic model and fit it on the newsgroup documents in one step.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# calculate_probabilities=True presumably adds noticeable cost on a corpus of
# this size — confirm whether the probabilities are needed.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/589 [00:00<?, ?it/s]

KeyboardInterrupt: ignored