In [1]:
import os

import cohere
import pandas as pd
import sklearn.cluster
import umap
import seaborn as sns
from annoy import AnnoyIndex
import numpy as np
from dotenv import load_dotenv
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from main import *

In [3]:
np.random.seed(42)
key = get_key()
co = cohere.Client(key)

# Get dataframe
df = getDataFrame('data_100.csv')

# Get vectors using Cohere's embeddings
embeddings = getEmbeddings(co, df)

# Save embeddings as Annoy
indexfile = 'index.ann'
# Build the index only when the file is missing, so the nearest-neighbour
# lookup below does not depend on a file left over from an earlier session.
# NOTE(review): an existing index.ann is reused as-is; delete it after
# changing the data or the embedding model.
if not os.path.exists(indexfile):
    saveBuild(embeddings, indexfile)

# Get query embeddings (stacked under the neighbour embeddings further down)
query = 'Celestial bodies and physics'
query_embed = get_query_embed(co, query)

# Get nearest points
num_nearest = 10
nearest_ids = get_query_nn(indexfile, query_embed, num_nearest)
df = df.loc[nearest_ids[0]]
nn_embeddings = embeddings[nearest_ids[0]]

# Append the query as an extra row. `df` is still labelled by the original
# document ids here, so a fixed label such as num_nearest + 1 would overwrite
# that document whenever it is one of the neighbours; use a label past the
# largest selected id instead.
df.loc[df.index.max() + 1] = ['Query', query, '']
all_embeddings = np.vstack([nn_embeddings, query_embed])

# Cluster them using dendrograms & Plot them
# Only the neighbour embeddings are clustered; the query is not included.
model = fitModel(nn_embeddings)
dendroData = plotDendrogram(model)

In [36]:
# Show the fitted model's `labels_` (presumably one label per clustered
# neighbour, given the 10-element output — confirm against fitModel in main).
model.labels_

array([7, 9, 8, 6, 3, 5, 4, 2, 1, 0], dtype=int64)

In [37]:
# Show the merge order: each row holds the two node ids joined at that step
# (consumed row by row by get_clusters below).
model.children_

array([[ 8,  9],
       [ 2,  6],
       [ 0,  1],
       [ 3,  4],
       [ 7, 12],
       [10, 11],
       [13, 14],
       [ 5, 15],
       [16, 17]], dtype=int64)

In [38]:
# level 0 = show each doc as own cluster, level n = 1 cluster
def get_clusters(cluster_dict, cluster_combine_order, level=None):
    """Replay the first ``level`` merges of a hierarchical clustering.

    Parameters
    ----------
    cluster_dict : dict
        Maps every starting cluster id to the list of members it holds.
        The ids are expected to be ``0 .. len(cluster_dict) - 1``, because
        the cluster created by merge step ``i`` is stored under id
        ``len(cluster_dict) + i``.
    cluster_combine_order : sequence of pairs
        Row ``i`` holds the two cluster ids joined at merge step ``i``
        (the notebook passes ``model.children_``).
    level : int, optional
        Number of merges to apply. ``0`` keeps every starting cluster on its
        own. ``None`` (the default) or any value larger than the number of
        rows applies every available merge.

    Returns
    -------
    dict
        A new dict mapping each surviving cluster id to its member list.
        Neither ``cluster_dict`` nor ``cluster_combine_order`` is modified.

    Raises
    ------
    KeyError
        If a merge row names a cluster id that is not currently present.
    """
    # Shallow copy is enough: member lists are concatenated into new lists,
    # never mutated in place.
    clusters = dict(cluster_dict)
    n_leaves = len(clusters)
    n_merges = len(cluster_combine_order)

    # Clamp so an over-large level means "merge everything" rather than
    # indexing past the end of the merge table.
    if level is None or level > n_merges:
        level = n_merges

    for step in range(level):
        left, right = cluster_combine_order[step]
        # Remove both parents and store their union under the next node id.
        clusters[n_leaves + step] = clusters.pop(left) + clusters.pop(right)
    return clusters

In [39]:
import copy

# Work on a private copy of the merge order so the fitted model's array is
# never touched.
cluster_combine_order = copy.deepcopy(model.children_)

# The merge rows refer to samples by position (0 .. n_samples - 1), so seed one
# singleton cluster per position. Keying by the *values* in `labels_` only
# worked because they happened to be a permutation of those positions.
# NOTE(review): assumes `model` is an sklearn AgglomerativeClustering — confirm
# against fitModel in main.
clusters = {i: [i] for i in range(len(model.labels_))}

print(get_clusters(clusters, cluster_combine_order, 8))

{16: [3, 4, 7, 0, 1], 17: [5, 8, 9, 2, 6]}


In [23]:
# Map the nearest embeddings to 2d
# UMAP's default n_neighbors (15) exceeds the number of rows here (neighbours
# plus the query), so cap it just below the sample count, and pin random_state
# so the layout is the same on every run.
reducer = umap.UMAP(
    n_neighbors=max(2, min(15, len(all_embeddings) - 1)),
    random_state=42,
)
umap_embeds = reducer.fit_transform(all_embeddings)

  warn(


In [25]:
# Peek at the 2-D coordinates UMAP produced for the first row.
umap_embeds[0]

array([ 9.110998 , -6.9049573], dtype=float32)

In [26]:
# Display the linkage table.
# NOTE(review): `linkages` is only created in a cell further down, so this
# cell fails on Restart & Run All — move it below that definition.
linkages

Unnamed: 0,item1,item2,dist,num_points
0,8.0,9.0,70.142855,2.0
1,2.0,6.0,80.084881,2.0
2,0.0,1.0,82.073211,2.0
3,3.0,4.0,87.078755,2.0
4,7.0,12.0,88.771337,3.0
5,10.0,11.0,89.000822,4.0
6,13.0,14.0,103.433034,5.0
7,5.0,15.0,111.71416,5.0
8,16.0,17.0,122.093246,10.0


In [5]:
# Wrap the data returned by plotDendrogram in a DataFrame for inspection.
linkages = pd.DataFrame(dendroData)

In [7]:
# Name the four columns: the two merged node ids, the merge distance and the
# size of the resulting cluster (presumably the scipy linkage-matrix layout —
# confirm against plotDendrogram in main).
linkages.columns = ['item1', 'item2', 'dist', 'num_points']

In [15]:
# Distinct cluster sizes that occur in the linkage table, in order of first
# appearance.
uniq_points = linkages['num_points'].unique()

In [22]:
# Walk the linkage rows whose `num_points` equals the first unique size
# (2.0 in the saved output below, i.e. merges of two single points).
# NOTE(review): `umapint` is not defined in any visible cell (it may come from
# `from main import *` or from stale kernel state) — confirm before relying on it.
# NOTE(review): both lookups are bare expressions whose results are discarded,
# and `clusters` / `cluster` are never filled, so this cell looks unfinished.
# The saved output below (printed rows) does not match this code.
# NOTE(review): r.item1 / r.item2 are floats (see the float64 rows below);
# cast to int first if `umapint` is a positional array.
clusters = []
for index, r in linkages[linkages['num_points']==uniq_points[0]].iterrows():
    cluster = []
    umapint[r.item1]
    umapint[r.item2]

item1          8.000000
item2          9.000000
dist          70.142855
num_points     2.000000
Name: 0, dtype: float64
item1          2.000000
item2          6.000000
dist          80.084881
num_points     2.000000
Name: 1, dtype: float64
item1          0.000000
item2          1.000000
dist          82.073211
num_points     2.000000
Name: 2, dtype: float64
item1          3.000000
item2          4.000000
dist          87.078755
num_points     2.000000
Name: 3, dtype: float64
