In [34]:
import pandas as pd
import numpy as np
import sqlite3
import json

from sklearn.cluster import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 300

In [79]:
with open("koso-dogfood-export-2025-2-30-16-23.json", "r") as fp:
    koso_data = json.load(fp)

In [87]:
koso_data["graph"].values()



In [81]:
kdf = pd.DataFrame(koso_data["graph"].values())
kdf.head()

Unnamed: 0,id,num,name,children,assignee,reporter,status,statusTime,url,kind
0,955b2543-2c2c-402a-bf3c-a6ee3af600c4,362,Update all dependencies,[],,,Done,1740170000000.0,https://github.com/kosolabs/koso/pull/658,github_pr
1,17eebd97-1408-4ef8-a09f-5dd6274b6a70,169,circular-progress,[],shadanan@gmail.com,shadanan@gmail.com,Done,,,
2,86693605-0eb8-4f63-b415-bf98daff8dd9,173,chip,[],shadanan@gmail.com,shadanan@gmail.com,Done,,,
3,37efb8eb-0be0-4b7c-a8b2-fa5a64a24294,218,Avoid flash of default content when loading project,[],leonhard.kyle@gmail.com,leonhard.kyle@gmail.com,Done,1729898000000.0,,
4,3b294b94-360d-4434-a60c-2ba597487716,4,Implement project sharing,[],leonhard.kyle@gmail.com,leonhard.kyle@gmail.com,Done,,,


In [82]:
kdf.shape

(636, 10)

In [35]:
db_file = "koso-tasks-emebeddings.sqlite"
db = sqlite3.connect(db_file, autocommit=True)

df = pd.read_sql("SELECT * FROM embeddings", db)
df["embedding"] = df["embedding"].apply(json.loads)

In [36]:
df.head()

Unnamed: 0,id,name,embedding
0,955b2543-2c2c-402a-bf3c-a6ee3af600c4,Update all dependencies,"[-0.04262385, -0.0137202265, 0.050563354, -0.030582674, 0.0036159514, 0.030054972, 0.045718096, 0.052002538, -0.037226908, -0.0029698175, 0.011183664, 0.009096846, -0.054305233, 0.01088983, 0.011303596, -0.016670555, 0.008053437, -0.004089683, -0.017689977, 0.05852684, -0.015591167, -0.03408469, 0.031086387, -0.003304128, -0.013108573, 0.02345871, 0.024825934, 0.016610589, 0.032021858, -0.020736251, -0.006638239, -0.015615153, -0.012221076, 0.031925913, -0.010266184, -0.02432222, 0.011405538, -0.008089417, -0.0061165346, -0.023002967, 0.07488557, -0.036075562, 0.008892962, -0.035523873, -0.021827633, 0.015507214, -0.03019889, 0.005115102, -0.0075797057, 0.0037358834, -0.030174904, -0.03113436, 0.030846523, 0.015327316, -0.0058586807, -0.023626614, 0.015291337, -0.029839095, 0.019488959, -0.0047732955, 0.03401273, -0.008227339, -0.013648268, -0.043079592, 0.008215345, 0.0041256626, 0.008743047, -0.040728923, 0.019752808, -0.041376557, -0.022295369, 0.031038415, -0.051618755, -0.034900226, -0.035907656, 0.03708299, 0.034756307, 0.026840793, -0.019656863, 0.012928675, -0.024154315, -0.009432656, -0.027752277, -0.043127567, -0.036099546, -0.03523604, -0.06514709, 0.00019376523, -0.0169464, 0.014787622, -0.016610589, 0.0053519676, -0.06394777, 0.052146457, -0.04370324, -0.020232538, -0.0016040911, -0.039601564, 0.015855018, 0.054545097, ...]"
1,3b294b94-360d-4434-a60c-2ba597487716,Implement project sharing,"[-0.008109256, -0.041775838, 0.06633781, 0.016394163, 0.010597655, -0.02224922, -0.005269552, 0.031529486, -0.03668194, -0.04505467, 0.020858644, -0.019716907, 0.001230477, -0.015369527, -0.0054671606, -0.022117482, -0.011614972, -0.05843348, 0.005712341, 0.037267443, 0.03624281, 0.025308488, 0.031675864, 0.032729775, -0.044352066, -0.04493757, -0.020565892, 0.018560534, 0.018355606, -0.009594976, -0.010400047, -0.035335276, 0.022732263, 0.027694425, 0.01952662, 0.013883807, -0.0068833525, 0.025703704, -0.0059209275, 0.0008457814, 0.03425209, -0.02728457, 0.0037801717, 0.025601242, -0.012807939, 0.033344556, 0.0056464714, -0.02927529, -0.0067040413, 0.0703778, 0.011329537, -0.04935814, -0.0054415446, -0.050324224, -0.024561968, -0.007714039, 0.024693707, -0.03126601, -0.07939459, -0.0046767276, -0.0052183205, 0.0054122694, 0.01970227, 0.00016238638, -0.016525902, 0.017872564, -0.043649457, -0.0067845485, 0.01820923, 0.005265893, -0.03524745, 0.03758947, -0.015794018, -0.035715856, 0.0025213344, 0.008921645, -0.056354932, 0.027050368, 0.03846773, -0.0090167895, 0.05333958, -0.0021407555, -0.013510547, 0.00064908806, -0.057233192, -0.05369088, -0.09239282, 0.00031425196, -0.02797254, -0.008650849, -0.043356705, 0.006385673, -0.018794736, 0.0015021883, 0.04830423, -0.0022981102, -0.035393827, -0.010604974, -0.0068174833, 0.05626711, ...]"
2,37efb8eb-0be0-4b7c-a8b2-fa5a64a24294,Avoid flash of default content when loading project,"[-0.011238622, -0.01843505, 0.010218133, 0.00065975543, -0.023272429, -0.03082669, 0.028891739, -0.043364115, 0.02066157, 0.009953071, 0.012994656, 0.025525454, 0.0129217645, 0.009774155, 0.030588135, -0.017202513, -0.01225911, -0.03249658, 0.00840246, 0.024942318, 0.0071102837, -0.038858064, -0.03082669, 0.044397857, -0.026161602, -0.030720666, -0.012318748, 0.05118344, 0.08179808, 0.009575359, 0.02096639, -0.037241187, 0.038699027, 0.0054702135, 0.018541075, -0.022053145, 0.02777848, 0.019906143, 0.0046087625, 0.008733788, -0.0027914324, -0.028573666, 0.022782065, 0.008051253, 0.046465337, 0.019601323, 0.004953343, 0.045166537, -0.030349579, 0.088689685, 0.016937451, -0.0071235364, 0.005914192, 0.01461816, -0.040448435, -0.018289266, -0.013531406, 0.0068187155, -0.05783649, -0.003972614, 0.058631677, 0.02722185, -0.011954288, 0.034378517, 0.024557978, 0.0037572514, -0.052853327, 0.0016856275, 0.02353749, 0.007786191, -0.039547224, -0.0445834, -0.015850697, -0.028388122, -0.05205814, 0.0006875041, 0.012908511, -0.0060301567, 0.01881939, -0.034139965, 0.045590635, 0.012378387, -0.07193778, -0.056458168, -0.03639299, 0.023351947, -0.07988963, 0.0056425035, -0.014207314, -0.04389424, -0.06281965, 0.01805071, -0.029421862, -0.026824256, -0.037612274, 0.013266345, 0.009946445, 0.01594347, 0.041482177, 0.078776374, ...]"
3,17eebd97-1408-4ef8-a09f-5dd6274b6a70,circular-progress,"[0.009646903, -0.01339253, 0.03883222, 0.026204105, -0.010189636, 0.06525036, 0.024675278, 0.013079121, 0.027702356, 0.0031340967, 0.03501015, -0.038037233, -0.018162472, -0.028053988, 0.038801644, 0.06573959, -0.02865023, -0.042868327, 0.009043016, 0.044733495, 0.04895306, 0.0077052913, 0.04036105, 0.06372154, -0.0199512, -0.006065624, -0.003273602, 0.081639394, 0.013216714, 0.0029219717, -0.03858761, -0.043418705, -0.022336172, -0.009058304, 0.0243848, 0.021801082, -0.0011819749, 0.06867494, -0.009081236, 0.03635552, -0.014332759, -0.03568284, 0.02342164, -0.007579163, 0.027534185, 0.017321616, -0.057331037, 0.009700411, -0.009104169, 0.07295565, 0.011649666, 0.00892071, 0.021846946, 0.04461119, -0.011466207, -0.012521098, -0.0057866126, -0.011458563, -0.022641936, -0.002986947, 0.058126025, -0.036416676, 0.078704044, -0.0049610455, 0.005740748, 0.022366747, -0.056352586, 0.01906448, 0.01019728, 0.03406228, 0.031891346, 0.017856706, -0.017000563, 0.026402853, 0.04026932, 0.020853208, 0.05393704, -0.023452215, 0.05384531, 0.03378709, -0.016083267, -0.012536387, -0.0067612403, -0.026815636, 0.013316088, -0.027197843, -0.013056188, -0.027732933, 0.008729606, -0.00020937771, 0.012261198, 0.020318119, -0.042287372, 0.03705878, 0.023299333, -0.005759858, -0.03192192, 0.034551505, 0.059379663, 0.037914924, ...]"
4,86693605-0eb8-4f63-b415-bf98daff8dd9,chip,"[0.024372576, -0.035102338, -0.039083514, -0.044132814, 0.023353007, -0.029106293, 0.03138819, -0.0077256695, -0.033233125, -0.056950264, 0.046414707, 0.0048126127, -0.023340868, 0.02381424, -0.020075817, -0.02290391, -4.9641447e-05, -0.053891554, 0.025489248, -0.01437108, 0.012149875, 0.052871983, 0.046317603, -0.00019818649, 0.047992613, 0.0046335813, 0.022685431, 0.0044818595, 0.013873434, -0.0087634465, 0.001614319, -0.033063196, 0.005422534, -0.012410836, -0.03867083, -0.009904394, -0.037044372, 0.0037414574, 0.0192019, 0.012052773, -0.011257751, 0.0060688686, -0.0017311447, 0.009528124, -0.024639606, -0.044763975, -0.06816553, -0.032213554, -0.020694843, 0.08763446, -0.012125599, 0.0013525991, -0.04418136, 0.035515018, -0.01356999, 0.032383483, -0.0179153, 0.01802454, 0.0070520253, 0.018995559, -0.020257883, -0.0034926338, -0.004099521, -0.015803333, -0.032043625, -0.02665447, -0.029470425, 0.05170676, -0.01614319, 0.03410704, -0.0022014822, 0.018121641, -0.04340455, -0.010990721, 0.016385945, 0.040078808, -0.024202649, -0.027649766, 0.054522716, 0.083847485, -0.027601214, 0.0112395445, 0.037845463, -0.020051543, 0.018752804, -0.050881393, -0.052580677, 0.023122389, 0.009394608, -0.029543253, 0.004415102, 0.023292318, -0.0062266593, 0.026630195, 0.012562558, 0.02842658, -0.04437557, -0.006469414, 0.0061811428, 0.003717182, ...]"


In [37]:
df.shape

(636, 3)

In [38]:
duplicated_ids = df["id"].value_counts()[lambda s : s > 1].index

In [40]:
df["id"].nunique()

636

In [41]:
df["name"].nunique()

609

In [42]:
df["name"].value_counts()[lambda s : s > 1]

name
Update all dependencies                                         18
Bump the cargo-deps group across 1 directory with 2 updates      3
Bump the docker-deps group with 2 updates                        3
Lock file maintenance                                            3
Bump the npm-deps group across 1 directory with 12 updates       2
Upgrade bits-ui                                                  2
Fix staggered spacing when task name wraps                       2
Fix dialog box not triggering onSelect when clicking outside     2
Name: count, dtype: int64

In [43]:
df = df.drop_duplicates(subset=["id", "name"])
df.shape

(636, 3)

In [44]:
sdf = df
embeddings = list(sdf["embedding"].values)
X = np.array(embeddings)

In [45]:
model = HDBSCAN(
    min_cluster_size=2,
    store_centers="centroid",
).fit(X)

In [46]:
sdf["cluster"] = model.labels_

In [48]:
sdf.drop(columns=["embedding"]).head()

Unnamed: 0,id,name,cluster
0,955b2543-2c2c-402a-bf3c-a6ee3af600c4,Update all dependencies,46
1,3b294b94-360d-4434-a60c-2ba597487716,Implement project sharing,77
2,37efb8eb-0be0-4b7c-a8b2-fa5a64a24294,Avoid flash of default content when loading project,62
3,17eebd97-1408-4ef8-a09f-5dd6274b6a70,circular-progress,122
4,86693605-0eb8-4f63-b415-bf98daff8dd9,chip,-1


In [49]:
sdf["cluster"].value_counts()

cluster
-1      209
 46      16
 75       8
 116      6
 117      6
 10       6
 143      5
 89       5
 5        5
 22       5
 24       5
 118      5
 90       5
 146      5
 124      5
 125      5
 98       4
 36       4
 32       4
 128      4
 13       4
 54       4
 51       4
 107      4
 113      4
 80       4
 43       4
 93       4
 114      4
 37       4
 30       4
 147      3
 141      3
 84       3
 77       3
 65       3
 71       3
 148      3
 74       3
 15       3
 21       3
 63       3
 108      3
 11       3
 87       3
 58       3
 81       3
 49       3
 68       3
 2        3
 133      3
 121      3
 131      3
 104      3
 50       3
 12       3
 78       3
 140      3
 59       3
 92       3
 55       3
 7        3
 70       3
 106      3
 126      3
 95       3
 33       2
 31       2
 23       2
 94       2
 149      2
 120      2
 82       2
 112      2
 103      2
 134      2
 129      2
 119      2
 53       2
 9        2
 138      2
 6        2
 97     

In [51]:
sdf[sdf["cluster"] == -1].drop(columns=["embedding"]).head(20)

Unnamed: 0,id,name,cluster
4,86693605-0eb8-4f63-b415-bf98daff8dd9,chip,-1
7,86a5f6d0-2e61-43f0-9e0e-42b0ccefc96a,Improve selection handling,-1
10,0f97e324-02c5-4e39-88e8-01fbfb989899,Bump openssl from 0.10.68 to 0.10.70 in the cargo group across 1 directory,-1
13,0bb22800-94e1-48bf-8933-b371f3f578ef,Add Koso task number to GitHub PR to link PR to Koso task (koso#<ID>),-1
16,5e0da712-b0f0-4515-8ae0-e9da8dd04004,sonner,-1
18,cf7ae1da-e154-497d-9c5a-e25c3eb82909,Make sure teloxide always shuts down,-1
21,e22a2e9f-d491-4f5d-8ae0-71e10a2da0c4,Migrate search for tasks component to Kosui,-1
30,32621375-c3b8-4e53-9ee0-f7bf25393001,Tooltips for what will happen when you drag and drop,-1
32,8930a2f2-ebc9-4df3-958b-acf6dacd27da,"Revert ""Update all dependencies""",-1
34,11ac1aae-0062-48ca-a22f-753a70b5ca42,Update Zero icons,-1


In [52]:
sdf["cluster"].nunique()

151

In [76]:
similarities = []
for _, row in sdf.iterrows():
    cluster = row["cluster"]
    if cluster == -1:
        similarities.append(None)
    else:
        centroid = model.centroids_[cluster]
        embedding = row["embedding"]
        similarity = cosine_similarity([centroid], [row["embedding"]])
        # The embedding is in a square bracket of a squarebracket, so need to index it by [0][0]
        similarities.append(similarity[0][0])
sdf["similarity"] = similarities

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret 

In [74]:
cosine_similarity([centroid], [np.array(row["embedding"])])

  ret = a @ b
  ret = a @ b
  ret = a @ b


array([[0.93121516]])

In [75]:
centroid @ np.array(row["embedding"])

  centroid @ np.array(row["embedding"])
  centroid @ np.array(row["embedding"])
  centroid @ np.array(row["embedding"])


np.float64(0.8437061174133655)

In [91]:
jdf = sdf.drop(columns=["embedding"]).merge(kdf[["id", "num", "status", "kind"]], on="id")
jdf.head()

Unnamed: 0,id,name,cluster,similarity,num,status,kind
0,955b2543-2c2c-402a-bf3c-a6ee3af600c4,Update all dependencies,46,1.0,362,Done,github_pr
1,3b294b94-360d-4434-a60c-2ba597487716,Implement project sharing,77,0.85318,4,Done,
2,37efb8eb-0be0-4b7c-a8b2-fa5a64a24294,Avoid flash of default content when loading project,62,0.887527,218,Done,
3,17eebd97-1408-4ef8-a09f-5dd6274b6a70,circular-progress,122,0.972294,169,Done,
4,86693605-0eb8-4f63-b415-bf98daff8dd9,chip,-1,,173,Done,


In [90]:
jdf.groupby("cluster")["similarity"].min().min()

np.float64(0.5160755223357039)

In [None]:
jdf.to_csv("koso-for-deduping.csv")

In [98]:
jdf[jdf["cluster"] == 82]

Unnamed: 0,id,name,cluster,similarity,num,status,kind
232,4c634786-3fbe-4a61-8a8a-c308fcb2041e,Split Dialog component into Dialog and Modal,82,0.951613,490,Done,github_pr
616,909d9649-6587-4bb1-b520-e31130040052,Split dialogs into Dialoguer provider and Dialog component,82,0.951613,432,Done,github_pr


In [None]:
jdf[(jdf["cluster"] != -1) & (jdf["similarity"] > .93)].sort_values(by=["cluster", "similarity"], ascending=[True, False])

Unnamed: 0,id,name,cluster,similarity,num,status,kind
499,11f59951-2c98-41c1-94b4-a724d265e2b7,Client poller,9,0.947633,271,Done,
531,067214a4-591b-4abb-bb36-2cf629a4c828,Implement Poller,9,0.947633,310,Done,
277,5ddc7c74-77e8-42a7-8909-1b68992012c4,Lock file maintenance,11,1.0,421,Done,github_pr
339,61e38ea1-b6cf-4a48-b749-6bc7c0b9fe2c,Lock file maintenance,11,1.0,367,Done,github_pr
395,5c21fbe7-8b76-4a9d-b92e-a00b56ab4f2a,Lock file maintenance,11,1.0,548,Done,github_pr
170,660e1b42-a0c4-4023-b2e5-8137a9ce244f,Log error messages on healthz check failure,13,0.959316,370,Done,github_pr
376,cec9be3e-1694-4e6c-9d98-71614c0a0c8f,Log even more error messages on healthz check failure,13,0.948965,371,Done,github_pr
96,cb826f37-3c2b-47f1-8335-8d3d7673ed7d,Heartbeat monitor server,14,0.980361,255,In Progress,
288,38ff107a-2c37-41d4-b01d-11056ab4d84f,Heartbeat monitor client,14,0.980361,269,,
597,2bc1f6c7-a9d3-4d46-a6b9-720aa404334d,Change 'insert' label to 'add',18,0.953071,473,Done,github_pr
