In [6]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
import json 
import hashlib

In [4]:
# Load the raw Koso export and build `tasks_df`, one row per task in the graph.
PROJECT_ID = 'CZVDD94wT5KrFwAv1hhejg'
EXPORT_PATH = 'koso-dogfood-export-2025-7-5-12-2.json'

try:
    with open(EXPORT_PATH, 'r') as fp:
        koso_raw_data = json.load(fp)

    # Read the project ID once, with the same key used for the comparison, so the
    # mismatch message reports the value that was actually checked.
    found_project_id = koso_raw_data.get('projectId')

    if found_project_id == PROJECT_ID:
        # The 'graph' mapping's values are the task records; list() gives pandas
        # a plain sequence of dicts to build rows from.
        tasks_df = pd.DataFrame(list(koso_raw_data['graph'].values()))
        print(f'{len(tasks_df)} tasks have been successfully loaded from the DataFrame')

        # Rows without an ID or a name cannot be compared later, so drop them.
        tasks_df = tasks_df.dropna(subset=['id', 'name']).reset_index(drop=True)
        print(f'Filtered to {len(tasks_df)} tasks with valid IDs and names')
    else:
        print(f'Skipping file: Project ID "{found_project_id}" does not match target ID "{PROJECT_ID}".')
        # Keep the columns later cells select so they see an empty frame
        # instead of failing with a KeyError on ['id', 'name'].
        tasks_df = pd.DataFrame(columns=['id', 'name'])
except FileNotFoundError:
    print("Error: File not found. Please ensure it's in the same directory")
    # Re-raise rather than exit(): execution stops here with the original
    # traceback instead of ending the interpreter session.
    raise
except json.JSONDecodeError:
    print('Error: Could not decode JSON. Check file format')
    raise
except KeyError as err:
    # The missing key may be 'graph' or one of the 'id'/'name' columns; report
    # whichever one was actually missing.
    print(f"Error: expected key {err} not found in the JSON data. Ensure JSON structure is as expected.")
    raise

1180 tasks have been successfully loaded from the DataFrame
Filtered to 1180 tasks with valid IDs and names


In [8]:
# Mock embeddings so the rest of the notebook can run without a real model.
def create_mock_embedding(text, size=10):
    """Return a deterministic pseudo-random vector derived from ``text``.

    Args:
        text: String to embed; it is UTF-8 encoded and hashed with SHA-256.
        size: Length of the returned vector.

    Returns:
        A 1-D ``np.ndarray`` of ``size`` floats. The same ``text`` and ``size``
        always produce the same vector.
    """
    hash_value = int(hashlib.sha256(text.encode('utf-8')).hexdigest(), 16)
    # Seed a private generator from the hash: an unseeded global RNG would give
    # a different vector for the same text on every call.
    rng = np.random.default_rng(hash_value)
    # The +1 keeps the scale strictly positive, so the result is never the
    # all-zero vector (for which cosine similarity is not meaningful).
    scale = ((hash_value % 1000) + 1) / 1000.0
    return rng.random(size) * scale

print('Generating mock embeddings and clustering data...')

# Work on a copy of the two needed columns so tasks_df itself is not modified.
sdf = tasks_df[['id', 'name']].copy()
# Pass the helper itself to .apply so it is *called* on every name; wrapping it
# in a lambda that merely returns the function would store the function object
# in each row instead of an embedding vector.
sdf['embedding'] = sdf['name'].apply(create_mock_embedding)

Generating mock embeddings and clustering data...


KeyError: "None of [Index(['id', 'name'], dtype='object')] are in the [columns]"

In [10]:
# Build mock cluster labels and mock similarity scores, keyed by task name.

unique_names = tasks_df['name'].unique()

# Round-robin the distinct names over 5 mock clusters (labels 0..4).
name_to_cluster = {}
# Simulated score per name: 0.85 plus up to 0.14, driven by the last two hex
# digits of the name's SHA-256 digest.
name_to_similarity = {}
for position, task_name in enumerate(unique_names):
    name_to_cluster[task_name] = position % 5
    last_byte = int(hashlib.sha256(task_name.encode('utf-8')).hexdigest()[-2:], 16)
    name_to_similarity[task_name] = 0.85 + (last_byte / 255) * 0.14

# Attach both mock values to a copy of the id/name columns.
jdf = tasks_df[['id', 'name']].copy().assign(
    cluster=lambda frame: frame['name'].map(name_to_cluster),
    similarity=lambda frame: frame['name'].map(name_to_similarity),
)

print('Mock data generation complete. Procceeding with deduplication')

Mock data generation complete. Procceeding with deduplication


In [11]:
# Accumulator for every potential duplicate pair that will be handed to the frontend.
potiental_duplicate_pairs = list()

# Pair keys that have already been compared, so no pair is processed twice.
checked_pairs = set()

# Similarity cut-offs: the lower one decides whether a pair is recorded at all,
# the higher one marks a recorded pair as an auto-approve candidate.
PROMPT_THRESHOLD, AUTO_APPROVE_THRESHOLD = 0.85, 0.95

print('Identifying potiental duplicate pairs for frontend presentation...')

Identifying potiental duplicate pairs for frontend presentation...


In [12]:
# Compare every pair of tasks inside each cluster and record the pairs whose
# embedding cosine similarity reaches PROMPT_THRESHOLD.

# Resolve each task ID to its embedding once, instead of rescanning `sdf` for
# every pair. setdefault keeps the first row seen for an ID.
embedding_by_id = {}
for known_id, known_embedding in zip(sdf['id'], sdf['embedding']):
    embedding_by_id.setdefault(known_id, known_embedding)

for cluster_id, group in jdf.groupby('cluster'):
    if cluster_id == -1:
        continue  # cluster -1 is treated as the outlier label and is skipped

    group_records = group.sort_values(by='similarity', ascending=False).to_dict('records')

    # Visit each unordered pair of records in the cluster exactly once.
    for i, first_task in enumerate(group_records):
        for second_task in group_records[i + 1:]:
            task1_id = first_task['id']
            task2_id = second_task['id']
            task1_name = first_task['name']
            task2_name = second_task['name']

            # Key the pair on task IDs, order-independent, so (1, 2) equals (2, 1).
            # Keying on names would collapse distinct task pairs that happen to
            # share the same names and silently drop all but the first of them.
            pair_key = tuple(sorted([task1_id, task2_id]))
            if pair_key in checked_pairs:
                continue
            checked_pairs.add(pair_key)  # mark pair as checked

            if task1_id not in embedding_by_id or task2_id not in embedding_by_id:
                print(f'Warning: Missing embedding for ID {task1_id} or {task2_id}. Skipping Pair.')
                continue

            emb1 = embedding_by_id[task1_id]
            emb2 = embedding_by_id[task2_id]

            # Only ndarray embeddings can be reshaped and compared below.
            if not isinstance(emb1, np.ndarray) or not isinstance(emb2, np.ndarray):
                print(f"Skipping pair due to non-ndarray embedding type for: '{task1_name}' ({task1_id}) and '{task2_name}' ({task2_id}).")
                continue

            # cosine_similarity expects 2-D inputs, hence the reshape to one row
            # each; the result is a 1x1 matrix. Cast to a plain float so the
            # record holds built-in types only.
            sim = float(cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0][0])

            if sim >= PROMPT_THRESHOLD:
                potiental_duplicate_pairs.append({
                    'task1_id': task1_id,
                    'task1_name': task1_name,
                    'task2_id': task2_id,
                    'task2_name': task2_name,
                    'similarity': sim,
                    'auto_approve_candidate': bool(sim >= AUTO_APPROVE_THRESHOLD),
                })



NameError: name 'jdf' is not defined

In [22]:
# Turn the collected pair records into a DataFrame, then report and export them.
potiental_duplicate_pairs_df = pd.DataFrame(potiental_duplicate_pairs)

if potiental_duplicate_pairs_df.empty:
    print("\nNo potential duplicate pairs found above the PROMPT_THRESHOLD.")
else:
    # Most similar pairs first, with a fresh 0..n-1 index.
    potiental_duplicate_pairs_df = (
        potiental_duplicate_pairs_df
        .sort_values(by='similarity', ascending=False)
        .reset_index(drop=True)
    )

    print("\nSuccessfully identified potential duplicate pairs.")
    print(potiental_duplicate_pairs_df)

    # Frontend hand-off: fixed column order, written as CSV without the index.
    csv_file_path = 'potential_duplicates.csv'
    output_columns = ['task1_id', 'task1_name', 'task2_id', 'task2_name', 'similarity', 'auto_approve_candidate']
    csv_output_df = potiental_duplicate_pairs_df[output_columns]
    csv_output_df.to_csv(csv_file_path, index=False)
    print(f"\nPotential duplicate pairs saved to {csv_file_path}")




No potential duplicate pairs found above the PROMPT_THRESHOLD.
