# Similarities between combinations

## 1. Data import

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Read the NIAID queries CSV (comma-separated) into a DataFrame and display it
queries_path = './niaid_queries.csv'
queries_df = pd.read_csv(queries_path, sep=',')
queries_df

Unnamed: 0,combination,search term,result id
0,original query,influenza,PRJNA658522
1,original query,influenza,PRJNA658529
2,original query,influenza,PRJNA658552
3,original query,influenza,PRJNA658564
4,original query,influenza,PRJNA658562
...,...,...,...
3995,c15,hiv,VIVLI_02021d26-0e96-424c-8fec-77cbd204fd8d
3996,c15,hiv,VIVLI_7a8a7234-ada3-465b-a608-347b5e74a542
3997,c15,hiv,VIVLI_7ac8687d-79c8-4860-8141-83776849e2df
3998,c15,hiv,VIVLI_242c4b5a-19b3-4a51-b885-05358ed89db6


In [9]:
# Distinct values of the 'search term' column, in order of first appearance
pd.unique(queries_df['search term'])

array(['influenza', 'malaria therapeutics', 'long covid',
       'zika microcephaly', 'naegleria fowleri infection', 'asthma',
       'allergy treatment', 'allergen skin prick test',
       'sublingual immunotherapy', 'AIDS', 't-cell function',
       'immunotherapeutics', "addison's disease", 'cancer',
       'myocardial infarction', 'rational cancer drug design',
       'dendritic cells', 'mast cells', 'plasmacytoid dendritic cells',
       'pinealocyte', 'metabolomics', 'gwas', 'tuberculin skin test',
       'mycobacterium', 'hiv'], dtype=object)

## 2. Data wrangling

In [10]:
# Keep only the combination rows: drop every row whose 'combination' value is
# 'original query'. The deep copy makes combinations_df independent of queries_df.
is_original_query = queries_df['combination'] == 'original query'
original_query_labels = queries_df.index[is_original_query]
combinations_df = queries_df.drop(index=original_query_labels).copy(deep=True)
combinations_df

Unnamed: 0,combination,search term,result id
10,c1,influenza,PRJNA658522
11,c1,influenza,PRJNA658529
12,c1,influenza,PRJNA658552
13,c1,influenza,PRJNA658564
14,c1,influenza,PRJNA658562
...,...,...,...
3995,c15,hiv,VIVLI_02021d26-0e96-424c-8fec-77cbd204fd8d
3996,c15,hiv,VIVLI_7a8a7234-ada3-465b-a608-347b5e74a542
3997,c15,hiv,VIVLI_7ac8687d-79c8-4860-8141-83776849e2df
3998,c15,hiv,VIVLI_242c4b5a-19b3-4a51-b885-05358ed89db6


## 3. Similarity computation using the Jaccard index

In [11]:
# Collect the result ids of each combination into a set
# (yields a Series indexed by combination label, one set per label)
result_ids_by_combination = combinations_df.groupby('combination')['result id']
sets = result_ids_by_combination.apply(set)

# Jaccard similarity between sets
def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: |intersection| / |union|.

    Parameters
    ----------
    set1, set2 : set or frozenset
        The collections to compare.

    Returns
    -------
    float
        A value between 0.0 (no shared elements) and 1.0 (identical sets).
        Two empty sets are treated as identical and yield 1.0; without this
        guard the division below would raise ZeroDivisionError.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both sets are empty: the ratio is undefined, so use the common
        # convention that two empty sets are fully similar.
        return 1.0
    intersection_size = len(set1.intersection(set2))
    return intersection_size / union_size

# Calculate similarities: one entry per unordered pair of combinations, keyed
# as (earlier label, later label) following the order of sets.index
combination_labels = list(sets.index)
similarities = {
    (first, second): jaccard_similarity(sets[first], sets[second])
    for position, first in enumerate(combination_labels)
    for second in combination_labels[position + 1:]
}

print("\nJaccard similarities:")
for (combi1, combi2), similarity in similarities.items():
    print(f"Similarity between set_{combi1} and set_{combi2}: {similarity}")


Jaccard similarities:
Similarity between set_c1 and set_c10: 0.3028720626631854
Similarity between set_c1 and set_c11: 0.8587360594795539
Similarity between set_c1 and set_c12: 0.7361111111111112
Similarity between set_c1 and set_c13: 0.5625
Similarity between set_c1 and set_c14: 0.42165242165242167
Similarity between set_c1 and set_c15: 0.3351206434316354
Similarity between set_c1 and set_c2: 0.7241379310344828
Similarity between set_c1 and set_c3: 0.5290519877675841
Similarity between set_c1 and set_c4: 0.38227146814404434
Similarity between set_c1 and set_c5: 0.2794871794871795
Similarity between set_c1 and set_c6: 0.9455252918287937
Similarity between set_c1 and set_c7: 0.7421602787456446
Similarity between set_c1 and set_c8: 0.5290519877675841
Similarity between set_c1 and set_c9: 0.40168539325842695
Similarity between set_c10 and set_c11: 0.3028720626631854
Similarity between set_c10 and set_c12: 0.3633879781420765
Similarity between set_c10 and set_c13: 0.4590643274853801
Simil