In [7]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by this name; `model` is reused
# later in the notebook to encode text into vectors.
model = SentenceTransformer("sentence-transformers/gtr-t5-large")

# 1. Prepare data

I will prototype this approach by extracting the relations between a subset of the Open Targets data. The subset will be the following:
- Molecules.
- Drug indications.
- Drug mechanisms of action.
- Diseases.
- Targets.

I am going to evaluate the model by the ability to find these relationships:
- `drug_df.linkedDiseases` is a list of `disease_df.id` values.
- `drug_df.linkedTargets` is a list of `target_df.id` values.
- `indication_df.approvedIndications` is a list of `disease_df.id` values.
- `indication_df.id` is a single `drug_df.id` value (one drug per row).
- `moa_df.targets` is a list of `target_df.id` values.
- `moa_df.chemblIds` is a list of `drug_df.id` values.

In [None]:
# Download data from 22.11 release
# !wget --recursive --no-parent --no-host-directories --cut-dirs 8 ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.11/output/etl/parquet/molecule --accept "*.parquet" --directory-prefix data
# !wget --recursive --no-parent --no-host-directories --cut-dirs 8 ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.11/output/etl/parquet/indication --accept "*.parquet" --directory-prefix data
# !wget --recursive --no-parent --no-host-directories --cut-dirs 8 ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.11/output/etl/parquet/mechanismOfAction --accept "*.parquet" --directory-prefix data
# !wget --recursive --no-parent --no-host-directories --cut-dirs 8 ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.11/output/etl/parquet/diseases --accept "*.parquet" --directory-prefix data
# !wget --recursive --no-parent --no-host-directories --cut-dirs 8 ftp://ftp.ebi.ac.uk/pub/databases/opentargets/platform/22.11/output/etl/parquet/targets --accept "*.parquet" --directory-prefix data

In [23]:
import os
import pandas as pd

def _read_parquet_dir(directory: str) -> pd.DataFrame:
    """Concatenate every ``*.parquet`` part file found directly inside ``directory``.

    Part files are read in sorted name order so the resulting row order is
    reproducible (``os.listdir`` returns entries in arbitrary order), and
    entries that are not parquet files are skipped instead of crashing
    ``pd.read_parquet``.

    Args:
        directory: Path of the folder holding the parquet part files.

    Returns:
        A single DataFrame with the rows of all part files. The per-file index
        is kept as-is, so index labels repeat across parts.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        ValueError: If the folder contains no parquet files (raised by ``pd.concat``).
    """
    part_files = sorted(
        entry for entry in os.listdir(directory) if entry.endswith(".parquet")
    )
    return pd.concat(
        [pd.read_parquet(os.path.join(directory, entry)) for entry in part_files]
    )


drug_df = _read_parquet_dir("data/molecule")
indication_df = _read_parquet_dir("data/indication")
moa_df = _read_parquet_dir("data/mechanismOfAction")
disease_df = _read_parquet_dir("data/diseases")
target_df = _read_parquet_dir("data/targets")

In [47]:
# Show the schema summary and the first rows of every loaded table.
for df in [drug_df, indication_df, moa_df, disease_df, target_df]:
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line to the output.
    df.info()
    display(df.head(5))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12854 entries, 0 to 59
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         12854 non-null  object 
 1   canonicalSmiles            10260 non-null  object 
 2   inchiKey                   10260 non-null  object 
 3   drugType                   12845 non-null  object 
 5   name                       12854 non-null  object 
 6   yearOfFirstApproval        3103 non-null   float64
 7   maximumClinicalTrialPhase  12845 non-null  float64
 8   parentId                   1451 non-null   object 
 9   hasBeenWithdrawn           12845 non-null  object 
 10  isApproved                 12845 non-null  object 
 11  withdrawnNotice            232 non-null    object 
 12  tradeNames                 12854 non-null  object 
 13  synonyms                   12854 non-null  object 
 14  crossReferences            9338 non-null   object

Unnamed: 0,id,canonicalSmiles,inchiKey,drugType,blackBoxWarning,name,yearOfFirstApproval,maximumClinicalTrialPhase,parentId,hasBeenWithdrawn,isApproved,withdrawnNotice,tradeNames,synonyms,crossReferences,childChemblIds,linkedDiseases,linkedTargets,description
0,CHEMBL1096896,CCNC(=O)[C@H]1O[C@@H](n2cnc3c(NCC(c4ccccc4)c4c...,ZOTHAEBAWXWVID-HXEFRTELSA-N,Small molecule,False,UK432097,,2.0,,False,False,,[],[Uk432097],"[(drugbank, [DB12691])]",,"{'rows': ['EFO_0000341'], 'count': 1}","{'rows': ['ENSG00000128271'], 'count': 1}",Small molecule drug with a maximum clinical tr...
1,CHEMBL1200698,,,Small molecule,False,PENTETATE CALCIUM TRISODIUM YB 169,1976.0,4.0,CHEMBL2110560,False,True,,[Ytterbium yb 169 dtpa],"[169yb, COMPOUND 24266, MATERIAL A, MATERIAL-A...",,,,"{'rows': [], 'count': 0}",Small molecule drug with a maximum clinical tr...
2,CHEMBL1200747,CC(O)C(=O)O.N,RZOBLYBZQXQGFY-UHFFFAOYSA-N,Small molecule,False,AMMONIUM LACTATE,1985.0,4.0,,False,True,,"[Ammonium lactate, Lac-hydrin]","[Ammonium lactate, BMS-186091, E328, Lac hydri...","[(DailyMed, [ammonium%20lactate]), (DrugCentra...",,,"{'rows': [], 'count': 0}",Small molecule drug with a maximum clinical tr...
3,CHEMBL1200869,Cc1ncc([N+](=O)[O-])n1CCO.Cl,FPTPAIQTXYFGJC-UHFFFAOYSA-N,Small molecule,False,METRONIDAZOLE HYDROCHLORIDE,1980.0,4.0,CHEMBL137,False,True,,"[Flagyl, Flagyl i.v., Metronidazole hydrochlor...","[Metronidazole hcl, Metronidazole hydrochlorid...","[(chEBI, [50687])]",,"{'rows': ['EFO_0000574', 'EFO_0003102', 'EFO_0...","{'rows': [], 'count': 0}",Small molecule drug with a maximum clinical tr...
4,CHEMBL1201190,Cl.Cl.O=C(O)COCCN1CCN([C@H](c2ccccc2)c2ccc(Cl)...,PGLIUCLTXOYQMV-GHVWMZMZSA-N,Small molecule,False,LEVOCETIRIZINE DIHYDROCHLORIDE,2007.0,4.0,CHEMBL1201191,False,True,,"[Levocetirizine dihydrochloride, Xusal, Xyzal,...","[Cetirizine (r)-form dihydrochloride, Levoceti...","[(DailyMed, [levocetirizine%20dihydrochloride])]",,"{'rows': ['EFO_0007141', 'HP_0012735', 'HP_000...","{'rows': ['ENSG00000196639'], 'count': 1}",Small molecule drug with a maximum clinical tr...


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8334 entries, 0 to 27
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   8334 non-null   object
 1   indications          8334 non-null   object
 2   approvedIndications  8334 non-null   object
 3   indicationCount      8334 non-null   int32 
dtypes: int32(1), object(3)
memory usage: 293.0+ KB
None


Unnamed: 0,id,indications,approvedIndications,indicationCount
0,CHEMBL2146121,"[{'disease': 'EFO_0000565', 'efoName': 'leukem...",[],42
1,CHEMBL2108738,"[{'disease': 'EFO_0005922', 'efoName': 'esopha...","[MONDO_0009348, EFO_0000640, EFO_0000616, EFO_...",138
2,CHEMBL708,"[{'disease': 'MONDO_0005090', 'efoName': 'schi...","[MONDO_0005090, EFO_0004269, EFO_0005306, MOND...",23
3,CHEMBL1863514,"[{'disease': 'EFO_0000220', 'efoName': 'acute ...","[MONDO_0000873, EFO_0000220]",5
4,CHEMBL13209,"[{'disease': 'MONDO_0002050', 'efoName': 'depr...",[],6


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6610 entries, 0 to 36
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   actionType         6213 non-null   object
 1   mechanismOfAction  6610 non-null   object
 2   chemblIds          6610 non-null   object
 3   targetName         5849 non-null   object
 4   targetType         5849 non-null   object
 5   targets            5849 non-null   object
 6   references         6610 non-null   object
dtypes: object(7)
memory usage: 413.1+ KB
None


Unnamed: 0,actionType,mechanismOfAction,chemblIds,targetName,targetType,targets,references
0,INHIBITOR,Vanilloid receptor inhibitor,[CHEMBL3544953],Vanilloid receptor,single protein,[ENSG00000196689],"[{'source': 'Other', 'ids': ['http://www.resea..."
1,ANTAGONIST,Serotonin 2a (5-HT2a) receptor antagonist,"[CHEMBL1200916, CHEMBL479]",Serotonin 2a (5-HT2a) receptor,single protein,[ENSG00000102468],"[{'source': 'ISBN', 'ids': ['0443-059748 PP. 5..."
2,INHIBITOR,Polymerase acidic protein inhibitor,[CHEMBL4297215],Polymerase acidic protein,single protein,[],"[{'source': 'FDA', 'ids': ['https://www.access..."
3,INHIBITOR,Cyclooxygenase inhibitor,[CHEMBL521],Cyclooxygenase,protein family,"[ENSG00000073756, ENSG00000095303]","[{'source': 'ISBN', 'ids': ['0443-059748 PP. 2..."
4,,Unknown,[CHEMBL1334860],,,,"[{'source': 'PubMed', 'ids': ['12397207', '149..."


<class 'pandas.core.frame.DataFrame'>
Int64Index: 22274 entries, 0 to 104
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   22274 non-null  object
 1   code                 22274 non-null  object
 2   dbXRefs              22274 non-null  object
 3   description          18226 non-null  object
 4   name                 22274 non-null  object
 5   directLocationIds    12 non-null     object
 6   obsoleteTerms        4265 non-null   object
 7   parents              22274 non-null  object
 8   sko                  1 non-null      object
 9   synonyms             14593 non-null  object
 10  ancestors            22274 non-null  object
 11  descendants          22274 non-null  object
 12  children             22274 non-null  object
 13  therapeuticAreas     22274 non-null  object
 14  indirectLocationIds  34 non-null     object
 15  ontology             22274 non-null  object
dtypes: obj

Unnamed: 0,id,code,dbXRefs,description,name,directLocationIds,obsoleteTerms,parents,sko,synonyms,ancestors,descendants,children,therapeuticAreas,indirectLocationIds,ontology
0,EFO_0000224,http://www.ebi.ac.uk/efo/EFO_0000224,"[NCIT:C3182, MONDO:0012883, UMLS:C0023487, EFO...",Acute promyelocytic leukemia (APL) is an aggre...,acute promyelocytic leukemia,,[EFO_0000202],[EFO_0000222],,"{'hasBroadSynonym': None, 'hasExactSynonym': [...","[MONDO_0004643, EFO_0004260, OTAR_0000006, EFO...",[],[],"[OTAR_0000006, EFO_0005803, MONDO_0045024, EFO...",,"{'isTherapeuticArea': False, 'leaf': True, 'so..."
1,EFO_0000384,http://www.ebi.ac.uk/efo/EFO_0000384,"[MedDRA:10011401, MONDO:0005011, NCIt:C27837, ...",A chronic transmural inflammation that may inv...,Crohn's disease,,,[EFO_0003767],,"{'hasBroadSynonym': None, 'hasExactSynonym': [...","[EFO_0005140, EFO_0009431, OTAR_0000018, EFO_0...","[MONDO_0000709, EFO_0005622, EFO_0005624, EFO_...","[EFO_0005622, EFO_0005625, EFO_0005627, EFO_00...","[OTAR_0000018, EFO_0000540, EFO_0010282]",,"{'isTherapeuticArea': False, 'leaf': False, 's..."
2,EFO_0001421,http://www.ebi.ac.uk/efo/EFO_0001421,"[DOID:409, UMLS:C0023895, NCIt:C3196, ICD9:573...",Any disease or dysfunction of the liver and th...,liver disease,,,"[EFO_0001379, EFO_0010284]",,"{'hasBroadSynonym': None, 'hasExactSynonym': [...","[EFO_0001379, EFO_0010284, EFO_0010282]","[MONDO_0008967, MONDO_0008966, MONDO_0000447, ...","[MONDO_0000447, MONDO_0002405, MONDO_0002520, ...","[EFO_0001379, EFO_0010282]",,"{'isTherapeuticArea': False, 'leaf': False, 's..."
3,EFO_0004328,http://www.ebi.ac.uk/efo/EFO_0004328,"[MeSH:D005080, MedDRA:10015652]","Controlled physical activity, more strenuous t...",exercise test,,,[EFO_0004311],,"{'hasBroadSynonym': None, 'hasExactSynonym': [...","[EFO_0001444, EFO_0005278, EFO_0004311, EFO_00...",[],[],[EFO_0001444],,"{'isTherapeuticArea': False, 'leaf': True, 'so..."
4,EFO_0004554,http://www.ebi.ac.uk/efo/EFO_0004554,[],Is a quantification of some aspect of the geno...,genomic measurement,,,[EFO_0001444],,,[EFO_0001444],"[EFO_0009861, EFO_0009860, EFO_0007783, EFO_00...","[EFO_0000513, EFO_0004505, EFO_0004798, EFO_00...",[EFO_0001444],,"{'isTherapeuticArea': False, 'leaf': False, 's..."


<class 'pandas.core.frame.DataFrame'>
Int64Index: 62678 entries, 0 to 332
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    62678 non-null  object
 1   approvedSymbol        62678 non-null  object
 2   biotype               62678 non-null  object
 3   transcriptIds         62678 non-null  object
 4   canonicalTranscript   61418 non-null  object
 5   canonicalExons        61381 non-null  object
 6   genomicLocation       62678 non-null  object
 7   alternativeGenes      1086 non-null   object
 8   approvedName          62678 non-null  object
 9   go                    20904 non-null  object
 10  hallmarks             351 non-null    object
 11  synonyms              62678 non-null  object
 12  symbolSynonyms        62678 non-null  object
 13  nameSynonyms          62678 non-null  object
 14  functionDescriptions  20135 non-null  object
 15  subcellularLocations  20135 non-null  

Unnamed: 0,id,approvedSymbol,biotype,transcriptIds,canonicalTranscript,canonicalExons,genomicLocation,alternativeGenes,approvedName,go,...,obsoleteNames,constraint,tep,proteinIds,dbXrefs,chemicalProbes,homologues,tractability,safetyLiabilities,pathways
0,ENSG00000011451,WIZ,protein_coding,"[ENST00000389282, ENST00000596159, ENST0000026...","{'id': 'ENST00000673675', 'chromosome': '19', ...","[15436806, 15437129, 15426982, 15427533, 15425...","{'chromosome': '19', 'start': 15419978, 'end':...",,WIZ zinc finger,"[{'id': 'GO:0006357', 'source': 'PMID:21873635...",...,[{'label': 'widely interspaced zinc finger mot...,"[{'constraintType': 'syn', 'score': 0.31986999...",,"[{'id': 'O95785', 'source': 'uniprot_swissprot...","[{'id': '30917', 'source': 'HGNC'}, {'id': 'IP...",,"[{'speciesId': '9606', 'speciesName': 'Human',...","[{'modality': 'SM', 'id': 'Approved Drug', 'va...",,
1,ENSG00000023902,PLEKHO1,protein_coding,"[ENST00000485470, ENST00000492304, ENST0000060...","{'id': 'ENST00000369124', 'chromosome': '1', '...","[150158819, 150160065, 150149916, 150150287, 1...","{'chromosome': '1', 'start': 150149183, 'end':...",,pleckstrin homology domain containing O1,"[{'id': 'GO:0007520', 'source': 'GO_REF:000010...",...,[{'label': 'pleckstrin homology domain contain...,"[{'constraintType': 'syn', 'score': -0.2883000...",,"[{'id': 'Q53GL0', 'source': 'uniprot_swissprot...","[{'id': '24310', 'source': 'HGNC'}, {'id': '3A...",,"[{'speciesId': '9606', 'speciesName': 'Human',...","[{'modality': 'SM', 'id': 'Approved Drug', 'va...",,
2,ENSG00000035862,TIMP2,protein_coding,"[ENST00000536189, ENST00000586057, ENST0000058...","{'id': 'ENST00000262768', 'chromosome': '17', ...","[78857522, 78857646, 78924959, 78925387, 78852...","{'chromosome': '17', 'start': 78852977, 'end':...",,TIMP metallopeptidase inhibitor 2,"[{'id': 'GO:0005576', 'source': 'Reactome:R-HS...",...,[{'label': 'tissue inhibitor of metalloprotein...,"[{'constraintType': 'syn', 'score': -0.0323360...",,"[{'id': 'P16035', 'source': 'uniprot_swissprot...","[{'id': '11821', 'source': 'HGNC'}, {'id': '1B...",,"[{'speciesId': '9606', 'speciesName': 'Human',...","[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of gene expression', 'e...","[{'pathwayId': 'R-HSA-1592389', 'pathway': 'Ac..."
3,ENSG00000042813,ZPBP,protein_coding,"[ENST00000046087, ENST00000465922, ENST0000041...","{'id': 'ENST00000046087', 'chromosome': '7', '...","[50018240, 50018316, 50031092, 50031310, 50057...","{'chromosome': '7', 'start': 49850421, 'end': ...",,zona pellucida binding protein,"[{'id': 'GO:0001669', 'source': 'PMID:21873635...",...,[],"[{'constraintType': 'syn', 'score': -1.2680000...",,"[{'id': 'Q9BS86', 'source': 'uniprot_swissprot...","[{'id': '15662', 'source': 'HGNC'}, {'id': 'IP...",,"[{'speciesId': '9606', 'speciesName': 'Human',...","[{'modality': 'SM', 'id': 'Approved Drug', 'va...",,
4,ENSG00000065457,ADAT1,protein_coding,"[ENST00000568001, ENST00000568510, ENST0000030...","{'id': 'ENST00000564657', 'chromosome': '16', ...","[75622403, 75623281, 75620631, 75620820, 75620...","{'chromosome': '16', 'start': 75596868, 'end':...",,adenosine deaminase tRNA specific 1,"[{'id': 'GO:0003723', 'source': 'PMID:10430867...",...,[],"[{'constraintType': 'syn', 'score': -1.4117000...",,"[{'id': 'Q9BUB4', 'source': 'uniprot_swissprot...","[{'id': '228', 'source': 'HGNC'}, {'id': 'R-HS...",,"[{'speciesId': '9606', 'speciesName': 'Human',...","[{'modality': 'SM', 'id': 'Approved Drug', 'va...",,"[{'pathwayId': 'R-HSA-6782315', 'pathway': 'tR..."


### 1.2 Subset data

I want to limit the exercise to a small amount of data to test that the model is functional without having to embed the whole OT universe.
To do so:
- I will select a random sample of 100 rows per dataset.
- I will use records that are as complete as possible (no nulls in any of the columns)
  - This will not always be possible: in `drug_df`, `disease_df` and `target_df` the rows have at least one null value, so those tables are sampled from all rows instead.

In [72]:
def subset_shuffle_data(df: pd.DataFrame, n: int = 100, random_state=None) -> pd.DataFrame:
    """Return a shuffled sample of up to ``n`` rows, preferring fully populated rows.

    If at least ``n`` rows have no null in any column, the sample is drawn only
    from those rows; otherwise it is drawn from the whole frame.

    Args:
        df: Table to sample from. It is not modified.
        n: Number of rows to draw. When the chosen pool holds fewer than ``n``
            rows, every row of the pool is returned (shuffled) instead of raising.
        random_state: Forwarded to ``DataFrame.sample``; pass a fixed value to
            make the sample reproducible. ``None`` keeps the draw unseeded.

    Returns:
        A new DataFrame with the sampled rows and a fresh ``0..k-1`` index.
    """
    # Compute the null-free view once; it is both the eligibility check and the pool.
    complete_rows = df.dropna(axis=0, how="any")
    pool = complete_rows if len(complete_rows) >= n else df
    # Cap the request at the pool size: sampling more rows than exist raises ValueError.
    sample_size = min(n, len(pool))
    return pool.sample(sample_size, random_state=random_state).reset_index(drop=True)

# Draw one sample per table, in the same order the tables were loaded.
(
    drug_subset_df,
    indication_subset_df,
    moa_subset_df,
    disease_subset_df,
    target_subset_df,
) = map(subset_shuffle_data, (drug_df, indication_df, moa_df, disease_df, target_df))

    

In [54]:
# Preview the first three rows of the sampled mechanism-of-action table.
moa_subset_df.head(3)

Unnamed: 0,actionType,mechanismOfAction,chemblIds,targetName,targetType,targets,references
0,INHIBITOR,Tubulin inhibitor,[CHEMBL3833306],Tubulin,protein complex group,"[ENSG00000188229, ENSG00000101162, ENSG0000019...","[{'source': 'PubMed', 'ids': ['27518442', '283..."
1,ANTAGONIST,Dopamine D2 receptor antagonist,[CHEMBL28218],Dopamine D2 receptor,single protein,[ENSG00000149295],"[{'source': 'PubMed', 'ids': ['17362435', '215..."
2,AGONIST,Glucocorticoid receptor agonist,[CHEMBL1549],Glucocorticoid receptor,single protein,[ENSG00000113580],"[{'source': 'Wikipedia', 'ids': ['Glucocortico..."


## 2. Generate embeddings


In [34]:
# Smoke test: encode two short sentences and look at the returned array.
model.encode(["This is a test", "This is another test"])

# Nice, I get one vector of size 768 per sentence (same parameters as the T5 model)

array([[ 0.02768615, -0.04751104, -0.01583998, ..., -0.0343785 ,
         0.0130953 ,  0.0593827 ],
       [ 0.00734502, -0.0574134 , -0.00358513, ..., -0.04301181,
         0.01122494,  0.08128042]], dtype=float32)

I have 2 ideas to query the model:

1. Provide all the data in the prompt and the question I want to ask. For this I don't know if I have to be explicit every time (e.g. in which columns I have drug IDs?) or if I can just ask for all the relationships it can find. This is kind of a zero shot learning approach.

2. Index each column and then ask the model to find the relationships between them by using cosine similarities. This has the drawback of having to be explicit.

Let's go for the harder one first (#1)
### 2.1 Provide all data in the prompt by converting the tables to text

In [84]:
# WIP: I have been testing this solution with Cohere's API without great results; I am still trying to figure out the best prompt.

"""
Sample output (prompt was one record of each datasets, question was "Give me all columns in each dataframe that contain drug IDs to draw relationships between the data:"):
To draw relationships between the data, you can use the join() function. For example, you can join the Drug_df and Indications_df dataframes on the drug ID column to get a dataframe with the combined columns.

# Create a new dataframe with the joined columns
drug_indications = pd.join(Drug_df, Indications_df, on='drug_id')

This will give you a dataframe with the combined columns, including the drug ID column from both dataframes. You can then use this dataframe to draw relationships between the data.
"""

# However, for the same prompt ChatGPT gave me almost exactly what I wanted (I hadn't even realised there are drug IDs in crossReferences too!):
"""
Based on the information provided, it seems that the common ID between the dataframes is the CHEMBL ID. Here are the columns in each dataframe that contain drug IDs:

Drug_df Dataframe:

id
parentId
childChemblIds
crossReferences
Indications_df dataframe:

id
indications <--- this is wrong
Mechanism_of_action_df Dataframe:

chemblIds

"""

'\nBased on the information provided, it seems that the common ID between the dataframes is the CHEMBL ID. Here are the columns in each dataframe that contain drug IDs:\n\nDrug_df Dataframe:\n\nid\nparentId\nchildChemblIds\ncrossReferences\nIndications_df dataframe:\n\nid\nindications <--- this is wrong\nMechanism_of_action_df Dataframe:\n\nchemblIds\n\n'

### 2.2 Vectorise each column of each dataset and compute similarities

My dictionary will have three keys:

- id: following the pattern "{dataframe_name}_{column_name}"
- content: each value of the column in a single string format
- embedding: the embedding of the content

In [92]:
# Work on a renamed copy so the sampled indication table itself is left untouched.
test_df = indication_subset_df.copy()
test_df.columns = [f"test_{name}" for name in test_df.columns]

# One entry per column: every cell is stringified, then the whole column is
# rendered as a single list-like string.
result_dict = {
    name: str(test_df[name].astype(str).tolist())
    for name in test_df.columns
}

print(result_dict.keys())
print(result_dict["test_id"])
# This is what I want, but I still need to review the array columns: converting them into strings introduces weird characters

dict_keys(['test_id', 'test_indications', 'test_approvedIndications', 'test_indicationCount'])
['CHEMBL2109540', 'CHEMBL4297812', 'CHEMBL2109322', 'CHEMBL4297809', 'CHEMBL4594404', 'CHEMBL2105665', 'CHEMBL2108327', 'CHEMBL577711', 'CHEMBL4297444', 'CHEMBL1643', 'CHEMBL1493', 'CHEMBL3707289', 'CHEMBL4299940', 'CHEMBL2392545', 'CHEMBL4594547', 'CHEMBL1214192', 'CHEMBL1241348', 'CHEMBL218427', 'CHEMBL65794', 'CHEMBL2151437', 'CHEMBL1087', 'CHEMBL2108791', 'CHEMBL211538', 'CHEMBL2109249', 'CHEMBL1457', 'CHEMBL1082508', 'CHEMBL516', 'CHEMBL3039550', 'CHEMBL1741078', 'CHEMBL494397', 'CHEMBL1201630', 'CHEMBL1743260', 'CHEMBL1201200', 'CHEMBL294199', 'CHEMBL2110603', 'CHEMBL2107884', 'CHEMBL2109546', 'CHEMBL2165224', 'CHEMBL4297283', 'CHEMBL2111110', 'CHEMBL2108308', 'CHEMBL37676', 'CHEMBL294951', 'CHEMBL160', 'CHEMBL12610', 'CHEMBL4650349', 'CHEMBL113313', 'CHEMBL334966', 'CHEMBL1472989', 'CHEMBL647', 'CHEMBL2220486', 'CHEMBL1086440', 'CHEMBL3545013', 'CHEMBL2109641', 'CHEMBL3545298', 'CHEMBL

In [88]:
def populate_embeddings_dict(df: pd.DataFrame, embeddings_dict: dict, prefix: str = "test_") -> dict:
    """Add one stringified entry per column of ``df`` to ``embeddings_dict``.

    Each column is stored under the key ``f"{prefix}{column_name}"``. The value
    is a single string: the column's cells are cast to ``str``, collected into a
    list, and that list is rendered with ``str()``.

    Args:
        df: Table whose columns are serialised. It is read only; its column
            names are not changed.
        embeddings_dict: Dictionary to extend. It is updated in place, and
            existing keys with the same name are overwritten.
        prefix: Text prepended to every column name to build the key.

    Returns:
        The same ``embeddings_dict`` object, after the update.
    """
    # Read from the ``df`` argument (not a notebook-level frame) and build the
    # prefixed keys on the fly, so the caller's DataFrame keeps its own columns.
    embeddings_dict.update(
        {f"{prefix}{col}": str(df[col].astype(str).tolist()) for col in df.columns}
    )
    return embeddings_dict

dict_keys(['test_id', 'test_indications', 'test_approvedIndications', 'test_indicationCount'])