# Creating a dataset for synergy score and drug metrics
Start by importing the datasets and merging them.

In [6]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Read the per-drug chemical table and the pairwise combination table.
# Drug names are the join keys, so they are upper-cased on both sides right away.
drug_info = pd.read_csv("drug_chemical_info.csv", encoding="latin1").assign(
    drugName=lambda drugs: drugs["drugName"].str.upper()
)
synergy_data = pd.read_csv("drug_combinations.csv", encoding="latin1").assign(
    Drug1=lambda pairs: pairs["Drug1"].str.upper(),
    Drug2=lambda pairs: pairs["Drug2"].str.upper(),
)

# Attach the chemical info twice: once keyed on Drug1, once keyed on Drug2.
# After each join the freshly added columns are renamed so they carry the drug position.
merged = (
    synergy_data
    .merge(drug_info, left_on="Drug1", right_on="drugName", suffixes=("", "_Drug1"))
    .rename(
        columns={
            "drugName": "Drug1Name",
            "molecularWeight": "molecularWeightDrug1",
            "smilesString": "smilesStringDrug1",
        }
    )
    .merge(drug_info, left_on="Drug2", right_on="drugName", suffixes=("", "_Drug2"))
    .rename(
        columns={
            "drugName": "Drug2Name",
            "molecularWeight": "molecularWeightDrug2",
            "smilesString": "smilesStringDrug2",
        }
    )
)

# Columns kept in the final table, in display order
final_columns = [
    "Drug1Name", "Drug2Name", "drug1_db", "drug2_db", "synergy",
    "molecularWeightDrug1", "molecularWeightDrug2",
    "smilesStringDrug1", "smilesStringDrug2"
]

# Keep only pairs of two different drugs, then drop pairs where either SMILES string is missing
merged_df = merged.loc[
    merged["Drug1Name"] != merged["Drug2Name"], final_columns
].dropna(subset=["smilesStringDrug1", "smilesStringDrug2"])

In [8]:
# Preview the merged table; the rendered output below elides the middle rows.
merged_df

Unnamed: 0,Drug1Name,Drug2Name,drug1_db,drug2_db,synergy,molecularWeightDrug1,molecularWeightDrug2,smilesStringDrug1,smilesStringDrug2
0,5-FU,BORTEZOMIB,DB00544,DB00188,-2.3950,130.077223,384.237200,C1=C(C(=O)NC(=O)N1)F,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...
1,5-FU,DASATINIB,DB00544,DB01254,1.5075,130.077223,488.005540,C1=C(C(=O)NC(=O)N1)F,CC1=C(C(=CC=C1)Cl)NC(=O)C2=CN=C(S2)NC3=NC(=NC(...
2,5-FU,ERLOTINIB,DB00544,DB00530,8.2525,130.077223,393.435720,C1=C(C(=O)NC(=O)N1)F,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...
3,5-FU,GELDANAMYCIN,DB00544,DB02424,6.0575,130.077223,560.635900,C1=C(C(=O)NC(=O)N1)F,CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...
4,5-FU,LAPATINIB,DB00544,DB01259,4.9200,130.077223,581.057543,C1=C(C(=O)NC(=O)N1)F,CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3...
...,...,...,...,...,...,...,...,...,...
69259,MYCOPHENOLATE MOFETIL,TOREMIFENE CITRATE,DB00688,DB00539,-2.3040,433.494740,598.083140,CC1=C(C(=C(C2=C1COC2=O)O)CC=C(C)CCC(=O)OCCN3CC...,CN(C)CCOC1=CC=C(C=C1)C(=C(CCCl)C2=CC=CC=C2)C3=...
69260,MYCOPHENOLATE MOFETIL,ARIPIPRAZOLE,DB00688,DB01238,-5.2390,433.494740,448.385380,CC1=C(C(=C(C2=C1COC2=O)O)CC=C(C)CCC(=O)OCCN3CC...,C1CC(=O)NC2=C1C=CC(=C2)OCCCCN3CCN(CC3)C4=C(C(=...
69262,SUNITINIB MALATE,TOREMIFENE CITRATE,DB01268,DB00539,1.7320,532.561223,598.083140,CCN(CC)CCNC(=O)C1=C(NC(=C1C)C=C2C3=C(C=CC(=C3)...,CN(C)CCOC1=CC=C(C=C1)C(=C(CCCl)C2=CC=CC=C2)C3=...
69263,SUNITINIB MALATE,ARIPIPRAZOLE,DB01268,DB01238,-6.5290,532.561223,448.385380,CCN(CC)CCNC(=O)C1=C(NC(=C1C)C=C2C3=C(C=CC(=C3)...,C1CC(=O)NC2=C1C=CC(=C2)OCCCCN3CCN(CC3)C4=C(C(=...


# Now let's create different datasets for different metrics

## weight

In [9]:
# Weight-based features for every drug pair, next to the raw synergy score.
# `.assign` returns a new frame, so `merged_df` itself is left untouched.
weight_synergy_df = merged_df[["synergy", "molecularWeightDrug1", "molecularWeightDrug2"]].assign(
    # Sum of both molecular weights
    combinedWeight=lambda pairs: pairs["molecularWeightDrug1"] + pairs["molecularWeightDrug2"],
    # Absolute difference between the two weights
    weightDifference=lambda pairs: (
        pairs["molecularWeightDrug1"] - pairs["molecularWeightDrug2"]
    ).abs(),
    # Drug1 / Drug2 weight ratio; a zero denominator is masked to NaN
    # (float division by zero would otherwise leave inf/NaN in the column)
    weightRatio=lambda pairs: (
        pairs["molecularWeightDrug1"] / pairs["molecularWeightDrug2"]
    ).where(pairs["molecularWeightDrug2"] != 0, np.nan),
    # Geometric mean of the two weights
    weightGeometricMean=lambda pairs: np.sqrt(
        pairs["molecularWeightDrug1"] * pairs["molecularWeightDrug2"]
    ),
)

# Binary variant: the continuous score becomes "synergistic" = 1 when synergy > 0, else 0
# (a missing score compares False and therefore becomes 0). The comparison is vectorized
# instead of calling a Python lambda once per row.
weight_synergy_binary_df = weight_synergy_df.assign(
    synergy=lambda pairs: (pairs["synergy"] > 0).astype(int)
).rename(columns={"synergy": "synergistic"})

weight_synergy_df

Unnamed: 0,synergy,molecularWeightDrug1,molecularWeightDrug2,combinedWeight,weightDifference,weightRatio,weightGeometricMean
0,-2.3950,130.077223,384.237200,514.314423,254.159977,0.338534,223.563208
1,1.5075,130.077223,488.005540,618.082763,357.928317,0.266549,251.949212
2,8.2525,130.077223,393.435720,523.512943,263.358497,0.330619,226.223398
3,6.0575,130.077223,560.635900,690.713123,430.558677,0.232017,270.048072
4,4.9200,130.077223,581.057543,711.134766,450.980320,0.223863,274.922447
...,...,...,...,...,...,...,...
69259,-2.3040,433.494740,598.083140,1031.577880,164.588400,0.724807,509.181594
69260,-5.2390,433.494740,448.385380,881.880120,14.890640,0.966791,440.877198
69262,1.7320,532.561223,598.083140,1130.644363,65.521917,0.890447,564.372119
69263,-6.5290,532.561223,448.385380,980.946603,84.175843,1.187731,488.664165


## SMILES

In [10]:
def compute_tanimoto(smiles1, smiles2):
    """Return the Tanimoto similarity of two molecules given as SMILES strings.

    Both strings are parsed with RDKit, each molecule is turned into a
    2048-bit Morgan fingerprint (radius 2), and the two bit vectors are
    compared with ``DataStructs.TanimotoSimilarity``.

    Returns:
        The similarity value, or ``None`` if any step raises (for example
        when a SMILES string cannot be parsed and no molecule is available).
    """
    try:
        # Parse both molecules first, then fingerprint them in the same order
        molecules = [Chem.MolFromSmiles(smiles) for smiles in (smiles1, smiles2)]
        first_fp, second_fp = [
            GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in molecules
        ]
        return DataStructs.TanimotoSimilarity(first_fp, second_fp)
    except Exception:
        # Best-effort metric: a failing pair yields a missing value instead of aborting
        return None

def compare_molecular_weights(smiles1, smiles2):
    """Return the absolute molecular-weight difference of two SMILES strings.

    Each string is parsed with RDKit and its weight is taken from
    ``Descriptors.MolWt``.

    Returns:
        ``abs(weight1 - weight2)``, or ``None`` if any step raises (for
        example when a SMILES string cannot be parsed).
    """
    try:
        # Both molecules are parsed before either weight is computed
        parsed = (Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2))
        first_weight, second_weight = [Descriptors.MolWt(mol) for mol in parsed]
        return abs(first_weight - second_weight)
    except Exception:
        # Best-effort metric: a failing pair yields a missing value instead of aborting
        return None

# Compute TF-IDF Cosine Similarity
def compute_tfidf_cosine(smiles1, smiles2):
    """Return the TF-IDF cosine similarity of two SMILES strings.

    The two strings are vectorized together as a two-document corpus of
    character 4-grams, and the result is the dot product of the two
    TF-IDF rows.

    NOTE(review): ``TfidfVectorizer`` lower-cases its input by default
    (``lowercase=True``), so SMILES characters that differ only in case are
    treated as the same token - confirm this is intended.

    Returns:
        The similarity value, or ``None`` if vectorization fails (for
        example when the inputs are not strings or no 4-gram can be built).
    """
    try:
        corpus = [smiles1, smiles2]
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(4, 4))  # LINGO size of 4
        tfidf_matrix = vectorizer.fit_transform(corpus)
        # Rows are L2-normalized by the vectorizer's default `norm`, so their dot product is the cosine
        return (tfidf_matrix[0] @ tfidf_matrix[1].T).toarray()[0][0]
    except Exception:
        # Best-effort metric: a failing pair yields a missing value. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop a long-running apply.
        return None

# Compute LINGO-based Similarity
def compute_lingo_similarity(smiles1, smiles2, lingo_size=4):
    """Return the Jaccard similarity between the LINGO sets of two SMILES strings.

    A LINGO is every contiguous substring of ``lingo_size`` characters. The
    score is ``len(intersection) / len(union)`` of the two substring sets.

    Args:
        smiles1: First SMILES string.
        smiles2: Second SMILES string.
        lingo_size: Substring length; defaults to 4.

    Returns:
        The similarity as a float; ``0.0`` when neither string is long enough
        to produce a single LINGO; ``None`` when the substrings cannot be
        built (for example when an input is ``None`` or NaN).

    Raises:
        ValueError: If ``lingo_size`` is smaller than 1.
    """
    if lingo_size < 1:
        raise ValueError("lingo_size must be at least 1")
    try:
        lingo_set1 = {smiles1[i:i + lingo_size] for i in range(len(smiles1) - lingo_size + 1)}
        lingo_set2 = {smiles2[i:i + lingo_size] for i in range(len(smiles2) - lingo_size + 1)}
    except Exception:
        # Best-effort metric: unusable input yields a missing value. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop a long-running apply.
        return None
    union = len(lingo_set1 | lingo_set2)
    if union == 0:
        return 0.0
    return len(lingo_set1 & lingo_set2) / union

# Working frame for the SMILES metrics: synergy score plus both SMILES strings
smiles_synergy_df = merged_df[["synergy", "smilesStringDrug1", "smilesStringDrug2"]].copy()

# Output column -> pairwise metric function. Every metric takes the two SMILES
# strings of a row, so one loop replaces four copy-pasted apply blocks.
smiles_pair_metrics = {
    "tanimotoCoefficient": compute_tanimoto,
    "tfidfCosineSimilarity": compute_tfidf_cosine,
    "lingoSimilarity": compute_lingo_similarity,
    "molecularWeightDifference": compare_molecular_weights,
}
for metric_column, metric_fn in smiles_pair_metrics.items():
    smiles_synergy_df[metric_column] = smiles_synergy_df.apply(
        # Bind metric_fn as a default so each lambda uses the function of its own iteration
        lambda row, metric_fn=metric_fn: metric_fn(row["smilesStringDrug1"], row["smilesStringDrug2"]),
        axis=1,
    )

# Binary variant: the continuous score becomes "synergistic" = 1 when synergy > 0, else 0
# (a missing score compares False and therefore becomes 0). The comparison is vectorized
# instead of calling a Python lambda once per row.
smiles_synergy_binary_df = smiles_synergy_df.assign(
    synergy=lambda pairs: (pairs["synergy"] > 0).astype(int)
).rename(columns={"synergy": "synergistic"})

smiles_synergy_df



Unnamed: 0,synergy,smilesStringDrug1,smilesStringDrug2,tanimotoCoefficient,tfidfCosineSimilarity,lingoSimilarity,molecularWeightDifference
0,-2.3950,C1=C(C(=O)NC(=O)N1)F,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,0.045455,0.205098,0.096154,254.167
1,1.5075,C1=C(C(=O)NC(=O)N1)F,CC1=C(C(=CC=C1)Cl)NC(=O)C2=CN=C(S2)NC3=NC(=NC(...,0.037037,0.208968,0.142857,357.939
2,8.2525,C1=C(C(=O)NC(=O)N1)F,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...,0.031250,0.044753,0.052632,263.365
3,6.0575,C1=C(C(=O)NC(=O)N1)F,CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C...,0.060241,0.318135,0.118644,430.566
4,4.9200,C1=C(C(=O)NC(=O)N1)F,CS(=O)(=O)CCNCC1=CC=C(O1)C2=CC3=C(C=C2)N=CN=C3...,0.057471,0.083191,0.037975,450.991
...,...,...,...,...,...,...,...
69259,-2.3040,CC1=C(C(=C(C2=C1COC2=O)O)CC=C(C)CCC(=O)OCCN3CC...,CN(C)CCOC1=CC=C(C=C1)C(=C(CCCl)C2=CC=CC=C2)C3=...,0.111111,0.281645,0.202247,164.591
69260,-5.2390,CC1=C(C(=C(C2=C1COC2=O)O)CC=C(C)CCC(=O)OCCN3CC...,C1CC(=O)NC2=C1C=CC(=C2)OCCCCN3CCN(CC3)C4=C(C(=...,0.151515,0.217081,0.192771,14.893
69262,1.7320,CCN(CC)CCNC(=O)C1=C(NC(=C1C)C=C2C3=C(C=CC(=C3)...,CN(C)CCOC1=CC=C(C=C1)C(=C(CCCl)C2=CC=CC=C2)C3=...,0.123810,0.288970,0.220000,65.523
69263,-6.5290,CCN(CC)CCNC(=O)C1=C(NC(=C1C)C=C2C3=C(C=CC(=C3)...,C1CC(=O)NC2=C1C=CC(=C2)OCCCCN3CCN(CC3)C4=C(C(=...,0.099099,0.199677,0.151515,84.175


## protein

In [11]:
# Read the drug -> protein association table
protein_data = pd.read_csv("drug_protein.csv")

# For every value in the "drug" column, collect the set of its "protein" entries
drug_to_proteins = {
    drug: set(proteins)
    for drug, proteins in protein_data.groupby("drug")["protein"]
}

# Working frame for the protein metric: synergy score plus both drug identifiers
protein_synergy_df = merged_df.loc[:, ["synergy", "drug1_db", "drug2_db"]].copy()


# Count the proteins that the two drugs of a row have in common
def compute_protein_difference(row):
    """Return how many proteins the two drugs in ``row`` share.

    ``row["drug1_db"]`` and ``row["drug2_db"]`` are looked up in the
    module-level ``drug_to_proteins`` mapping; a drug without an entry is
    treated as having no proteins. Despite the function name, the result is
    the size of the intersection, not a difference.
    """
    protein_sets = [
        drug_to_proteins.get(row[id_column], set())
        for id_column in ("drug1_db", "drug2_db")
    ]
    shared = protein_sets[0].intersection(protein_sets[1])
    return len(shared)

# Add the proteinsInCommon column: the number of proteins shared by the two drugs of each pair
protein_synergy_df["proteinsInCommon"] = protein_synergy_df.apply(compute_protein_difference, axis=1)

# Binary variant: the continuous score becomes "synergistic" = 1 when synergy > 0, else 0
# (a missing score compares False and therefore becomes 0). The comparison is vectorized
# instead of calling a Python lambda once per row.
protein_synergy_binary_df = protein_synergy_df.assign(
    synergy=lambda pairs: (pairs["synergy"] > 0).astype(int)
).rename(columns={"synergy": "synergistic"})

protein_synergy_df

Unnamed: 0,synergy,drug1_db,drug2_db,proteinsInCommon
0,-2.3950,DB00544,DB00188,0
1,1.5075,DB00544,DB01254,0
2,8.2525,DB00544,DB00530,0
3,6.0575,DB00544,DB02424,0
4,4.9200,DB00544,DB01259,0
...,...,...,...,...
69259,-2.3040,DB00688,DB00539,0
69260,-5.2390,DB00688,DB01238,0
69262,1.7320,DB01268,DB00539,0
69263,-6.5290,DB01268,DB01238,1


# Save all to CSV

In [12]:
# Write every table to its own CSV file in the working directory, without the index column.
# The order matches the sections above: merged data, then weight, SMILES and protein tables.
csv_outputs = [
    ("merged_drug_data.csv", merged_df),
    ("weight_synergy.csv", weight_synergy_df),
    ("weight_synergy_binary.csv", weight_synergy_binary_df),
    ("smiles_synergy.csv", smiles_synergy_df),
    ("smiles_synergy_binary.csv", smiles_synergy_binary_df),
    ("protein_synergy.csv", protein_synergy_df),
    ("protein_synergy_binary.csv", protein_synergy_binary_df),
]
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)