In [121]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import plotly.express as px
from tqdm import tqdm

In [3]:
data_path = "./data/DrugCombDB/"

import os
print(os.getcwd())

# cells: GeneSym (code?), source, protein (id), cell (code), tissue, target_id, weight
df_cell_protein = pd.read_csv(data_path + "cell_protein.csv") 
# combinations: Drug1 (protein?), Drug2 (protein?), cell, drug1_db, drug2_db, synergy
df_drug_combinations = pd.read_csv(data_path + "drug_combinations.csv")
# drug proteins: drug(db format), protein (id)
df_drug_protein = pd.read_csv(data_path + "drug_protein.csv")

/Users/julianhesse/Documents/cs_master/swp_cancer/GraphSynergy_Swp


In [102]:
df_drug_protein

Unnamed: 0,drug,protein
0,DB01110,4843
1,DB01234,4843
2,DB08814,4843
3,DB00661,8911
4,DB00909,8911
...,...,...
5285,DB08865,10746
5286,DB08865,11183
5287,DB08865,11213
5288,DB04149,5141


In [105]:
df_drug_combinations['synergy'] = df_drug_combinations['synergy'].apply(lambda x: 0 if x <= 0 else 1)
df_drug_combinations

Unnamed: 0,Drug1,Drug2,cell,drug1_db,drug2_db,synergy
0,5-FU,BORTEZOMIB,A2058,DB00544,DB00188,0
1,5-FU,DASATINIB,A2058,DB00544,DB01254,1
2,5-FU,ERLOTINIB,A2058,DB00544,DB00530,1
3,5-FU,GELDANAMYCIN,A2058,DB00544,DB02424,1
4,5-FU,LAPATINIB,A2058,DB00544,DB01259,1
...,...,...,...,...,...,...
69431,Sunitinib malate,Toremifene citrate,HUH7,DB01268,DB00539,1
69432,Sunitinib malate,Aripiprazole,HUH7,DB01268,DB01238,0
69433,Toremifene citrate,Toremifene citrate,HUH7,DB00539,DB00539,0
69434,Toremifene citrate,Aripiprazole,HUH7,DB00539,DB01238,0


## Analyse Statistics of Cell Probabilities

In [13]:
counts = df_drug_combinations.groupby('cell')['synergy'].value_counts()
counts

cell   synergy
786O   0          965
       1          620
A2058  1           82
       0           16
A2780  1           94
                 ... 
VCAP   1           75
       0           23
WM115  0            1
ZR751  0           78
       1           20
Name: count, Length: 144, dtype: int64

In [38]:
counts.sort_index(axis=0)

cell   synergy
786O   0          965
       1          620
A2058  0           16
       1           82
A2780  0            4
                 ... 
VCAP   0           23
       1           75
WM115  0            1
ZR751  0           78
       1           20
Name: count, Length: 144, dtype: int64

In [22]:
counts_df = counts.sort_values()
type(counts_df)

pandas.core.series.Series

In [51]:
counts_df = counts.reset_index(level=[0,1])
counts_df

Unnamed: 0,cell,synergy,count
0,786O,0,965
1,786O,1,620
2,A2058,1,82
3,A2058,0,16
4,A2780,1,94
...,...,...,...
139,VCAP,1,75
140,VCAP,0,23
141,WM115,0,1
142,ZR751,0,78


In [66]:
counts_df = counts_df.pivot(index='cell', columns='synergy', values='count').reset_index().fillna(0)
counts_df

synergy,cell,0,1
0,786O,965.0,620.0
1,A2058,16.0,82.0
2,A2780,4.0,94.0
3,A375,27.0,72.0
4,A498,1107.0,482.0
...,...,...,...
68,UACC257,846.0,753.0
69,UACC62,1043.0,616.0
70,VCAP,23.0,75.0
71,WM115,1.0,0.0


In [89]:
counts_df['total'] = counts_df[0] + counts_df[1]
counts_df['0_relative'] = counts_df[0] / counts_df['total']
counts_df['1_relative'] = counts_df[1] / counts_df['total']
counts_df['imbalance'] = abs(0.5 - counts_df['0_relative']) * 2
counts_df

synergy,cell,0,1,total,0_relative,1_relative,inbalance,imbalance
0,786O,965.0,620.0,1585.0,0.608833,0.391167,0.217666,0.217666
1,A2058,16.0,82.0,98.0,0.163265,0.836735,0.673469,0.673469
2,A2780,4.0,94.0,98.0,0.040816,0.959184,0.918367,0.918367
3,A375,27.0,72.0,99.0,0.272727,0.727273,0.454545,0.454545
4,A498,1107.0,482.0,1589.0,0.696665,0.303335,0.393329,0.393329
...,...,...,...,...,...,...,...,...
68,UACC257,846.0,753.0,1599.0,0.529081,0.470919,0.058161,0.058161
69,UACC62,1043.0,616.0,1659.0,0.628692,0.371308,0.257384,0.257384
70,VCAP,23.0,75.0,98.0,0.234694,0.765306,0.530612,0.530612
71,WM115,1.0,0.0,1.0,1.000000,0.000000,1.000000,1.000000


In [151]:
counts_df.sort_values(by='0_relative', ascending=False)

synergy,cell,0,1,total,0_relative,1_relative,inbalance,imbalance
71,WM115,1.0,0.0,1.0,1.000000,0.000000,1.000000,1.000000
22,HUH7,20.0,1.0,21.0,0.952381,0.047619,0.904762,0.904762
72,ZR751,78.0,20.0,98.0,0.795918,0.204082,0.591837,0.591837
26,KPL1,74.0,24.0,98.0,0.755102,0.244898,0.510204,0.510204
45,NCIH522,1115.0,451.0,1566.0,0.712005,0.287995,0.424010,0.424010
...,...,...,...,...,...,...,...,...
1,A2058,16.0,82.0,98.0,0.163265,0.836735,0.673469,0.673469
14,ES2,15.0,83.0,98.0,0.153061,0.846939,0.693878,0.693878
11,COLO320,11.0,87.0,98.0,0.112245,0.887755,0.775510,0.775510
2,A2780,4.0,94.0,98.0,0.040816,0.959184,0.918367,0.918367


In [60]:
px.histogram(counts_df, x='cell', y='count', color='synergy', barmode='group')

In [100]:
px.bar(counts_df.sort_values(by='imbalance', ascending=False), y='imbalance', x='cell', width=1200)

In [101]:
((counts_df['imbalance'] / 2 + 0.5) * counts_df['total']).sum() / counts_df['total'].sum()

0.5918831729938361

## Analyse Statistics of Drug Proteins

In [212]:
# create new df
drug_proteins = list(set(df_drug_protein['drug']))

synergy_count = np.zeros((len(drug_proteins)), dtype=int)

drug_protein_synergy_df = pd.DataFrame(data={
    'drug': drug_proteins,
    0: synergy_count.copy(),
    1: synergy_count.copy()
})

In [213]:
drug_protein_synergy_df['drug'].str.match('DB00544')

0      False
1      False
2      False
3      False
4      False
       ...  
759    False
760    False
761    False
762    False
763    False
Name: drug, Length: 764, dtype: bool

In [214]:
drug_protein_synergy_df.loc[drug_protein_synergy_df['drug'].str.match('DB00544'), [0]]

Unnamed: 0,0
263,0


In [216]:
lookup = dict(zip(drug_protein_synergy_df['drug'], drug_protein_synergy_df.index))

# count synergies for
for i in tqdm(df_drug_combinations.index):
    row = df_drug_combinations.iloc[i]
    drug1 = lookup[row['drug1_db']]
    drug2 = lookup[row['drug2_db']]

    if row['synergy'] <= 0:
        drug_protein_synergy_df.loc[drug1, [0]] += 1
        drug_protein_synergy_df.loc[drug2, [0]] += 1
    else:
        drug_protein_synergy_df.loc[drug1, [1]] += 1
        drug_protein_synergy_df.loc[drug2, [1]] += 1
    
drug_protein_synergy_df

100%|██████████| 69436/69436 [01:33<00:00, 745.19it/s]


Unnamed: 0,drug,0,1
0,DB01667,0,0
1,DB01006,1118,1121
2,DB06228,0,1
3,DB01217,1291,1053
4,DB03796,0,1
...,...,...,...
759,DB06717,0,0
760,DB00308,0,1
761,DB01132,1,0
762,DB04786,1,0


In [218]:
# remove drug proteins not being in a drug combination
no_0s = drug_protein_synergy_df[0] == 0
no_1s = drug_protein_synergy_df[1] == 0
print((no_0s & no_1s).sum())
dropped = drug_protein_synergy_df[(no_0s & no_1s)]
drug_counts_df = drug_protein_synergy_df.drop(index=dropped.index)
dropped

192


Unnamed: 0,drug,0,1
0,DB01667,0,0
7,DB00500,0,0
11,DB00812,0,0
15,DB00717,0,0
20,DB00864,0,0
...,...,...,...
747,DB00440,0,0
750,DB00536,0,0
753,DB00991,0,0
756,DB01186,0,0


In [219]:
drug_counts_df['total'] = drug_counts_df[0] + drug_counts_df[1]
drug_counts_df['0_relative'] = drug_counts_df[0] / drug_counts_df['total']
drug_counts_df['1_relative'] = drug_counts_df[1] / drug_counts_df['total']
drug_counts_df['imbalance'] = abs(0.5 - drug_counts_df['0_relative']) * 2
drug_counts_df

Unnamed: 0,drug,0,1,total,0_relative,1_relative,imbalance
1,DB01006,1118,1121,2239,0.499330,0.500670,0.001340
2,DB06228,0,1,1,0.000000,1.000000,1.000000
3,DB01217,1291,1053,2344,0.550768,0.449232,0.101536
4,DB03796,0,1,1,0.000000,1.000000,1.000000
5,DB00790,0,1,1,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...
758,DB08899,0,1,1,0.000000,1.000000,1.000000
760,DB00308,0,1,1,0.000000,1.000000,1.000000
761,DB01132,1,0,1,1.000000,0.000000,1.000000
762,DB04786,1,0,1,1.000000,0.000000,1.000000


In [220]:
drug_counts_df.sort_values(by='imbalance', ascending=False)

Unnamed: 0,drug,0,1,total,0_relative,1_relative,imbalance
378,DB00495,0,1,1,0.000000,1.000000,1.000000
497,DB00692,0,1,1,0.000000,1.000000,1.000000
495,DB00876,0,1,1,0.000000,1.000000,1.000000
494,DB00278,0,1,1,0.000000,1.000000,1.000000
493,DB04835,0,1,1,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...
580,DB00675,1157,1104,2261,0.511720,0.488280,0.023441
675,DB00553,1115,1129,2244,0.496881,0.503119,0.006239
538,DB08828,1314,1327,2641,0.497539,0.502461,0.004922
369,DB01229,1199,1191,2390,0.501674,0.498326,0.003347


In [222]:
px.bar(drug_counts_df.sort_values(by='imbalance', ascending=False), y='imbalance', x='drug', width=1200)

In [223]:
((drug_counts_df['imbalance'] / 2 + 0.5) * drug_counts_df['total']).sum() / drug_counts_df['total'].sum()

0.5983351575551588

## Calculate joint probability

In [224]:
df_drug_combinations

Unnamed: 0,Drug1,Drug2,cell,drug1_db,drug2_db,synergy
0,5-FU,BORTEZOMIB,A2058,DB00544,DB00188,0
1,5-FU,DASATINIB,A2058,DB00544,DB01254,1
2,5-FU,ERLOTINIB,A2058,DB00544,DB00530,1
3,5-FU,GELDANAMYCIN,A2058,DB00544,DB02424,1
4,5-FU,LAPATINIB,A2058,DB00544,DB01259,1
...,...,...,...,...,...,...
69431,Sunitinib malate,Toremifene citrate,HUH7,DB01268,DB00539,1
69432,Sunitinib malate,Aripiprazole,HUH7,DB01268,DB01238,0
69433,Toremifene citrate,Toremifene citrate,HUH7,DB00539,DB00539,0
69434,Toremifene citrate,Aripiprazole,HUH7,DB00539,DB01238,0


In [225]:
counts_df

synergy,cell,0,1,total,0_relative,1_relative,inbalance,imbalance
0,786O,965.0,620.0,1585.0,0.608833,0.391167,0.217666,0.217666
1,A2058,16.0,82.0,98.0,0.163265,0.836735,0.673469,0.673469
2,A2780,4.0,94.0,98.0,0.040816,0.959184,0.918367,0.918367
3,A375,27.0,72.0,99.0,0.272727,0.727273,0.454545,0.454545
4,A498,1107.0,482.0,1589.0,0.696665,0.303335,0.393329,0.393329
...,...,...,...,...,...,...,...,...
68,UACC257,846.0,753.0,1599.0,0.529081,0.470919,0.058161,0.058161
69,UACC62,1043.0,616.0,1659.0,0.628692,0.371308,0.257384,0.257384
70,VCAP,23.0,75.0,98.0,0.234694,0.765306,0.530612,0.530612
71,WM115,1.0,0.0,1.0,1.000000,0.000000,1.000000,1.000000


In [226]:
probs = {
    'cells': [],
    'drug1': [],
    'drug2': [],
    'avg': [],
    'median':[],
    'max': []
}
for i in tqdm(df_drug_combinations.index):
    row = df_drug_combinations.iloc[i]
    # print(row)
    cell_prob = counts_df.loc[counts_df['cell'].str.match(row['cell']), ['1_relative']].to_numpy()[0][0]
    drug1_prob = drug_counts_df.loc[drug_counts_df['drug'].str.match(row['drug1_db']), ['1_relative']].to_numpy()[0][0]
    drug2_prob = drug_counts_df.loc[drug_counts_df['drug'].str.match(row['drug2_db']), ['1_relative']].to_numpy()[0][0]
    # print(cell_prob, drug1_prob, drug2_prob)
    probs['cells'].append(cell_prob)
    probs['drug1'].append(drug1_prob)
    probs['drug2'].append(drug2_prob)
    probs['avg'].append(np.average([cell_prob, drug1_prob, drug2_prob]))
    probs['median'].append(np.median([cell_prob, drug1_prob, drug2_prob]))
    probs['max'].append(np.max([cell_prob, drug1_prob, drug2_prob]))


100%|██████████| 69436/69436 [01:13<00:00, 944.02it/s]


In [227]:
# average acc
avg = np.array(probs['avg'])
avg[avg > 0.5] = 1
avg[avg <= 0.5] = 0
avg = avg.astype(int)

(avg == df_drug_combinations['synergy']).sum()/len(avg)

0.6481508151391209

In [228]:
# median acc
median = np.array(probs['median'])
median[median > 0.5] = 1
median[median <= 0.5] = 0
median = median.astype(int)

(median == df_drug_combinations['synergy']).sum()/len(median)

0.6377959559882481

In [229]:
# max acc
max = np.array(probs['max'])
max[max > 0.5] = 1
max[max <= 0.5] = 0
max = max.astype(int)

(max == df_drug_combinations['synergy']).sum()/len(max)

0.5689555849991359

In [230]:
# experiment acc
combined_drugs = (np.array(probs['drug1']) + np.array(probs['drug2'])) / 2
combined_drugs[combined_drugs > 0.5] = 1
combined_drugs[combined_drugs <= 0.5] = 0
combined_drugs = combined_drugs.astype(int)

(combined_drugs == df_drug_combinations['synergy']).sum()/len(combined_drugs)

0.6408347255026211

In [231]:
# experiment acc
combined_drugs = (np.array(probs['cells']) + np.array(probs['drug2'])) / 2
combined_drugs[combined_drugs > 0.5] = 1
combined_drugs[combined_drugs <= 0.5] = 0
combined_drugs = combined_drugs.astype(int)

(combined_drugs == df_drug_combinations['synergy']).sum()/len(combined_drugs)

0.6187424390805922

In [232]:
# experiment acc
combined_drugs = (np.array(probs['drug1']) + np.array(probs['cells'])) / 2
combined_drugs[combined_drugs > 0.5] = 1
combined_drugs[combined_drugs <= 0.5] = 0
combined_drugs = combined_drugs.astype(int)

(combined_drugs == df_drug_combinations['synergy']).sum()/len(combined_drugs)

0.6277579353649404

In [233]:
combined_drugs = np.array(probs['drug1'])
combined_drugs[combined_drugs > 0.5] = 1
combined_drugs[combined_drugs <= 0.5] = 0
combined_drugs = combined_drugs.astype(int)

(combined_drugs == df_drug_combinations['synergy']).sum()/len(combined_drugs)

0.6046863298577106

In [234]:
combined_drugs = np.array(probs['drug2'])
combined_drugs[combined_drugs > 0.5] = 1
combined_drugs[combined_drugs <= 0.5] = 0
combined_drugs = combined_drugs.astype(int)

(combined_drugs == df_drug_combinations['synergy']).sum()/len(combined_drugs)

0.5919839852526068

In [235]:
combined_drugs = np.array(probs['cells'])
combined_drugs[combined_drugs > 0.5] = 1
combined_drugs[combined_drugs <= 0.5] = 0
combined_drugs = combined_drugs.astype(int)

(combined_drugs == df_drug_combinations['synergy']).sum()/len(combined_drugs)

0.5918831729938361

In [None]:
np.av