# Computing counts for selected compounds per team and target

In [36]:
import pandas as pd
import numpy as np
import copy

# Raise the pandas display limits used when frames are rendered in this notebook.
for option_name, option_value in (('display.max_columns', 500), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [26]:
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class "container" to 90% width.
wide_container_css = HTML("<style>.container { width:90% !important; }</style>")
display(wide_container_css)

In [27]:
# Load data and keep only the rows flagged with in_selected_minusDB == 1.
# The original note describes these as the synthesized compounds.
# NOTE(review): the frame also has a separate `synthesized` column -- confirm
# that in_selected_minusDB is the intended filter.
df = (
    pd.read_csv('../merged_submission_lists/df_complete.csv')
    .loc[lambda frame: frame['in_selected_minusDB'] == 1]
)

print(df.shape)

df.head()

(21033, 16)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,smiles,owner_info,team,target_subm,list_pos,target_selected,from_gtm,from_medoids,from_topranked,in_selected,in_selected_minusDB,plannedToSynth,synthesized,hit,target_hit,further-submission-info
0,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,n,22,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
1,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,s,1335,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
2,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,nsp12,1256,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
3,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,nsp3,1832,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
4,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,nsp5,6196,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,


In [28]:
# Number of distinct SMILES strings among the rows kept so far
unique_smiles = np.unique(df['smiles'].to_numpy())
print(unique_smiles.shape)

(11440,)


In [29]:
# Sorted array of the distinct team names present in df
np.unique(df['team'].to_numpy())

array(['ai4science', 'aiwinter', 'belarus', 'cermn', 'covid19ddc',
       'deeplab', 'imolecule', 'jku', 'kyuken', 'lambdazero', 'lci',
       'luxscreen', 'nuwave', 'pharmai', 'safan', 'sarstroopers',
       'sarswars', 'virtualflow', 'way2drug', 'yoda'], dtype=object)

In [30]:
# Sorted array of the distinct values in the target_selected column
np.unique(df['target_selected'].to_numpy())

array(['n', 'nsp12', 'nsp3', 'nsp5', 's', 'tmprss2'], dtype=object)

In [31]:
# Sorted array of the distinct values in the target_subm column
np.unique(df['target_subm'].to_numpy())

array(['aak1', 'furin', 'n', 'nsp1', 'nsp10-16', 'nsp12', 'nsp3', 'nsp5',
       's', 'tmprss2'], dtype=object)

In [32]:
# Restrict df to rows whose submitted target equals the selected target
targets_match = df['target_subm'].eq(df['target_selected'])
df = df.loc[targets_match]

In [33]:
# counts per team and target: number of distinct SMILES for each
# (target_selected, team) pair.
# 'smiles' is selected before aggregating so that nunique() is computed for
# that column only instead of for every column of df.
counts = (
    df.groupby(['target_selected', 'team'])['smiles']
    .nunique()
    .rename('count')
    # move 'team' out of the index; target_selected stays as the index
    .reset_index(level='team')
)

counts.head()

Unnamed: 0_level_0,team,count
target_selected,Unnamed: 1_level_1,Unnamed: 2_level_1
n,imolecule,1013
n,sarswars,472
n,virtualflow,547
nsp12,covid19ddc,82
nsp12,imolecule,358


In [37]:
# display count table

# create pivot table: one row per selected target, one column per team.
# `counts` holds each (target, team) pair at most once, so summing reproduces
# the stored count; pairs that do not occur are filled with 0.
count_table = pd.pivot_table(counts, values='count', index=['target_selected'], columns=['team'],
                             aggfunc='sum', fill_value=0)

# add missing teams as all-zero columns, appended after the existing ones
# (integer 0 rather than float zeros, so the table stays a table of counts)
included_teams = list(count_table.columns)
missing_teams = [team for team in np.unique(df['team'].values) if team not in included_teams]
count_table = count_table.reindex(columns=included_teams + missing_teams, fill_value=0)

# sort targets (targets not listed here are dropped by filter)
count_table = count_table.filter(items=['n', 'nsp3', 'nsp5', 'nsp12', 's', 'tmprss2'], axis=0)

# sum row and column
# DataFrame.append was removed in pandas 2.0, so the per-team totals are turned
# into a one-row frame labelled 'SUM' and attached with pd.concat instead.
team_sum = count_table.sum(axis=0)
team_sum.name = 'SUM'
sum_row = team_sum.to_frame().T.rename_axis(index=count_table.index.name)
count_table = pd.concat([count_table, sum_row])
# the row totals are computed after the SUM row is attached, so the bottom-right
# cell holds the grand total
count_table['SUM'] = count_table.sum(axis=1)

count_table

team,ai4science,aiwinter,belarus,cermn,covid19ddc,deeplab,imolecule,jku,kyuken,lambdazero,lci,luxscreen,nuwave,pharmai,safan,sarstroopers,sarswars,virtualflow,way2drug,yoda,SUM
target_selected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n,0,0,0,0,0,0,1013,0,0,0,0,0,0,0,0,0,472,547,0,0,2032
nsp3,0,64,0,73,79,60,81,86,81,0,1150,0,39,0,63,56,0,46,71,69,2018
nsp5,0,88,32,80,55,69,66,259,52,32,700,73,24,42,80,48,85,107,53,90,2035
nsp12,0,0,0,0,82,0,358,57,0,0,60,323,0,0,205,211,298,369,55,0,2018
s,0,0,67,0,0,160,402,0,424,0,0,0,0,288,0,108,0,219,0,337,2005
tmprss2,499,63,0,621,0,0,0,0,0,0,0,255,0,0,0,0,0,463,97,0,1998
SUM,499,215,99,774,216,289,1920,402,557,32,1910,651,63,330,348,423,855,1751,276,496,12106
