# Computing counts for selected compounds per team and target

In [1]:
import pandas as pd
import numpy as np
import copy

# Display settings for this notebook: up to 500 columns and a display width
# of 1000, so wide frames are not truncated or wrapped early.
for option_name, option_value in (('display.max_columns', 500), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

In [2]:
from IPython.display import display, HTML

# Inject a CSS rule that sets elements with class "container" to 90% width,
# giving wide tables more horizontal room in the rendered notebook.
container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(container_css))

In [3]:
# Load the merged submission list and keep only the rows flagged as selected
# (in_selected_minusDB == 1); every later cell works on this subset.
df = (
    pd.read_csv('../../merged_submission_lists/df_complete.csv')
    .loc[lambda frame: frame['in_selected_minusDB'] == 1]
)

print(df.shape)

df.head()

(21033, 16)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,smiles,owner_info,team,target_subm,list_pos,target_selected,from_gtm,from_medoids,from_topranked,in_selected,in_selected_minusDB,plannedToSynth,synthesized,hit,target_hit,further-submission-info
0,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,n,22,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
1,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,s,1335,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
2,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,nsp12,1256,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
3,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,nsp3,1832,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,
4,Oc1ccc2c(NC(=O)C3Cc4ccccc4C3)n[nH]c2c1,N_mlinch.csv-POS22+S_mlinch.csv-POS1335+nsp12_...,imolecule,nsp5,6196,n,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,,


In [4]:
# number of distinct SMILES strings among the selected compounds
unique_smiles = np.unique(df['smiles'])
print(unique_smiles.shape)

(11440,)


In [5]:
# sorted, de-duplicated team names found in the selected rows
np.unique(df['team'].to_numpy())

array(['ai4science', 'aiwinter', 'belarus', 'cermn', 'covid19ddc',
       'deeplab', 'imolecule', 'jku', 'kyuken', 'lambdazero', 'lci',
       'luxscreen', 'nuwave', 'pharmai', 'safan', 'sarstroopers',
       'sarswars', 'virtualflow', 'way2drug', 'yoda'], dtype=object)

In [6]:
# sorted, de-duplicated values of the target_selected column
np.unique(df['target_selected'].to_numpy())

array(['n', 'nsp12', 'nsp3', 'nsp5', 's', 'tmprss2'], dtype=object)

In [7]:
# sorted, de-duplicated values of the target_subm column (targets submitted)
np.unique(df['target_subm'].to_numpy())

array(['aak1', 'furin', 'n', 'nsp1', 'nsp10-16', 'nsp12', 'nsp3', 'nsp5',
       's', 'tmprss2'], dtype=object)

In [8]:
# counts per team and target: number of distinct SMILES for every
# (target_selected, team) pair.
# Selecting 'smiles' before nunique() avoids computing distinct counts for
# all the other columns only to throw them away.
counts = (
    df.groupby(['target_selected', 'team'])['smiles']
    .nunique()
    .rename('count')
    # keep target_selected as the index, move team into a regular column
    .reset_index(level='team')
)

counts.head()

Unnamed: 0_level_0,team,count
target_selected,Unnamed: 1_level_1,Unnamed: 2_level_1
n,ai4science,60
n,cermn,9
n,imolecule,1013
n,luxscreen,3
n,pharmai,5


In [9]:
# display count table

# create pivot table: one row per selected target, one column per team.
# `counts` comes from a groupby on (target_selected, team), so each cell
# aggregates a single value; 'sum' carries it over unchanged and keeps the
# counts as integers. Pairs without any compound are filled with 0.
count_table = pd.pivot_table(
    counts,
    values='count',
    index=['target_selected'],
    columns=['team'],
    aggfunc='sum',
    fill_value=0,
)

# add missing teams: every team present in df gets a column; teams absent
# from the pivot are filled with integer zeros (np.zeros would make them float)
count_table = count_table.reindex(columns=np.unique(df['team'].values), fill_value=0)

# sort targets (targets not listed here are dropped from the table)
count_table = count_table.filter(items=['n', 'nsp3', 'nsp5', 'nsp12', 's', 'tmprss2'], axis=0)

# sum row and column
# DataFrame.append was removed in pandas 2.0, so the totals row is built as a
# one-row frame and attached with pd.concat instead.
team_sum = count_table.sum(axis=0)
team_sum.name = 'SUM'
sum_row = team_sum.to_frame().T
sum_row.index.name = count_table.index.name  # keep the index header after concat
count_table = pd.concat([count_table, sum_row])
count_table['SUM'] = count_table.sum(axis=1)

count_table

team,ai4science,aiwinter,belarus,cermn,covid19ddc,deeplab,imolecule,jku,kyuken,lambdazero,lci,luxscreen,nuwave,pharmai,safan,sarstroopers,sarswars,virtualflow,way2drug,yoda,SUM
target_selected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n,60,0,0,9,0,0,1013,0,0,0,0,3,0,5,3,20,472,547,3,0,2135
nsp3,19,64,0,77,79,60,81,86,81,0,1150,5,39,16,67,61,2,53,78,69,2087
nsp5,14,88,32,82,55,69,71,259,52,32,700,76,26,43,81,58,86,112,59,90,2085
nsp12,25,1,1,5,82,1,358,63,0,0,60,331,9,15,209,215,299,372,57,1,2104
s,39,1,68,4,1,160,403,0,424,0,1,9,2,293,14,132,0,219,11,341,2122
tmprss2,499,63,0,626,0,2,17,0,0,0,2,256,4,3,7,29,0,487,98,2,2095
SUM,656,217,101,803,217,292,1943,408,557,32,1913,680,80,375,381,515,859,1790,306,503,12628
