# MoA Prediction: Data understanding and visualization


**Info about the data files**

* **train_features.csv** - Features for the training set. Columns prefixed `g-` hold gene expression data and columns prefixed `c-` hold cell viability data. `cp_type` indicates whether a sample was treated with a compound (coded `trt_cp` in the data) or with a control perturbation (coded `ctl_vehicle`); control perturbations have no MoAs. `cp_time` and `cp_dose` indicate treatment duration (24, 48 or 72 hours) and dose (high or low; stored as the codes `D1` and `D2`).
* **train_targets_scored.csv** - The binary MoA targets that are scored.
* **train_targets_nonscored.csv** - Additional (optional) binary MoA responses for the training data. These are not predicted nor scored.
* **test_features.csv** - Features for the test data. You must predict the probability of each scored MoA for each row in the test data.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import time

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



# Directory holding the competition CSV files. It is joined to file names by
# plain string concatenation, so the trailing slash is required.
data_dir = './datasets/'

In [2]:
# Read the raw training features and the scored MoA targets from data_dir.
features_csv = data_dir+'train_features.csv'
targets_csv = data_dir+'train_targets_scored.csv'

dff_orig = pd.read_csv(features_csv)
dft_orig = pd.read_csv(targets_csv)

In [3]:
# Dimensions of the raw feature table, followed by a two-row preview.
print (dff_orig.shape)
dff_orig.head(2)

(23814, 876)


Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371


In [4]:
# Split the column names into gene-expression ("g-"), cell-viability ("c-")
# and remaining metadata columns.
# Matching on the full prefix with startswith() keeps metadata columns such as
# cp_type out of c_col, treats both groups the same way, and cannot raise
# IndexError on very short column names. rest_col is the exact complement of
# the two groups, so every column lands in exactly one list.
g_col = [f for f in dff_orig.columns.values if f.startswith('g-')]
print ( f"Total number of gene expression data features: {len(g_col)}" )
c_col = [f for f in dff_orig.columns.values if f.startswith('c-')]
print ( f"Total number of cell viability data features: {len(c_col)}" )
rest_col = [f for f in dff_orig.columns.values if not f.startswith(('g-', 'c-'))]
print ( f"Rest of the features: {rest_col}" )

Total number of gene expression data features: 772
Total number of cell viability data features: 100
Rest of the features: ['sig_id', 'cp_type', 'cp_time', 'cp_dose']


In [5]:
# Print the competition's description of cp_type, then the codes actually
# present in the data. As the output shows, the data uses 'trt_cp' and
# 'ctl_vehicle' rather than the names quoted in the description string.
print ("cp_type indicates samples treated with a compound (cp_vehicle) or with a control perturbation (ctrl_vehicle)")
print (dff_orig['cp_type'].unique())

cp_type indicates samples treated with a compound (cp_vehicle) or with a control perturbation (ctrl_vehicle)
['trt_cp' 'ctl_vehicle']


In [6]:
# Print the description of cp_time / cp_dose, then the distinct values found
# in the data (three durations; dose is stored as the codes 'D1' and 'D2').
print ("cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).")
print (f"cp_time unique vals: {dff_orig['cp_time'].unique()}")
print (f"cp_dose unique vals: {dff_orig['cp_dose'].unique()}")

cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).
cp_time unique vals: [24 72 48]
cp_dose unique vals: ['D1' 'D2']


In [7]:
# Summary statistics for the numeric feature columns (cp_time, g-*, c-*).
dff_orig.describe()

Unnamed: 0,cp_time,g-0,g-1,g-2,g-3,g-4,g-5,g-6,g-7,g-8,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
count,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,...,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0
mean,48.020156,0.248366,-0.095684,0.152253,0.081971,0.057347,-0.138836,0.035961,-0.202651,-0.190083,...,-0.469244,-0.461411,-0.513256,-0.500142,-0.507093,-0.353726,-0.463485,-0.378241,-0.470252,-0.301505
std,19.402807,1.393399,0.812363,1.035731,0.950012,1.032091,1.179388,0.882395,1.125494,1.749885,...,2.000488,2.042475,2.001714,2.107105,2.159589,1.629291,2.059725,1.703615,1.834828,1.407918
min,24.0,-5.513,-5.737,-9.104,-5.998,-6.369,-10.0,-10.0,-10.0,-10.0,...,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0,-10.0
25%,24.0,-0.473075,-0.5622,-0.43775,-0.429575,-0.470925,-0.602225,-0.4939,-0.525175,-0.511675,...,-0.566175,-0.565975,-0.589975,-0.5687,-0.563775,-0.567975,-0.552575,-0.561,-0.5926,-0.5629
50%,48.0,-0.00885,-0.0466,0.0752,0.00805,-0.0269,-0.01565,-0.00065,-0.0179,0.01,...,-0.0099,0.00325,-0.0091,-0.01375,-0.0033,-0.01025,-0.00125,-0.0068,0.014,-0.0195
75%,72.0,0.5257,0.403075,0.663925,0.4634,0.465375,0.510425,0.528725,0.4119,0.549225,...,0.45775,0.4615,0.445675,0.4529,0.4709,0.44475,0.465225,0.4464,0.461275,0.43865
max,72.0,10.0,5.039,8.257,10.0,10.0,7.282,7.333,5.473,8.887,...,4.069,3.96,3.927,3.596,3.747,2.814,3.505,2.924,3.111,3.805


### targets

In [8]:
# Dimensions of the scored-targets table, followed by a two-row preview.
print (dft_orig.shape)
dft_orig.head(2)

(23814, 207)


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Summary statistics of the 0/1 target columns; each column's mean is its
# fraction of positive samples.
dft_orig.describe()

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
count,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,...,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0,23814.0
mean,0.000714,0.000756,0.001008,0.007979,0.01264,0.003065,0.002268,0.004031,0.000504,0.011338,...,0.000252,0.00105,0.002016,0.01327,0.003065,0.000252,0.007139,0.001092,0.001638,0.00126
std,0.026709,0.027483,0.031731,0.088967,0.111716,0.055283,0.047566,0.063365,0.022443,0.105876,...,0.015871,0.032384,0.044851,0.114429,0.055283,0.015871,0.08419,0.033025,0.040436,0.035472
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Per-target maximum. A value of 1 means the target has at least one positive
# sample; the output shows this holds for every scored target.
dft_orig.describe().loc['max'].values

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1.])

## Few Ideas

* First take all the features as X values and take one of the targets as Y and use some classifier.
* Then we can use a multilabel classifier by including more targets at the same time.
* 

In [11]:
# Sample submission: shows the expected output layout (sig_id plus one column
# per scored target). Loaded through data_dir like the other CSV files so that
# changing the data location only needs one edit.
dfs_orig = pd.read_csv(data_dir+'sample_submission.csv')
dfs_orig.head(2)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,id_001897cda,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


### Early model

### Logistic Regression with only one target

In [12]:
# Drop the row identifier and the two string-coded columns so only numeric
# columns remain (cp_time plus the g-* / c-* features).
# NOTE(review): this discards cp_type and cp_dose entirely instead of encoding them.
dff = dff_orig.drop(['sig_id', 'cp_type', 'cp_dose'], axis=1)

In [13]:
# Three columns were dropped, so the width goes from 876 to 873.
print ( dff.shape )
#dff.describe()

(23814, 873)


In [14]:
# Feature matrix as a NumPy array: one row per sample, one column per
# remaining numeric feature.
X = dff.values
X.shape

(23814, 873)

In [15]:
# Target table without the sig_id identifier, leaving only the 0/1 MoA columns.
dft = dft_orig.drop('sig_id', axis=1)

In [16]:
# Two-row preview of the target table after removing sig_id.
dft.head(2)

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Single binary target for the first experiment. Positives are rare for this
# column (mean of about 0.001 in the describe() output above).
y = dft['acat_inhibitor'].values
y.shape

(23814,)

In [18]:


# Hold out 20% of the rows for validation. A fixed random_state makes the split
# (and therefore the scores reported below) reproducible across kernel restarts.
(X_train, X_val, y_train, y_val) = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Logistic regression on the single 'acat_inhibitor' target.
# NOTE(review): LogisticRegression is already imported in the first cell, so
# this re-import is redundant.
from sklearn.linear_model import LogisticRegression

# verbose=1 produces the solver log shown in this cell's output.
lr = LogisticRegression(max_iter=500, verbose=1)
lr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s finished


LogisticRegression(max_iter=500, verbose=1)

In [20]:
# Hard class predictions (not probabilities) for the training and validation rows.
ypreds_train = lr.predict(X_train)
ypreds_val = lr.predict(X_val)

In [21]:

# Accuracy on both splits. With only about 0.1% positives for this target,
# predicting all zeros would already score about 99.9%, so these numbers say
# little about how well the positive class is detected.
acc_train = accuracy_score(y_train, ypreds_train)
acc_val   = accuracy_score(y_val, ypreds_val)
print (acc_train, acc_val)

1.0 0.9991601931555742


### Logistic Regression with more than one target

https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html

In [22]:
# Reminder of the target table layout before selecting several target columns.
dft.head(2)

Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Sanity check: selecting the first four target columns gives an
# (n_samples, 4) label matrix.
first_four_targets = dft.columns[:4]
tmp = dft[first_four_targets].values
tmp.shape

(23814, 4)

### Run Time analysis

In [28]:
def LRwith_Multiple_targets(Ntargets=2, dff=dff, dft=dft, Nrows=None,
                            test_size=0.2, max_iter=1000, random_state=None):
    """Fit one logistic regression per target and time the whole experiment.

    Parameters
    ----------
    Ntargets : int
        Number of target columns to use; the first ``Ntargets`` columns of
        ``dft`` are taken.
    dff, dft : pandas.DataFrame
        Feature table and target table. The defaults are the module-level
        ``dff`` / ``dft`` as they exist when this cell is executed (default
        arguments are bound at definition time), so re-run this cell after
        changing either frame.
    Nrows : int or None
        If truthy, only the first ``Nrows`` rows are used; ``None`` (or 0)
        uses every row.
    test_size : float
        Fraction of rows held out for validation.
    max_iter : int
        Iteration limit passed to each ``LogisticRegression``.
    random_state : int or None
        Seed for the train/validation split. ``None`` keeps the original
        unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(train accuracy %, validation accuracy %, elapsed seconds)``, the
        percentages rounded to two decimals. The elapsed time covers slicing,
        splitting, fitting and both prediction passes.

    Notes
    -----
    ``y`` is two-dimensional here, so ``accuracy_score`` reports subset
    accuracy: a row counts as correct only when every selected target is
    predicted correctly.
    """
    t0 = time.perf_counter()

    # Slicing with None selects every row, so one code path serves both cases.
    row_limit = Nrows if Nrows else None
    target_cols = dft.columns[:Ntargets]
    X = dff.values[:row_limit, :]
    y = dft[target_cols].values[:row_limit, :]

    (X_train, X_val, y_train, y_val) = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # MultiOutputClassifier fits an independent copy of the estimator per target column.
    clf = MultiOutputClassifier(LogisticRegression(max_iter=max_iter))
    clf.fit(X_train, y_train)

    acc_train = accuracy_score(y_train, clf.predict(X_train))
    acc_val = accuracy_score(y_val, clf.predict(X_val))

    elapsed = time.perf_counter() - t0
    return (np.round(100*acc_train, 2), np.round(100*acc_val, 2), elapsed)



In [25]:

# Benchmark on all rows: accuracy and run time as the number of targets grows.
results = []
for N in (1, 2, 5, 10):
    acc_train, acc_val, dt = LRwith_Multiple_targets(Ntargets=N)
    results.append([N, acc_train, acc_val, dt])

result_columns = ["Ntargets", "accuracy train (%)", "accuracy valid (%)", "time taken"]
res_df = pd.DataFrame(results, columns=result_columns).set_index("Ntargets")
res_df
    

Unnamed: 0_level_0,accuracy train (%),accuracy valid (%),time taken
Ntargets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,100.0,99.9,0.941175
2,100.0,99.75,1.026269
5,100.0,95.95,7.823258
10,100.0,92.5,15.823974


In [26]:
# Same benchmark restricted to the first 15000 rows.
results = []
for N in (1, 2, 5, 10):
    acc_train, acc_val, dt = LRwith_Multiple_targets(Ntargets=N, Nrows=15000)
    results.append([N, acc_train, acc_val, dt])

result_columns = ["Ntargets", "accuracy train (%)", "accuracy valid (%)", "time taken"]
res_df = pd.DataFrame(results, columns=result_columns).set_index("Ntargets")
res_df
    

Unnamed: 0_level_0,accuracy train (%),accuracy valid (%),time taken
Ntargets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,100.0,99.9,1.640579
2,100.0,99.9,3.522015
5,99.72,94.4,16.886778
10,99.62,92.5,31.682517


In [27]:
# Same benchmark restricted to the first 20000 rows.
results = []
for N in (1, 2, 5, 10):
    acc_train, acc_val, dt = LRwith_Multiple_targets(Ntargets=N, Nrows=20000)
    results.append([N, acc_train, acc_val, dt])

result_columns = ["Ntargets", "accuracy train (%)", "accuracy valid (%)", "time taken"]
res_df = pd.DataFrame(results, columns=result_columns).set_index("Ntargets")
res_df
    

Unnamed: 0_level_0,accuracy train (%),accuracy valid (%),time taken
Ntargets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,100.0,99.92,3.128599
2,100.0,99.8,3.748001
5,98.92,94.9,23.13668
10,98.28,91.75,48.462351


In [29]:
# Re-run on all rows for 1, 5 and 10 targets.
results = []
for N in (1, 5, 10):
    acc_train, acc_val, dt = LRwith_Multiple_targets(Ntargets=N)
    results.append([N, acc_train, acc_val, dt])

result_columns = ["Ntargets", "accuracy train (%)", "accuracy valid (%)", "time taken"]
res_df = pd.DataFrame(results, columns=result_columns).set_index("Ntargets")
res_df
    

Unnamed: 0_level_0,accuracy train (%),accuracy valid (%),time taken
Ntargets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,100.0,99.98,3.954542
5,98.36,95.36,34.06649
10,97.71,92.5,71.048851


In [40]:
# Toy multilabel example to see how MultiOutputClassifier and accuracy_score
# behave on a 2-D label matrix.
# NOTE(review): this cell overwrites the global X and y built from the MoA data.
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

X, y = make_multilabel_classification(
    n_samples=500,
    n_features=30,
    n_classes=10,
    random_state=0,
)
print (f"X.shape:{X.shape}, y.shape:{y.shape}")

clf = MultiOutputClassifier(KNeighborsClassifier())
clf.fit(X, y)

# Predict the last two samples and compare them with their true label rows.
ypreds = clf.predict(X[-2:])
print (y[-2:],'\n\n',ypreds)
accuracy_score(y[-2:], ypreds)

X.shape:(500, 30), y.shape:(500, 10)
[[0 0 1 0 1 0 0 1 0 0]
 [0 0 1 1 1 0 0 1 1 0]] 

 [[0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0]]


0.0