In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/sample_submission.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/kaggle/input/lish-moa/train_targets_nonscored.csv
/kaggle/input/train-folds/train_folds.csv


In [2]:
# !pip install iterative-stratification
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [3]:
import pandas as pd
import numpy as np
import optuna
from functools import partial

In [4]:
# The commented-out block below is the recipe that would produce "train_folds.csv":
# shuffle the scored targets, then tag every row with a `kfold` id (0-4) using
# MultilabelStratifiedKFold from the `iterstrat` package (its import is also
# commented out in an earlier cell). It is kept for reference only.
# NOTE(review): presumably the CSV loaded at the bottom of this cell was generated
# by this recipe and uploaded as the "train-folds" dataset — confirm.
# df_train_folds=pd.read_csv("../input/lish-moa/train_targets_scored.csv")
# df_train_folds.loc[:,"kfold"]=-1
# df_train_folds=df_train_folds.sample(frac=1).reset_index(drop=True)
# targets=df_train_folds.drop("sig_id",axis=1).values

# mskf=MultilabelStratifiedKFold(n_splits=5)
# for fold,(trn,val) in enumerate(mskf.split(X=df_train_folds,y=targets)):
#     df_train_folds.loc[val,"kfold"]=fold

# df_train_folds.to_csv("train_folds.csv",index=False)
# Load the fold-assignment table; later cells read its "sig_id" and "kfold"
# columns and treat every remaining column as a training target.
df_train_folds=pd.read_csv("../input/train-folds/train_folds.csv")

In [5]:
##Dataset
import torch
import torch.nn as nn

class MoaDataset:
    """Map-style dataset that pairs rows of an input array with rows of a label array.

    Naming caveat: ``self.dataset`` stores the model inputs, while ``self.feature``
    (filled from the ``features`` argument) stores what is returned under ``"y"``.
    Both are indexed as ``array[item, :]``, so they must support 2-D indexing.
    """
    def __init__(self,dataset,features):
        self.dataset,self.feature=dataset,features

    def __len__(self):
        # One sample per row of the input array.
        return self.dataset.shape[0]

    def __getitem__(self,item):
        """Return row ``item`` as a dict of float32 tensors with keys "x" and "y"."""
        inputs=torch.tensor(self.dataset[item,:],dtype=torch.float)
        labels=torch.tensor(self.feature[item,:],dtype=torch.float)
        return {"x":inputs,"y":labels}


In [6]:
## Model
class Model(nn.Module):
    """Fully connected network: ``num_layers`` hidden blocks followed by a linear head.

    Each hidden block is Linear -> BatchNorm1d -> Dropout -> ReLU. The first block
    maps ``num_features`` to ``hidden_size``; every later block keeps ``hidden_size``.
    The head maps ``hidden_size`` to ``num_targets`` and returns raw logits
    (no sigmoid is applied here).
    """
    def __init__(self,num_features,num_targets,num_layers,hidden_size,dropout):
        super().__init__()
        blocks=[]
        in_width=num_features
        for _ in range(num_layers):
            blocks.extend([
                nn.Linear(in_width,hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.Dropout(dropout),
                nn.ReLU(),
            ])
            # Only the first block sees the raw feature width.
            in_width=hidden_size
        # The head always expects hidden_size inputs, even when num_layers is 0.
        blocks.append(nn.Linear(hidden_size,num_targets))

        self.model=nn.Sequential(*blocks)

    def forward(self,x):
        """Run ``x`` through the sequential stack and return the logits."""
        return self.model(x)
    
    

In [7]:
class MyUtil:
    """Namespace for small DataFrame helpers used during feature preparation."""

    def __init__(self):
        # Previously declared without `self`, which made `MyUtil()` raise TypeError.
        pass

    @staticmethod
    def add_dummies(data,column):
        """Replace ``column`` in ``data`` with one-hot indicator columns.

        The indicator columns are named ``f"{column}_{value}"`` for each distinct
        value found in ``data[column]``. The input frame is not modified; a new
        frame is returned with ``column`` dropped and the indicators joined on
        the index.

        Declared as a static method so that both ``MyUtil.add_dummies(df, col)``
        and ``MyUtil().add_dummies(df, col)`` work.
        """
        new_data=pd.get_dummies(data[column])
        new_data.columns=[f"{column}_{c}" for c in new_data.columns]
        data=data.drop(column,axis=1)
        data=data.join(new_data)
        return data


In [8]:
from sklearn.preprocessing import Normalizer

# Load the train and test feature tables from the competition input directory.
path="/kaggle/input/lish-moa/"
file="train_features.csv"
df_features=pd.read_csv(path+file)
path="/kaggle/input/lish-moa/"
file="test_features.csv"
df_test=pd.read_csv(path+file)

# Remember how many rows are training rows so the combined frame can be split again below.
num_train=df_features.shape[0]
# Stack train on top of test (fresh 0..n-1 index) so both receive exactly the same
# feature engineering, in particular the same set of dummy columns.
df_features=pd.concat([df_features,df_test],ignore_index=True)

# One-hot encode the two categorical columns; add_dummies drops the original column.
df_features=MyUtil.add_dummies(df_features,"cp_time")
df_features=MyUtil.add_dummies(df_features,"cp_dose")
# Per-row mean over every column whose name contains "g-" and, separately, "c-".
# NOTE(review): this is a substring match, not a prefix match — confirm no other
# column name happens to contain "g-" or "c-".
df_features["g_mean"]=df_features[[x for x in df_features.columns if "g-" in x]].mean(axis=1)
df_features["c_mean"]=df_features[[x for x in df_features.columns if "c-" in x]].mean(axis=1)

# Set the sig_id key aside and drop it together with cp_type before normalising;
# cp_type is discarded here without being encoded.
sig_id=df_features.sig_id
df_features=df_features.drop(["sig_id","cp_type"],axis=1)
# NOTE(review): per the scikit-learn docs, Normalizer rescales each row (sample)
# independently, so fitting on train+test together should not leak test statistics —
# confirm that per-row scaling (rather than per-column standardisation) is intended.
nor=Normalizer().fit(df_features)
df_features=pd.DataFrame(nor.transform(df_features),columns=df_features.columns)
# Re-attach the key; both sides carry the same 0..n-1 index, so rows line up.
df_features["sig_id"]=sig_id

# Split back by position: the first num_train rows are train, the remainder test.
df_test=df_features.iloc[num_train:,:]
df_features=df_features.iloc[:num_train,:]
# Alias (not a copy) of the fold table loaded in an earlier cell.
df_scored=df_train_folds

In [9]:
## Engine

class Engine:
    """Bundles a model, its optimizer and a device into train / validate / predict loops.

    Every ``data_loader`` argument is an iterable of dict batches. ``train`` and
    ``validate`` read the "x" and "y" entries and also call ``len(data_loader)``;
    ``predict`` reads only "x".
    """
    def __init__(self,model,optimizer,device):
        self.model=model
        self.optimizer=optimizer  # may be None when the engine is used for inference only
        self.device=device

    def loss_fn(self,targets,outputs):
        """Binary cross-entropy on raw logits. Note the argument order: targets first."""
        return nn.BCEWithLogitsLoss()(outputs,targets)

    def train(self,data_loader):
        """Run one optimisation pass over ``data_loader`` and return the mean batch loss."""
        self.model.train()
        final_loss=0
        for data in data_loader:
            self.optimizer.zero_grad()
            inputs=data["x"].to(self.device)
            targets=data["y"].to(self.device)
            outputs=self.model(inputs)
            loss=self.loss_fn(targets,outputs)
            loss.backward()
            self.optimizer.step()
            final_loss += loss.item()
        return final_loss/len(data_loader)

    def validate(self,data_loader):
        """Return the mean batch loss over ``data_loader`` without updating the model."""
        self.model.eval()
        final_loss=0
        # No backward pass happens here, so skip autograd bookkeeping entirely;
        # this avoids building a graph for every validation batch.
        with torch.no_grad():
            for data in data_loader:
                inputs=data["x"].to(self.device)
                targets=data["y"].to(self.device)
                outputs=self.model(inputs)
                loss=self.loss_fn(targets,outputs)
                final_loss += loss.item()
        return final_loss/len(data_loader)

    def predict(self,data_loader):
        """Return a list with one numpy array of sigmoid probabilities per batch."""
        self.model.eval()
        final_predictions = []
        # Inference only: disable gradient tracking for the forward passes.
        with torch.no_grad():
            for data in data_loader:
                inputs=data["x"].to(self.device)
                predictions = self.model(inputs)
                predictions = predictions.sigmoid().cpu()
                final_predictions.append(predictions.numpy())
        return final_predictions
        


In [10]:
def train(fold,save_model=False):
    """Train one model with ``fold`` held out for validation and return it.

    Rows of the module-level ``df_features`` are merged with ``df_scored`` on
    "sig_id"; rows whose ``kfold`` equals ``fold`` form the validation set and all
    other rows the training set. Training uses Adam with ReduceLROnPlateau on the
    validation loss and stops once the validation loss has failed to improve for
    more than ``early_stopping`` consecutive epochs.

    Args:
        fold: value of the ``kfold`` column to hold out for validation.
        save_model: when True, the best weights seen so far are written to
            ``model_fold_{fold}.bin`` each time the validation loss improves.
            (This flag used to be ignored and the file was always written.)

    Returns:
        The model, with the weights from the best validation epoch restored.
        (The function previously returned None even though callers bind the result.)
    """
    # select cols
    targets_cols=df_scored.drop(["sig_id","kfold"],axis=1).columns
    features_cols=df_features.drop(["sig_id"],axis=1).columns

    # Data Merge
    df_all=df_features.merge(df_scored,on="sig_id",how="left")

    # Dataset
    train_df=df_all[df_all.kfold!=fold].reset_index(drop=True)
    valid_df=df_all[df_all.kfold==fold].reset_index(drop=True)

    x_train=train_df[features_cols].to_numpy()
    x_valid=valid_df[features_cols].to_numpy()
    y_train=train_df[targets_cols].to_numpy()
    y_valid=valid_df[targets_cols].to_numpy()

    # DataLoader
    train_dataset=MoaDataset(x_train,y_train)
    train_loader=torch.utils.data.DataLoader(
        train_dataset,batch_size=1024,num_workers=8,shuffle=True
    )
    valid_dataset=MoaDataset(x_valid,y_valid)
    valid_loader=torch.utils.data.DataLoader(
        valid_dataset,batch_size=1024,num_workers=8,shuffle=False
    )

    # Model,Optimizer, scheduler, engine
    model=Model(
        num_features=x_train.shape[1],
        num_targets=y_train.shape[1],
        num_layers=5,
        hidden_size=2048,
        dropout=0.3
    )

    device="cuda" if torch.cuda.is_available() else "cpu"
    print(device,f"inputs:{x_train.shape[1]}, targets:{y_train.shape[1]}")
    model.to(device)
    optimizer=torch.optim.Adam(model.parameters(),lr=3e-4)
    scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,patience=3,threshold=1e-5,mode="min",verbose=True
    )

    engine=Engine(model,optimizer,device)
    best_loss=np.inf
    best_state=None  # snapshot of the weights at the best validation epoch
    early_stopping=10
    early_stopping_cnt=0
    EPOCH=300
    for epoch in range(EPOCH):
        train_loss=engine.train(train_loader)
        valid_loss=engine.validate(valid_loader)
        scheduler.step(valid_loss)

        if valid_loss<best_loss:
            best_loss=valid_loss
            # state_dict() returns references to the live tensors, so clone them;
            # otherwise later epochs would silently overwrite the snapshot.
            best_state={k:v.detach().clone() for k,v in model.state_dict().items()}
            if save_model:
                torch.save(best_state,f"model_fold_{fold}.bin")
            print(f"fold={fold}, epoch={epoch}, train_loss={train_loss:.6f}, valid_loss={valid_loss:.6f}")
            early_stopping_cnt=0
        else:
            early_stopping_cnt+=1
        if early_stopping_cnt>early_stopping:
            break

    print(f"fold={fold}, best val loss={best_loss}")

    # Hand back the best-epoch weights rather than whatever the last epoch left behind.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [11]:
def predict_upload(fold,num_targets=206):
    """Load the checkpoint saved for ``fold`` and predict on the module-level ``df_test``.

    Args:
        fold: fold id whose checkpoint ``./model_fold_{fold}.bin`` is loaded.
        num_targets: width of the model's output layer. Defaults to 206, the value
            that was previously hard-coded; it must match the checkpoint.

    Returns:
        A 2-D numpy array of sigmoid probabilities, one row per row of ``df_test``
        and one column per target.
    """
    df=df_test#pd.read_csv("./test_dummies.csv")
    features_cols=df.drop(["sig_id"],axis=1).columns
    x_test=df[features_cols].to_numpy()
    # MoaDataset always returns a "y" entry, so supply an all-zero placeholder;
    # Engine.predict never reads it.
    y_test=np.zeros((df.shape[0],num_targets))
    test_dataset=MoaDataset(x_test,y_test)
    test_loader=torch.utils.data.DataLoader(
        test_dataset,batch_size=1024,num_workers=8,shuffle=False
    )

    device="cuda" if torch.cuda.is_available() else "cpu"
    # The architecture must mirror the one built in train() for the weights to load.
    model=Model(
        num_features=x_test.shape[1],
        num_targets=y_test.shape[1],
        num_layers=5,
        hidden_size=2048,
        dropout=0.3
    )
    print(device,f"inputs:{x_test.shape[1]}, targets:{y_test.shape[1]}")

    model_save_path=f"./model_fold_{fold}.bin"
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_save_path,map_location=device))
    model=model.to(device)

    engine=Engine(model,None,device)
    preds=engine.predict(test_loader)
    preds=np.vstack(preds)
    return preds

In [12]:
# Train each of the five folds in turn and predict on the test set right after,
# so every fold's checkpoint is consumed immediately after it is written.
fold_preds=[]
for fold_id in range(5):
    model=train(fold_id,save_model=True)
    fold_preds.append(predict_upload(fold_id))

# Keep the per-fold names that the ensembling cell below expects.
p1, p2, p3, p4, p5 = fold_preds

cuda inputs:879, targets:206
fold=0, epoch=0, train_loss=0.219413, valid_loss=0.191079
fold=0, epoch=1, train_loss=0.028086, valid_loss=0.036350
fold=0, epoch=2, train_loss=0.021112, valid_loss=0.024614
fold=0, epoch=3, train_loss=0.019457, valid_loss=0.020825
fold=0, epoch=4, train_loss=0.018410, valid_loss=0.019827
fold=0, epoch=5, train_loss=0.017626, valid_loss=0.018620
fold=0, epoch=6, train_loss=0.016958, valid_loss=0.017981
fold=0, epoch=7, train_loss=0.016373, valid_loss=0.017460
fold=0, epoch=8, train_loss=0.015855, valid_loss=0.017036
fold=0, epoch=9, train_loss=0.015363, valid_loss=0.016790
fold=0, epoch=10, train_loss=0.014831, valid_loss=0.016738
fold=0, epoch=11, train_loss=0.014335, valid_loss=0.016479
fold=0, epoch=12, train_loss=0.013810, valid_loss=0.016454
fold=0, epoch=13, train_loss=0.013239, valid_loss=0.016408
Epoch    18: reducing learning rate of group 0 to 3.0000e-05.
Epoch    22: reducing learning rate of group 0 to 3.0000e-06.
fold=0, best val loss=0.0164079

In [13]:
# Ensemble: unweighted mean of the five per-fold test predictions.
predictions = sum((p1, p2, p3, p4, p5)) / 5

# Reuse the sample submission as a template: keep its first column (sig_id),
# overwrite every target column with the averaged predictions.
sample = pd.read_csv("/kaggle/input/lish-moa/sample_submission.csv")
sample.iloc[:, 1:] = predictions
sample.to_csv('submission.csv', index=False)

In [14]:
# Read the submission file back from disk; `my` is not used by any later cell visible here.
my=pd.read_csv("submission.csv")

In [15]:
# Display the in-memory submission frame (sig_id plus one probability column per target).
sample

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001925,0.002137,0.002105,0.019250,0.020607,0.004406,0.003652,0.007776,0.001285,...,0.001465,0.002898,0.002937,0.000936,0.001265,0.001684,0.001260,0.002381,0.004634,0.002537
1,id_001897cda,0.001449,0.001946,0.002855,0.002968,0.002278,0.004176,0.003435,0.010954,0.003432,...,0.001808,0.003245,0.003715,0.000798,0.018669,0.002273,0.014027,0.002135,0.005617,0.002916
2,id_002429b5b,0.000425,0.000405,0.001178,0.005796,0.003791,0.001265,0.002050,0.000814,0.000590,...,0.000883,0.000763,0.001839,0.003829,0.002559,0.000773,0.002411,0.001310,0.001169,0.001362
3,id_00276f245,0.000767,0.000716,0.001411,0.005262,0.001309,0.003326,0.001552,0.001700,0.000894,...,0.001195,0.001788,0.001837,0.021892,0.004601,0.000922,0.001491,0.001560,0.001979,0.001474
4,id_0027f1083,0.001648,0.001202,0.002150,0.011369,0.005787,0.002417,0.003552,0.001861,0.001250,...,0.001627,0.001465,0.003975,0.002748,0.001311,0.001212,0.002446,0.001998,0.002115,0.001684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3977,id_ff7004b87,0.001297,0.001245,0.001265,0.002179,0.005801,0.001876,0.001487,0.003505,0.001556,...,0.001182,0.003781,0.002201,0.029117,0.014772,0.001088,0.007035,0.001478,0.001722,0.001459
3978,id_ff925dd0d,0.002933,0.002355,0.001647,0.008402,0.020731,0.004801,0.005061,0.004537,0.001542,...,0.001157,0.001317,0.003058,0.000919,0.001537,0.001633,0.004170,0.001659,0.002725,0.001559
3979,id_ffb710450,0.001696,0.001399,0.001312,0.009131,0.100891,0.008452,0.003396,0.004886,0.001142,...,0.001077,0.001360,0.001652,0.000607,0.001585,0.001057,0.000806,0.001169,0.001641,0.001681
3980,id_ffbb869f2,0.001031,0.001206,0.001459,0.026682,0.018192,0.003014,0.002495,0.002572,0.001288,...,0.000913,0.000890,0.002983,0.000635,0.001419,0.001042,0.001689,0.001791,0.002265,0.002421
