In [1]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('../input/iterative-stratification/iterative-stratification-master')
# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


In [2]:
# Read data
data_dir = '../data/'
train_features = pd.read_csv(data_dir + 'train_features.csv')
train_targets_scored = pd.read_csv(data_dir + 'train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(data_dir + 'train_targets_nonscored.csv')

test_features = pd.read_csv(data_dir + 'test_features.csv')
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

In [3]:
# select Genes col and cells col
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]
target_cols = train_targets_scored.drop('sig_id', axis=1).columns.values.tolist()

In [4]:
# global seed for every envirment
global_random_seed = 42
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=global_random_seed)

# Remove outliers

In [7]:
# normalize data, drop outliers beyond 4 times standard
train_ = train_features.copy()
drop_set = set()
for col in GENES:
    
    mean = train_[col].mean()
    std = train_[col].std()

    std_r = mean + 4*std
    std_l = mean - 4*std

    drop_set = drop_set | set(train_[col][(train_[col]>std_r) | (train_[col]<std_l)].index.values)

train_features = train_features.drop(drop_set).reset_index(drop=True)
train_targets_scored = train_targets_scored.drop(drop_set).reset_index(drop=True)


# PCA features + Existing features

In [8]:
# Because there are lots of genes expression and cols, some of them are highly correlated
# which means we can cut off some unnecessary features to make data cleaner
# GENES
n_comp = 50

data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])
data2 = (PCA(n_components=n_comp, random_state=global_random_seed).fit_transform(data[GENES]))
# split the pca processed file back to train and test
train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]

# then use the pca sampled data to generate a new dataframe
train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(n_comp)])
# concat the PCA processed df back to original one
# drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [9]:
#CELLS
n_comp = 15

data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
data2 = (PCA(n_components=n_comp, random_state=42).fit_transform(data[CELLS]))

train2 = data2[:train_features.shape[0]]
test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

# feature Selection using Variance Encoding

In [10]:
from sklearn.feature_selection import VarianceThreshold
# use variance threshold to collect columns
# remove low variance columns and features
var_thresh = VarianceThreshold(threshold=0.5)
# data = train_features.append(test_features)
data = pd.concat([train_features, test_features], axis = 0)
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

# 
train_features_transformed = data_transformed[ : train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0] : ]


train_features = pd.DataFrame(train_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                              columns=['sig_id','cp_type','cp_time','cp_dose'])

train_features = pd.concat([train_features, pd.DataFrame(train_features_transformed)], axis=1)


test_features = pd.DataFrame(test_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                             columns=['sig_id','cp_type','cp_time','cp_dose'])

test_features = pd.concat([test_features, pd.DataFrame(test_features_transformed)], axis=1)

train_features

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,0,1,2,3,4,5,...,912,913,914,915,916,917,918,919,920,921
0,id_0039a2ff9,trt_cp,48,D2,-0.2924,0.0985,-0.5631,-0.3963,0.1672,-0.8124,...,-1.286635,-0.807944,-0.254147,-0.769551,0.374694,-0.943341,-0.853739,-1.020810,1.268631,-0.508160
1,id_005c3cb48,trt_cp,48,D1,0.1088,-0.0945,-0.0345,-0.0853,-0.6953,0.0164,...,-0.700414,0.264050,0.467550,0.466465,0.321766,-0.096887,-0.585788,-0.442236,1.458439,-0.131257
2,id_00a6e782e,trt_cp,48,D1,-0.7053,-0.2772,1.0630,0.6516,0.0751,1.2170,...,0.014457,-0.824938,-0.280325,0.422267,0.833339,-0.207679,0.315848,0.091144,0.867342,0.119282
3,id_00cf304ae,trt_cp,24,D2,-0.9906,-0.0223,-0.7852,0.1550,0.0439,-0.2081,...,2.618133,-0.930276,0.359080,-0.518467,0.145251,-0.821268,-0.958771,0.592624,0.060834,0.903672
4,id_00f08ca12,trt_cp,48,D2,-0.0323,-0.8249,-0.2119,0.0735,0.0658,0.3392,...,0.191623,0.397117,-1.023402,-1.146402,0.945452,0.569545,0.278772,0.125129,-0.341397,0.228540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384,id_ff34140bb,trt_cp,48,D2,-0.0704,-1.1450,-0.0736,0.1689,-0.1952,0.9004,...,0.863094,0.530982,-0.325180,1.104800,-1.140337,-0.461557,-0.384257,-0.007216,0.205978,-0.096774
1385,id_ff678b430,trt_cp,24,D1,0.2191,-0.3436,0.7492,0.3498,2.8270,-1.0520,...,1.678875,0.809328,-1.036321,-0.532753,0.924138,0.311458,-0.897197,-0.087098,0.115241,0.024159
1386,id_ff96bcd4d,trt_cp,48,D1,-0.8801,-0.2296,-0.7150,0.5461,0.5096,-0.6187,...,-0.922724,0.663234,-0.067335,0.504786,-1.231724,0.497414,-0.095706,0.154999,-1.038228,0.024232
1387,id_ffa27d492,trt_cp,48,D2,0.3963,-0.3586,-0.7736,-0.7709,0.1802,-0.3832,...,-0.491864,0.492831,0.490924,-0.468890,0.250088,0.675397,0.743152,0.371395,-0.186513,-0.148246


# Binning

In [11]:
# for col in GENES:
#     train.loc[:, f'{col}_bin'] = pd.cut(train[col], bins=3, labels=False)
#     test.loc[:, f'{col}_bin'] = pd.cut(test[col], bins=3, labels=False)


# Distribution plots

In [12]:
# plt.figure(figsize=(16,16))
# sns.set_style("whitegrid")

# gene_choice = np.random.choice(len(GENES), 16)
# for i, col in enumerate(gene_choice):
#     plt.subplot(4, 4, i+1)
#     plt.hist(train_features.loc[:, GENES[col]],bins=100, color='orange')
#     plt.title(GENES[col])

In [13]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

train.drop(columns=['cp_type'], axis = 1, inplace=True)
test.drop(columns=['cp_type'], axis = 1, inplace=True)

target = train[train_targets_scored.columns]

# CV folds

In [14]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=5)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)
folds

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_00cf304ae,24,D2,-0.9906,-0.0223,-0.7852,0.1550,0.0439,-0.2081,0.0270,...,0,0,0,0,0,0,0,0,0,4
1,id_016f18b33,48,D1,-0.5924,-0.3747,-0.2318,-0.0164,-0.7001,0.2946,-1.4090,...,0,0,0,0,0,0,0,0,0,2
2,id_018fc6bfa,24,D1,-0.0788,0.2323,1.1880,0.1744,0.4090,0.0857,-0.2272,...,0,0,0,0,0,0,0,0,0,4
3,id_019a1ab73,48,D1,-0.8460,-1.1200,0.0532,1.3220,0.0357,1.6190,-0.5884,...,0,0,0,0,0,0,0,0,0,0
4,id_02973bcb5,48,D1,0.2723,0.9656,-0.6440,0.5406,1.3150,1.4570,0.5476,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
961,id_ff34140bb,48,D2,-0.0704,-1.1450,-0.0736,0.1689,-0.1952,0.9004,-1.3220,...,0,0,0,0,0,0,0,0,0,3
962,id_ff678b430,24,D1,0.2191,-0.3436,0.7492,0.3498,2.8270,-1.0520,0.2731,...,0,0,0,0,0,0,0,0,0,0
963,id_ff96bcd4d,48,D1,-0.8801,-0.2296,-0.7150,0.5461,0.5096,-0.6187,-0.4340,...,0,0,0,0,0,0,0,0,0,4
964,id_ffa27d492,48,D2,0.3963,-0.3586,-0.7736,-0.7709,0.1802,-0.3832,-0.1220,...,0,0,0,0,0,0,0,0,0,0


In [15]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(966, 1131)
(966, 1132)
(3624, 925)
(966, 207)
(3982, 207)


# Dataset Classes

In [16]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :].astype(np.float32), dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :].astype(np.float32), dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :].astype(np.float32), dtype=torch.float)
        }
        return dct
    

In [17]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds
   
    

# Model

In [18]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x

class AdvModel(nn.Module):
    def __init__(self, num_features, num_targets):
        super(AdvModel, self).__init__()
        self.hidden_size = [1500, 1250, 1000, 750]
        self.dropout_value = [0.5, 0.35, 0.3, 0.25]

        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, self.hidden_size[0])
        
        self.batch_norm2 = nn.BatchNorm1d(self.hidden_size[0])
        self.dropout2 = nn.Dropout(self.dropout_value[0])
        self.dense2 = nn.Linear(self.hidden_size[0], self.hidden_size[1])

        self.batch_norm3 = nn.BatchNorm1d(self.hidden_size[1])
        self.dropout3 = nn.Dropout(self.dropout_value[1])
        self.dense3 = nn.Linear(self.hidden_size[1], self.hidden_size[2])

        self.batch_norm4 = nn.BatchNorm1d(self.hidden_size[2])
        self.dropout4 = nn.Dropout(self.dropout_value[2])
        self.dense4 = nn.Linear(self.hidden_size[2], self.hidden_size[3])

        self.batch_norm5 = nn.BatchNorm1d(self.hidden_size[3])
        self.dropout5 = nn.Dropout(self.dropout_value[3])
        self.dense5 = nn.utils.weight_norm(nn.Linear(self.hidden_size[3], num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = F.leaky_relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = F.leaky_relu(self.dense3(x))

        x = self.batch_norm4(x)
        x = self.dropout4(x)
        x = F.leaky_relu(self.dense4(x))

        x = self.batch_norm5(x)
        x = self.dropout5(x)
        x = self.dense5(x)
        return x

# Preprocessing steps

In [19]:
def process_data(data):
    
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
#     data.loc[:, 'cp_time'] = data.loc[:, 'cp_time'].map({24: 0, 48: 1, 72: 2})
#     data.loc[:, 'cp_dose'] = data.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})

# --------------------- Normalize ---------------------
#     for col in GENES:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#     for col in CELLS:
#         data[col] = (data[col]-np.mean(data[col])) / (np.std(data[col]))
    
#--------------------- Removing Skewness ---------------------
#     for col in GENES + CELLS:
#         if(abs(data[col].skew()) > 0.75):
            
#             if(data[col].skew() < 0): # neg-skewness
#                 data[col] = data[col].max() - data[col] + 1
#                 data[col] = np.sqrt(data[col])
            
#             else:
#                 data[col] = np.sqrt(data[col])
    
    return data

In [20]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

927

In [21]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 2
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=1024


# Single fold training

In [22]:
def run_training(fold, seed, adv = False):
    
    seed_everything(seed)
    
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    if adv:
        model = AdvModel(
            num_features=num_features,
            num_targets=num_targets,
        )
    else:
        model = Model(
            num_features=num_features,
            num_targets=num_targets,
            hidden_size=hidden_size,
        )

    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    
    loss_fn = nn.BCEWithLogitsLoss()
    
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler, loss_fn, trainloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            if adv:
                torch.save(model.state_dict(), f"Adv_FOLD{fold}_.pth")
            else:
                torch.save(model.state_dict(), f"FOLD{fold}_.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    if adv:
        model = AdvModel(
            num_features=num_features,
            num_targets=num_targets,
        )
        model.load_state_dict(torch.load(f"Adv_FOLD{fold}_.pth"))
    else:
        model = Model(
            num_features=num_features,
            num_targets=num_targets,
            hidden_size=hidden_size,
        )
        model.load_state_dict(torch.load(f"FOLD{fold}_.pth"))

    model.to(DEVICE)
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions


In [29]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed, adv = True)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [30]:
# Averaging on multiple SEEDS

SEED = [0, 1, 2, 3 ,4, 5]
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

train[target_cols] = oof
test[target_cols] = predictions


TypeError: super(type, obj): obj must be an instance or subtype of type

In [64]:
# test['atp-sensitive_potassium_channel_antagonist'] = 0.0
# test['erbb2_inhibitor'] = 0.0

# train['atp-sensitive_potassium_channel_antagonist'] = 0.0
# train['erbb2_inhibitor'] = 0.0

In [65]:
train_targets_scored

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23810,id_fffb70c0c,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23811,id_fffc1c3f4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23812,id_fffcb9e7c,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
len(target_cols)


In [66]:
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)


y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)

CV log_loss:  0.014652433808298056


In [67]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)

In [117]:
# import torch
# import torch.nn as nn
# import torch.onnx as onnx

# # 定义模型
# class Model(nn.Module):
#     def __init__(self, num_features, num_targets, hidden_size):
#         super(Model, self).__init__()
#         self.batch_norm1 = nn.BatchNorm1d(num_features)
#         self.dropout1 = nn.Dropout(0.2)
#         self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

#         self.batch_norm2 = nn.BatchNorm1d(hidden_size)
#         self.dropout2 = nn.Dropout(0.5)
#         self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

#         self.batch_norm3 = nn.BatchNorm1d(hidden_size)
#         self.dropout3 = nn.Dropout(0.5)
#         self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

#     def forward(self, x):
#         x = self.batch_norm1(x)
#         x = self.dropout1(x)
#         x = torch.relu(self.dense1(x))

#         x = self.batch_norm2(x)
#         x = self.dropout2(x)
#         x = torch.relu(self.dense2(x))

#         x = self.batch_norm3(x)
#         x = self.dropout3(x)
#         x = self.dense3(x)

#         return x

# # 创建模型实例
# model = Model(num_features=num_features, num_targets=num_targets, hidden_size=hidden_size)

# # 创建一个示例输入
# x = torch.randn(21948, num_features)

# # 导出模型为ONNX格式
# torch.onnx.export(model,  # 导出的模型
#                   x,  # 输入数据
#                   "model.onnx",  # 导出的文件路径
#                   )  # 显示详细信息


verbose: False, log level: Level.ERROR



In [113]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.14.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: onnx
Successfully installed onnx-1.14.0


## Things that can improve your CV even further:
1. Increasing SEEDS
2. Feature Selection over GENES/CELLS columns
3. Model Hyperparameter Tuning
4. Removing Skewness from GENES/CELLS columns [Comment below if it helps]
5. PCA........................................[Comment below if it helps]
