In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e12/sample_submission.csv
/kaggle/input/playground-series-s3e12/train.csv
/kaggle/input/playground-series-s3e12/test.csv


## NN using torch

- The public leaderboard (PB) score from this code is 0.89333.
- I hope this code helps someone...!

In [2]:
# Load the competition's training split (has a `target` column) and test split (no `target`).
train = pd.read_csv('/kaggle/input/playground-series-s3e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e12/test.csv')

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [4]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc
0,414,1.017,5.24,345,11.5,152,1.16
1,415,1.02,5.68,874,29.0,385,3.46
2,416,1.024,5.36,698,19.5,354,13.0
3,417,1.02,5.33,668,25.3,252,3.46
4,418,1.011,5.87,567,29.0,457,2.36


### I made a NN Model using torch.

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset

In [6]:
# Names of the raw feature columns: everything between the first column (`id`)
# and the last column (`target`) of the training frame.
features = list(train.columns[1:-1])

### I add only one feature: the "coefficient of variation".

- It is used to compare data with different units of measurement.
- In other words, absolute measures of spread such as the range or the variance are not enough on their own, so the relative spread should be compared instead.
- The larger the value of the coefficient of variation, the larger the relative difference.
- Formula: std / mean, computed here for each row across its raw feature columns.

In [7]:
# Add the per-row coefficient of variation (row std / row mean over the raw
# feature columns) to both splits, using the same computation for each.
for _frame in (train, test):
    _raw = _frame[features]
    _frame['feature_cv'] = _raw.std(axis = 1) / _raw.mean(axis = 1)

In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

### Then, I used MinMaxScaler for scaling data.

In [9]:
# Scaler instance; it is fitted on the training features and then reused to
# transform the test features further down.
mms = MinMaxScaler()

### This task is classification, so I used StratifiedKFold.

In [10]:
# 5-fold stratified splitter; shuffling with a fixed random_state makes the
# fold assignment repeatable between runs.
skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

In [11]:
# Model inputs: every training column except the row id and the label.
X = train.drop(columns = ['id', 'target'])
# NOTE: despite its name, `target` holds the test-set *features* (same columns,
# same order as X) -- it does not contain labels.
target = test[X.columns]

In [12]:
# Fit the scaler on the training features and scale them; X is rebound from a
# DataFrame to the array returned by the scaler.
# NOTE(review): the scaler is fitted on the full training set before the
# cross-validation split, so each fold's validation rows contribute to the
# min/max used to scale that fold's training rows.
X = mms.fit_transform(X)
# Labels as a plain array, aligned row-for-row with X.
y = train['target'].values

# Scale the test features with the statistics learned from the training set.
target = mms.transform(target)

In [13]:
class CustomData(Dataset) :
    """In-memory dataset pairing each float feature row with a float target.

    Targets are taken from ``Y`` only when ``is_test`` compares equal to
    ``False``. For any other value (including the default ``None``) ``Y`` is
    ignored and every target is a 0.0 placeholder, one per row of ``X``.
    """

    def __init__(self, X, Y = None, is_test = None) :
        # Features are built the same way for labelled and unlabelled data.
        self.features = torch.tensor(X, dtype = torch.float)

        has_labels = (is_test == False)
        if has_labels :
            self.target = torch.tensor(Y, dtype = torch.float)
        else :
            self.target = torch.zeros(X.shape[0], dtype = torch.float)

    def __len__(self) :
        """Number of rows in the dataset."""
        return len(self.features)

    def __getitem__(self, idx) :
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.features[idx], self.target[idx]

## Network

In [14]:
class NN(nn.Module) :
    """Small fully connected binary classifier.

    Architecture: ``in_features -> 16 -> 8 -> 1`` with LeakyReLU between the
    linear layers and a final Sigmoid, so the output is a probability in
    (0, 1) with shape ``(batch, 1)``.

    Parameters
    ----------
    in_features : int, default 7
        Width of each input row. The default matches the 7 columns this
        notebook feeds in (6 raw features plus ``feature_cv``); pass another
        value if the feature set changes.
    """

    def __init__(self, in_features = 7) :

        super().__init__()
        self.classifier = nn.Sequential(
                                        nn.Linear(in_features, 16),
                                        nn.LeakyReLU(),
                                        nn.Linear(16, 8),
                                        nn.LeakyReLU(),
                                        nn.Linear(8, 1),
                                        nn.Sigmoid()
                                        )

    def forward(self, x) :
        """Return the sigmoid output of the classifier for input batch ``x``."""
        return self.classifier(x)

In [15]:
def train_validation(model, optimizer, criterion, num_epochs, tr_dl, val_dl) :
    """Train ``model`` and print the validation ROC AUC after every epoch.

    Parameters
    ----------
    model : module called as ``model(xx)``; its output is flattened to one
        value per input row before being passed to ``criterion``.
    optimizer : optimizer over ``model``'s parameters.
    criterion : loss called as ``criterion(pred, yy)`` with 1-D ``pred``.
    num_epochs : number of passes over ``tr_dl``.
    tr_dl, val_dl : iterables yielding ``(features, target)`` batches.

    Returns
    -------
    The same ``model`` object, in the state reached after the final epoch.
    No best-epoch selection or checkpointing is performed.
    """

    for epoch in range(num_epochs) :

        model.train()

        for xx, yy in tr_dl :

            optimizer.zero_grad()

            # Flatten to one prediction per row using the actual batch length,
            # so any batch size (including a shorter final batch) works.
            pred = model(xx).reshape(len(xx))

            loss = criterion(pred, yy)
            loss.backward()

            optimizer.step()

        model.eval()
        val_prob = []
        y_actuals = []

        with torch.no_grad() :

            for xx, yy in val_dl :

                # Collect whole batches; they are concatenated once below.
                val_prob.append(model(xx).reshape(len(xx)))
                y_actuals.append(yy.reshape(-1))

        y_actuals = torch.cat(y_actuals).tolist()
        val_prob = torch.cat(val_prob).tolist()

        fpr, tpr, _ = roc_curve(y_actuals, val_prob)
        epoch_auc = auc(fpr, tpr)
        print(f'{epoch + 1} Epoch val auc : {epoch_auc}')

    return model

In [16]:
def get_test_prediction(te_dl, model, n) :
    """Predict every batch of ``te_dl`` and return the predictions divided by ``n``.

    Dividing by ``n`` lets the caller add up the results of ``n`` models to
    obtain their mean prediction.

    Parameters
    ----------
    te_dl : iterable yielding ``(features, target)`` batches; the target part
        is ignored.
    model : module called as ``model(xx)``; switched to eval mode here.
    n : divisor applied to every prediction.

    Returns
    -------
    Tensor of shape ``(num_rows, 1)``, where ``num_rows`` is the total number
    of rows seen in ``te_dl`` (``(0, 1)`` if the loader yields nothing).
    """

    batch_preds = []
    model.eval()
    with torch.no_grad() :

        for xx, _ in te_dl :

            # One column per row, whatever the batch length is.
            batch_preds.append(model(xx).reshape(-1, 1))

    if not batch_preds :
        return torch.zeros(0, 1)

    # Size comes from the data itself instead of a fixed row count.
    return torch.cat(batch_preds) / n

In [17]:
# Accumulator for the test predictions: one row per test sample, a single
# column, starting at zero. The cross-validation loop adds each fold's share.
nn_pred = torch.zeros(test.shape[0], 1)

In [18]:
# Seed torch so weight initialisation and batch shuffling repeat between runs.
torch.manual_seed(42)

# Reset the accumulator here so that re-running this cell does not add a new
# set of fold predictions on top of a previous run's.
nn_pred = torch.zeros(test.shape[0], 1)

# The test dataset and loader do not depend on the fold, so build them once.
te_data = CustomData(target, is_test = True)
te_dl = DataLoader(te_data, batch_size = 16, shuffle = False, drop_last = False)

for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :

    tr_x, tr_y = X[tr_idx], y[tr_idx]
    val_x, val_y = X[val_idx], y[val_idx]

    tr_data = CustomData(tr_x, tr_y, is_test = False)
    val_data = CustomData(val_x, val_y, is_test = False)

    # drop_last = True keeps every training batch at exactly 16 rows.
    tr_dl = DataLoader(tr_data, batch_size = 16, shuffle = True, drop_last = True)
    val_dl = DataLoader(val_data, batch_size = 16, shuffle = False, drop_last = False)

    # A fresh model, optimizer and loss for every fold.
    model = NN()
    optimizer = optim.Adam(model.parameters(), lr = 0.0003)
    criterion = nn.BCELoss()

    print(f'{i + 1} Fold...!')
    # Despite the name, this is the model as it stands after the last epoch.
    best_model = train_validation(model, optimizer, criterion, 10, tr_dl, val_dl)
    # Each fold contributes prediction / n_splits, so the running sum ends up
    # as the mean prediction over all folds.
    nn_pred += get_test_prediction(te_dl, best_model, skf.n_splits)
    print('\n')

1 Fold...!
1 Epoch val auc : 0.7555816686251469
2 Epoch val auc : 0.7485311398354877
3 Epoch val auc : 0.7420681551116334
4 Epoch val auc : 0.7367802585193889
5 Epoch val auc : 0.7332549941245593
6 Epoch val auc : 0.7320799059929495
7 Epoch val auc : 0.7291421856639247
8 Epoch val auc : 0.7220916568742656
9 Epoch val auc : 0.7232667450058754
10 Epoch val auc : 0.7220916568742656


2 Fold...!
1 Epoch val auc : 0.5487661574618097
2 Epoch val auc : 0.6609870740305523
3 Epoch val auc : 0.7379553466509988
4 Epoch val auc : 0.7608695652173912
5 Epoch val auc : 0.7749706227967099
6 Epoch val auc : 0.7679200940070506
7 Epoch val auc : 0.7702702702702704
8 Epoch val auc : 0.7720329024676851
9 Epoch val auc : 0.7720329024676851
10 Epoch val auc : 0.7696827262044653


3 Fold...!
1 Epoch val auc : 0.5299647473560517
2 Epoch val auc : 0.5564042303172738
3 Epoch val auc : 0.5669800235017626
4 Epoch val auc : 0.582256169212691
5 Epoch val auc : 0.6063454759106933
6 Epoch val auc : 0.6263219741480611


In [19]:
# Load the sample submission, which supplies the `id` column and a `target`
# column to overwrite.
submission = pd.read_csv('/kaggle/input/playground-series-s3e12/sample_submission.csv')

In [20]:
# nn_pred is an (n, 1) torch tensor; flatten it to a 1-D NumPy array so the
# column assignment does not rely on pandas coercing a 2-D tensor implicitly.
submission['target'] = nn_pred.reshape(-1).numpy()

In [21]:
# Display the submission frame (pandas shows a truncated view of long frames).
submission

Unnamed: 0,id,target
0,414,0.457917
1,415,0.465908
2,416,0.472823
3,417,0.465108
4,418,0.460261
...,...,...
271,685,0.469608
272,686,0.458619
273,687,0.468014
274,688,0.461803
