In [101]:
!pip install optuna
!pip install pandas
!pip install tqdm
!pip install -Uqq fastai
!pip install torchinfo
!pip install torchviz



In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from fastai.data.core import DataLoaders
from fastai.learner import Learner
from fastai.callback.progress import ProgressCallback
from fastai.optimizer import OptimWrapper
from torch import optim
from torchinfo import summary
from torchviz import make_dot
from fastai.losses import MSELossFlat, L1LossFlat
from fastai.callback.schedule import Learner
from fastai.callback.tracker import EarlyStoppingCallback, ReduceLROnPlateau
from fastai.data.transforms import IndexSplitter
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import KFold, GroupKFold
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import random
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.signal import hilbert, chirp

from IPython.display import display

In [2]:
gd_flag = False

In [27]:
from google.colab import drive
drive.mount('/content/drive/')
import sys
sys.path.append('/content/drive/My Drive/GBrain_Ventilator')
gd_flag = True

ModuleNotFoundError: No module named 'google.colab'

In [3]:
if gd_flag:
    path_to_data = '/content/drive/My Drive/GBrain_Ventilator/'    
else:  
    path_to_data = 'e:\\Krivenko\\Kaggle\\2021\\New20211005\\'

In [4]:
DEBUG = True
TRAIN_MODEL = True
INFER_TEST = False
ONE_FOLD_ONLY = True
COMPUTE_LSTM_IMPORTANCE = True
MODEL_SUMMARY = False
OOF = True

In [5]:
train_name = 'train.csv'
test_name = 'test.csv'
train = pd.read_csv(path_to_data + train_name)
test = pd.read_csv(path_to_data + test_name)

pressure_values = np.sort( train.pressure.unique() )
submission = pd.read_csv(path_to_data + 'sample_submission.csv')

if DEBUG:
    train = train[:80*1000]
    test = test[:80*1000]

In [6]:
train.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


In [7]:
def add_features(df):
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()  
    
    df['u_in_lag1'] = df.groupby('breath_id')['u_in'].shift(1)
    df['u_out_lag1'] = df.groupby('breath_id')['u_out'].shift(1)
    df['u_in_lag_back1'] = df.groupby('breath_id')['u_in'].shift(-1)
    df['u_out_lag_back1'] = df.groupby('breath_id')['u_out'].shift(-1)
    df['u_in_lag2'] = df.groupby('breath_id')['u_in'].shift(2)
    df['u_out_lag2'] = df.groupby('breath_id')['u_out'].shift(2)
    df['u_in_lag_back2'] = df.groupby('breath_id')['u_in'].shift(-2)
    df['u_out_lag_back2'] = df.groupby('breath_id')['u_out'].shift(-2)
    df['u_in_lag3'] = df.groupby('breath_id')['u_in'].shift(3)
    df['u_out_lag3'] = df.groupby('breath_id')['u_out'].shift(3)
    df['u_in_lag_back3'] = df.groupby('breath_id')['u_in'].shift(-3)
    df['u_out_lag_back3'] = df.groupby('breath_id')['u_out'].shift(-3)
    df['u_in_lag4'] = df.groupby('breath_id')['u_in'].shift(4)
    df['u_out_lag4'] = df.groupby('breath_id')['u_out'].shift(4)
    df['u_in_lag_back4'] = df.groupby('breath_id')['u_in'].shift(-4)
    df['u_out_lag_back4'] = df.groupby('breath_id')['u_out'].shift(-4)
    df = df.fillna(0)
    
    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')
    
    df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
    df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
    df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']
    
    df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']
    
    df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']
    
    df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
    df['u_out_diff3'] = df['u_out'] - df['u_out_lag3']
    df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
    df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']
    df['cross']= df['u_in']*df['u_out']
    df['cross2']= df['time_step']*df['u_out']
    
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)
    df = pd.get_dummies(df)
    return df

train = add_features(train)
test = add_features(test)

print('Train dataframe shape',train.shape)
train.head()

Train dataframe shape (80000, 53)


Unnamed: 0,id,breath_id,time_step,u_in,u_out,pressure,area,u_in_cumsum,u_in_lag1,u_out_lag1,...,C_50,R__C_20__10,R__C_20__20,R__C_20__50,R__C_50__10,R__C_50__20,R__C_50__50,R__C_5__10,R__C_5__20,R__C_5__50
0,1,1,0.0,0.083334,0,5.837492,0.0,0.083334,0.0,0.0,...,1,0,0,1,0,0,0,0,0,0
1,2,1,0.033652,18.383041,0,5.907794,0.618632,18.466375,0.083334,0.0,...,1,0,0,1,0,0,0,0,0,0
2,3,1,0.067514,22.509278,0,7.876254,2.138333,40.975653,18.383041,0.0,...,1,0,0,1,0,0,0,0,0,0
3,4,1,0.101542,22.808822,0,11.742872,4.454391,63.784476,22.509278,0.0,...,1,0,0,1,0,0,0,0,0,0
4,5,1,0.135756,25.35585,0,12.234987,7.896588,89.140326,22.808822,0.0,...,1,0,0,1,0,0,0,0,0,0


In [8]:
targets = train[['pressure']].to_numpy().reshape(-1, 80)
#train.drop(['pressure','id', 'breath_id','one','count','breath_id_lag','breath_id_lag2','breath_id_lagsame','breath_id_lag2same','u_out_lag2'], axis=1, inplace=True)
#test = test.drop(['id', 'breath_id','one','count','breath_id_lag','breath_id_lag2','breath_id_lagsame','breath_id_lag2same','u_out_lag2'], axis=1)
train.drop(['pressure','id', 'breath_id'], axis=1, inplace=True)
test = test.drop(['id', 'breath_id'], axis=1)

In [9]:
targets.shape

(1000, 80)

In [10]:
COLS = list(train.columns)
print('Number of feature columns =', len(COLS) )

RS = RobustScaler()
train = RS.fit_transform(train)
test = RS.transform(test)

train = train.reshape(-1, 80, train.shape[-1])
test = test.reshape(-1, 80, train.shape[-1])

Number of feature columns = 50


In [11]:
idx = list(range(len(train)))

In [12]:
train.shape[0]

1000

In [13]:
class VentilatorDataset(Dataset):
    def __init__(self, data, target):
        self.data = torch.from_numpy(data).float()
        if target is not None:
            self.targets = torch.from_numpy(target).float()
                
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if hasattr(self, 'targets'): return self.data[idx], self.targets[idx]
        else: return self.data[idx]

In [14]:
class RNNModel(nn.Module):
    def __init__(self, input_size=train.shape[-1]):
        hidden = [400, 300, 200, 100]
        super().__init__()
        self.lstm1 = nn.LSTM(input_size, hidden[0],
                             batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(2 * hidden[0], hidden[1],
                             batch_first=True, bidirectional=True)
        self.lstm3 = nn.LSTM(2 * hidden[1], hidden[2],
                             batch_first=True, bidirectional=True)
        self.lstm4 = nn.LSTM(2 * hidden[2], hidden[3],
                             batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(2 * hidden[3], 50)
        self.selu = nn.SELU()
        self.fc2 = nn.Linear(50, 1)
        self._reinitialize()

    def _reinitialize(self):
        """
        Tensorflow/Keras-like initialization
        """
        for name, p in self.named_parameters():
            if 'lstm' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(p.data)
                elif 'bias_ih' in name:
                    p.data.fill_(0)
                    # Set forget-gate bias to 1
                    n = p.size(0)
                    p.data[(n // 4):(n // 2)].fill_(1)
                elif 'bias_hh' in name:
                    p.data.fill_(0)
            elif 'fc' in name:
                if 'weight' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'bias' in name:
                    p.data.fill_(0)

    def forward(self, x):
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)
        x, _ = self.lstm4(x)
        x = self.fc1(x)
        x = self.selu(x)
        x = self.fc2(x)

        return x

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
print(device)

cuda:0


In [None]:
EPOCH = 100
BATCH_SIZE = 128
NUM_FOLDS = 10


if 1:
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)    
    test_preds = []
    oof_preds = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
        
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
        X_train, X_valid = train[train_idx], train[test_idx]
        y_train, y_valid = targets[train_idx], targets[test_idx]
        
        checkpoint_filepath = path_to_data + f"folds{fold}.hdf5"
        
        if TRAIN_MODEL:
            train_dataset = VentilatorDataset(X_train, y_train)
            valid_dataset = VentilatorDataset(X_valid, y_valid)
            
            train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
            valid_loader = DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle=False)

            dls = DataLoaders(train_loader, valid_loader)
            model = RNNModel()
            
            learn = Learner(dls, model, loss_func=L1LossFlat())
            lrs  = learn.lr_find()
                        
            learn.fit_one_cycle(EPOCH, lr_max=lrs.valley, cbs=ReduceLROnPlateau(monitor='valid_loss', min_delta=0.5, patience=10))
            
            torch.save(model.state_dict(), checkpoint_filepath)
            
        else:                                   
            model = RNNModel()
            model.load_state_dict(torch.load(checkpoint_filepath))
            model = model.to(device)
        
        if MODEL_SUMMARY:
            print(summary(model, input_size=(train.shape[0], train.shape[1], train.shape[2])))
            #plot_model(model, to_file='Google_Brain_Keras_Model_vpp.png', show_shapes=True, show_layer_names=True)
            
    
        if OOF:
            print(' Predicting OOF data...')            
            oof_dataset = VentilatorDataset(X_valid, None)
            oof_loader = DataLoader(oof_dataset, batch_size = BATCH_SIZE, shuffle=False)
            preds = []
            with torch.no_grad():
                for data in oof_loader:                    
                    #pred = model(data.to('cpu')).squeeze(-1).flatten()
                    pred = model(data.to(device)).squeeze(-1)
                    preds.extend(pred.detach().cpu().numpy())
            oof = preds            
            baseline_mae = np.mean(np.abs( oof-y_valid ))
            oof_preds.append(baseline_mae)
            print('OOF MAE = {0}'.format(baseline_mae))
            print(' Done!')
                       
        if INFER_TEST:
            print(' Predicting test data...')
            test_dataset = VentilatorDataset(test, None)
            test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle=False)
            preds = []
            with torch.no_grad():
                for data in test_loader:
                    pred = model(data.to(device)).squeeze(-1).flatten()
                    preds.extend(pred.detach().cpu().numpy())            
            test_preds.append(np.array(preds))
            print(' Done!')
                    
        if COMPUTE_LSTM_IMPORTANCE:
            results = []
            print(' Computing LSTM feature importance...')
            
            # COMPUTE BASELINE (NO SHUFFLE)
            oof_dataset = VentilatorDataset(X_valid, None)
            oof_loader = DataLoader(oof_dataset, batch_size = BATCH_SIZE, shuffle=False)
            preds = []
            with torch.no_grad():
                for data in oof_loader:                    
                    #pred = model(data.to('cpu')).squeeze(-1).flatten()
                    pred = model(data.to(device)).squeeze(-1)                    
                    preds.extend(pred.detach().cpu().numpy())
            oof = preds
            baseline_mae = np.mean(np.abs( oof-y_valid ))            
            results.append({'feature':'BASELINE','mae':baseline_mae})
                        
            for k in tqdm(range(len(COLS))):
                
                # SHUFFLE FEATURE K
                save_col = X_valid[:,:,k].copy()
                np.random.shuffle(X_valid[:,:,k])
                
                # COMPUTE OOF MAE WITH FEATURE K SHUFFLED
                oof_dataset = VentilatorDataset(X_valid, None)
                oof_loader = DataLoader(oof_dataset, batch_size = BATCH_SIZE, shuffle=False)
                preds = []
                with torch.no_grad():
                    for data in oof_loader:                    
                        #pred = model(data.to('cpu')).squeeze(-1).flatten()
                        pred = model(data.to(device)).squeeze(-1)                        
                        preds.extend(pred.detach().cpu().numpy())
                oof = preds
                mae = np.mean(np.abs( oof-y_valid ))            
                results.append({'feature':COLS[k],'mae':mae})                               

                X_valid[:,:,k] = save_col
         
            # DISPLAY LSTM FEATURE IMPORTANCE
            print()
            df = pd.DataFrame(results)
            df = df.sort_values('mae')
            plt.figure(figsize=(10,20))
            plt.barh(np.arange(len(COLS)+1),df.mae)
            plt.yticks(np.arange(len(COLS)+1),df.feature.values)
            plt.title('LSTM Feature Importance',size=16)
            plt.ylim((-1,len(COLS)+1))
            plt.plot([baseline_mae,baseline_mae],[-1,len(COLS)+1], '--', color='orange',
                     label=f'Baseline OOF\nMAE={baseline_mae:.3f}')
            plt.xlabel(f'Fold {fold+1} OOF MAE with feature permuted',size=14)
            plt.ylabel('Feature',size=14)
            plt.legend()
            plt.show()
                               
            # SAVE LSTM FEATURE IMPORTANCE
            df = df.sort_values('mae',ascending=False)
            df.to_csv(f'lstm_feature_importance_fold_{fold+1}.csv',index=False)
                               
        # ONLY DO ONE FOLD
        if ONE_FOLD_ONLY: break

--------------- > Fold 1 < ---------------


  ax.plot(val, idx, 'ro', label=nm, c=color)


epoch,train_loss,valid_loss,time
0,10.113691,8.706188,00:03
1,8.432611,5.488194,00:03
2,7.344796,5.247456,00:03
3,6.60868,4.684301,00:03
4,5.989854,4.12608,00:03
5,5.482968,3.567059,00:03
6,5.05019,3.276822,00:03
7,4.64152,2.789437,00:03
8,4.21526,2.492665,00:03
9,3.873913,2.19683,00:03


Epoch 26: reducing lr to 0.0002086075961529598
Epoch 46: reducing lr to 0.00016805937110681384


In [120]:
if INFER_TEST:
    PRESSURE_MIN = pressure_values[0]
    PRESSURE_MAX = pressure_values[-1]
    PRESSURE_STEP = pressure_values[1] - pressure_values[0]

    # NAME POSTFIX
    postfix = ''
    if ONE_FOLD_ONLY: 
        NUM_FOLDS = 1
        postfix = '_fold_1'
        
    # ENSEMBLE FOLDS WITH MEAN
    submission["pressure"] = sum(test_preds)/NUM_FOLDS
    submission.to_csv(path_to_data + f'submission_mean{postfix}.csv', index=False)

    # ENSEMBLE FOLDS WITH MEDIAN
    submission["pressure"] = np.median(np.vstack(test_preds),axis=0)
    submission.to_csv(path_to_data + f'submission_median{postfix}.csv', index=False)

    # ENSEMBLE FOLDS WITH MEDIAN AND ROUND PREDICTIONS
    submission["pressure"] =\
        np.round( (submission.pressure - PRESSURE_MIN)/PRESSURE_STEP ) * PRESSURE_STEP + PRESSURE_MIN
    submission.pressure = np.clip(submission.pressure, PRESSURE_MIN, PRESSURE_MAX)
    submission.to_csv(path_to_data + f'submission_median_round{postfix}.csv', index=False)
    
    # DISPLAY SUBMISSION.CSV
    print(f'__submission{postfix}.csv head')
    display( submission.head() )

__submission_fold_1.csv head


Unnamed: 0,id,pressure
0,1,5.345377
1,2,6.189002
2,3,7.10293
3,4,8.08716
4,5,9.001088
