In [None]:
#https://www.kaggle.com/tfukuda675/simple-lstm

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GroupKFold, KFold

In [None]:
# Read the competition's train and test tables, then preview the last training rows.
train_df = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/train.csv")
test_df = pd.read_csv("/kaggle/input/ventilator-pressure-prediction/test.csv")
train_df.tail()

1つ目の制御入力(u_in)は，0〜100の連続変数で，空気を肺に入れるために吸気電磁弁を開く割合を表します（すなわち，0は完全に閉じて空気を入れず，100は完全に開きます）。2つ目の制御入力(u_out)は、空気を出すための呼気電磁弁が開いている（1）か閉じている（0）かを表す二値変数です。

R - 気道がどの程度制限されているかを示す肺の属性（単位：cmH2O/L/S）。物理的には、流量（時間当たりの空気量）の変化に対する圧力の変化です。直感的には、ストローで風船を膨らませるようなイメージです。ストローの直径を変えることでRを変化させることができ、Rが大きいほど吹きにくくなります。
C - 肺のコンプライアンス（膨らみやすさ）を示す肺の属性（単位：mL/cmH2O）。物理的には、圧力の変化に対する体積の変化を表します。直感的には、同じ風船の例を想像してください。風船のラテックスの厚さを変えることでCを変化させることができ、Cが大きいほどラテックスが薄くて吹きやすいということになります。

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the u_in column, with both axes clipped to a fixed window.
sns.displot(train_df["u_in"])
plt.xlim(left=0, right=50)
plt.ylim(bottom=0, top=10000)

In [None]:
# Distribution of the R column of the training table.
sns.displot(train_df["R"])

In [None]:
# Toggle for the optional time_step diagnostic cell guarded by `if show:`.
show=False

time_stepの直線性確認

In [None]:
if show:
    # Imported here so plotly is only required when the diagnostic is enabled.
    import plotly.graph_objects as go

    time_step_diff_limit=0.04

    # Consecutive time_step differences within each breath, computed in one
    # vectorised pass instead of a Python loop over every breath_id group.
    step_diff = train_df.groupby("breath_id")["time_step"].diff()
    non_linear_timestep_breath_ids = (
        train_df.loc[(step_diff > time_step_diff_limit).to_numpy(), "breath_id"]
        .unique()
        .tolist()
    )

    # Plot time_step against row position for every breath that has at least
    # one step larger than the limit.
    non_linear_timestep_df=train_df[train_df["breath_id"].isin(non_linear_timestep_breath_ids)]
    fig=go.Figure()
    for k, grp in non_linear_timestep_df.groupby("breath_id"):
        grp=grp.reset_index(drop=True)
        # Trace names are strings; convert the breath_id explicitly.
        fig.add_trace(go.Scatter(x=grp.index, y=grp["time_step"], mode="lines", name=str(k)))
    fig.show()

    sns.displot(train_df["C"])

データをきれいにしていくぜ！

In [None]:
def data_clean(df, time_step_diff_limit=0.04, max_u_out_open_steps=51):
    """Drop whole breaths that fail any of three sanity filters.

    A breath (all rows sharing one ``breath_id``) is removed when:

    1. any consecutive ``time_step`` difference inside the breath exceeds
       ``time_step_diff_limit``;
    2. any ``pressure`` value in the breath is negative;
    3. the number of rows with ``u_out == 1`` exceeds
       ``max_u_out_open_steps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``pressure`` and ``u_out``.
    time_step_diff_limit : float, default 0.04
        Largest allowed gap between consecutive ``time_step`` values.
    max_u_out_open_steps : int, default 51
        Largest allowed count of ``u_out == 1`` rows per breath.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` belonging to the surviving breaths. The input
        frame itself is not modified.
    """
    # 1) Breaths with an irregular time axis. diff() is taken per breath so
    #    the first row of each breath is NaN and never trips the threshold.
    step_diff = df.groupby("breath_id")["time_step"].diff()
    irregular_ids = df.loc[(step_diff > time_step_diff_limit).to_numpy(), "breath_id"].unique()
    df = df[~df["breath_id"].isin(irregular_ids)]

    # 2) Breaths containing at least one negative pressure reading.
    negative_ids = df.loc[df["pressure"] < 0, "breath_id"].unique()
    df = df[~df["breath_id"].isin(negative_ids)]

    # 3) Breaths whose u_out == 1 phase is too long. Counting on the
    #    function's own argument (not a notebook-level global) keeps the
    #    result independent of kernel state; breaths with no u_out == 1 rows
    #    simply do not appear in the counts and are kept.
    open_counts = df.loc[df["u_out"] == 1, "breath_id"].value_counts()
    too_long_ids = open_counts.index[open_counts > max_u_out_open_steps]
    df = df[~df["breath_id"].isin(too_long_ids)]

    return df

In [None]:
def RC_type(df):
    """Cast the ``R`` and ``C`` columns of ``df`` to ``str``.

    The columns are overwritten on the frame that was passed in, and that
    same frame object is returned.
    """
    for column in ("R", "C"):
        df[column] = df[column].astype(str)
    return df

In [None]:
# Cast R/C to strings (RC_type overwrites the columns on train_df itself),
# then drop the breaths rejected by data_clean and preview the result.
train = data_clean(RC_type(train_df))
train.tail()

In [None]:
# Target is the pressure column; features are whatever remains after the
# identifier columns, the target and u_out are removed.
y = train["pressure"]
X = train.drop(columns=["id", "breath_id", "pressure", "u_out"])
X

In [None]:
# Do NOT run data_clean on the test set.
test = RC_type(test_df).drop(columns=["id", "breath_id", "u_out"])
test

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
test = scaler.transform(test)

データ形状をtensorflow向けに変更

In [None]:
# Regroup the flat rows into blocks of 80 consecutive rows:
# (rows, features) -> (rows / 80, 80, features).
n_features = X.shape[-1]
X = X.reshape(-1, 80, n_features)
test = test.reshape(-1, 80, n_features)

In [None]:
# Report the reshaped feature arrays' dimensions.
print(f"X shape :{X.shape}")
print(f"test shape :{test.shape}")

In [None]:
# Regroup the target into rows of 80 values, matching the 80-step blocks of X.
y = np.reshape(y.to_numpy(), (-1, 80))

In [None]:
# Report the reshaped target's dimensions.
print(f"y shape :{y.shape}")

In [None]:
def create_model3():
    """Build (uncompiled) the first sequence-regression network.

    The input shape is taken from the last two dimensions of the notebook-level
    array ``X``. The stack is two bidirectional LSTM layers (256 units per
    direction, full sequences returned), a 20-unit ReLU dense layer, and a
    time-distributed single-unit ReLU dense layer that emits one value per
    time step.
    """
    layers = tf.keras.layers
    stack = [
        layers.Input(shape=X.shape[-2:]),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
        layers.Dense(20, activation="relu"),
        layers.TimeDistributed(layers.Dense(1, activation="relu")),
    ]
    return tf.keras.models.Sequential(stack)

In [None]:
# K-fold cross-validation of create_model3. Each fold trains a fresh model
# with early stopping on the validation loss and contributes one set of test
# predictions; `history` ends up holding the last fold's training history.
fold, batch_size, epochs = 5, 1024, 100
kf = KFold(n_splits=fold, shuffle=True, random_state=42)

test_preds = []
for n_fold, (train_id, val_id) in enumerate(kf.split(X, y)):
    train_x, val_x = X[train_id], X[val_id]
    train_y, val_y = y[train_id], y[val_id]

    model = create_model3()
    model.compile(optimizer="adam", loss="mae")

    # Stop once val_loss has not improved for 15 epochs and roll back to the
    # best weights seen.
    es = EarlyStopping(
        monitor="val_loss",
        patience=15,
        verbose=1,
        mode="min",
        restore_best_weights=True,
    )

    history = model.fit(
        train_x,
        train_y,
        validation_data=(val_x, val_y),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[es],
    )
    # To persist each fold's model: model.save(f'model_save_fold{n_fold+1}')
    test_preds.append(np.squeeze(model.predict(test)))

In [None]:
import matplotlib.pyplot as plt
# Training/validation loss curves for the most recent fit stored in `history`.
# The x-axis is derived from the recorded history rather than from `epochs`:
# when early stopping ends training before `epochs` epochs, the history is
# shorter and plotting it against range(epochs) raises a length-mismatch error.
train_loss_curve = history.history['loss']
val_loss_curve = history.history['val_loss']
epochs_run = range(len(train_loss_curve))
plt.plot(epochs_run, train_loss_curve, color='b', label="training loss")
plt.plot(epochs_run, val_loss_curve, color='r', label="validation loss")

legend=plt.legend(loc="best", shadow=True)

In [None]:
# Load the sample submission file; its "pressure" column is filled in later.
submission=pd.read_csv("/kaggle/input/ventilator-pressure-prediction/sample_submission.csv")

In [None]:
# Element-wise sum of the per-fold test predictions (not yet divided by the
# number of folds).
preds = np.asarray(test_preds).sum(axis=0)
print(preds.shape)

In [None]:
# Flatten the summed predictions to a single axis.
preds = np.squeeze(preds.reshape(-1, 1))
print(preds.shape)

In [None]:
def create_model4():
    """Build (uncompiled) the second sequence-regression network.

    The input shape is taken from the last two dimensions of the notebook-level
    array ``X``. The stack is two bidirectional LSTM layers (256 units per
    direction, full sequences returned), a 50-unit SELU dense layer, a
    time-distributed 3-unit ReLU dense layer, and a final single-unit ReLU
    dense layer.
    """
    layers = tf.keras.layers
    stack = [
        layers.Input(shape=X.shape[-2:]),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
        layers.Dense(50, activation="selu"),
        layers.TimeDistributed(layers.Dense(3, activation="relu")),
        layers.Dense(1, activation="relu"),
    ]
    return tf.keras.models.Sequential(stack)

In [None]:
# K-fold cross-validation of create_model4. Each fold trains a fresh model
# with early stopping on the validation loss and contributes one set of test
# predictions; `history1` ends up holding the last fold's training history.
fold, batch_size, epochs = 5, 1024, 100
kf = KFold(n_splits=fold, shuffle=True, random_state=42)

test_preds1 = []
for n_fold, (train_id, val_id) in enumerate(kf.split(X, y)):
    train_x, val_x = X[train_id], X[val_id]
    train_y, val_y = y[train_id], y[val_id]

    model = create_model4()
    model.compile(optimizer="adam", loss="mae")

    # Stop once val_loss has not improved for 15 epochs and roll back to the
    # best weights seen.
    es = EarlyStopping(
        monitor="val_loss",
        patience=15,
        verbose=1,
        mode="min",
        restore_best_weights=True,
    )

    history1 = model.fit(
        train_x,
        train_y,
        validation_data=(val_x, val_y),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[es],
    )
    # To persist each fold's model: model.save(f'model_save_fold{n_fold+1}')
    test_preds1.append(np.squeeze(model.predict(test)))

In [None]:
# Training/validation loss curves for the most recent fit stored in `history1`.
# The x-axis is derived from the recorded history rather than from `epochs`:
# when early stopping ends training before `epochs` epochs, the history is
# shorter and plotting it against range(epochs) raises a length-mismatch error.
train_loss_curve1 = history1.history['loss']
val_loss_curve1 = history1.history['val_loss']
epochs_run1 = range(len(train_loss_curve1))
plt.plot(epochs_run1, train_loss_curve1, color='b', label="training loss")
plt.plot(epochs_run1, val_loss_curve1, color='r', label="validation loss")

legend=plt.legend(loc="best", shadow=True)

In [None]:
# Element-wise sum of the second model's per-fold test predictions (not yet
# divided by the number of folds).
preds1 = np.asarray(test_preds1).sum(axis=0)
print(preds1.shape)

In [None]:
# Flatten the summed predictions to a single axis.
preds1 = np.squeeze(preds1.reshape(-1, 1))
print(preds1.shape)

In [None]:
# Blend the two models. preds / preds1 are sums over the CV folds, so dividing
# by `fold` gives each model's mean prediction. The blend weights must sum to
# 1; the earlier 0.55 + 0.55 weighting scaled every prediction up by 10%.
submission["pressure"]=(preds/fold)*0.5+(preds1/fold)*0.5

In [None]:
# Write the submission table to CSV in the current directory, without the index column.
submission.to_csv("submission#4-2.csv", index=False)

In [None]:
# Preview the first rows of the submission table.
submission.head()