# Apply CNN for timeseries data

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
! mkdir input
! mkdir script
% cd /content/script

mkdir: cannot create directory ‘input’: File exists
mkdir: cannot create directory ‘script’: File exists
/content/script


In [0]:
# ブラウザでフォルダを表示するとURLのfolders以降がフォルダのIDになっている
# https://drive.google.com/drive/folders/<ID>
dir_id = "14lAz7cXcGKSDLezU0B6dS0xp3mBqQYC9"

# フォルダ内にあるファイル名とファイルIDを表示する
file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % dir_id}).GetList()
for f in file_list:
  print("name: " + f["title"] + ", id: " + f["id"])
  file_id = f["id"]
  drive_file = drive.CreateFile({'id': file_id})
  drive_file.GetContentFile("../input/" + f["title"])

name: track_D.csv, id: 1F1jshJfLG40Q4eePNwf256sadf9AeNpB
name: track_C.csv, id: 1tEADCCm8Mf520KKl0rfl8SCX-Hc49PhE
name: track_B.csv, id: 1TupxEDBcEU_Ox7Fbfiv5OX8kNJgMnZnH
name: track_A.csv, id: 1VUUPu7oc0IyBoLpt-rp-D9huAOnazb70
name: sample_submit.csv, id: 1XrqyvLtPDNzKa-pjRTfY6AIb4L1Eh5xw
name: index_master.csv, id: 1QALjnJHMT1SPgOSG299haDDWOe-Rf7qw
name: equipment_C.csv, id: 1KGmmL0psfK4yNrBTSwMQzztLt7OGbyDr
name: equipment_D.csv, id: 1ZpBD6O8XLQCjmt42EMU-Ys_1pBaLjWlc
name: equipment_B.csv, id: 198fkTzaulDBjh1u5Qc8gdRxe7Q6c6Azn
name: equipment_A.csv, id: 158PqPn45vuA1ZMCXx4GjNONh_dTfxahi


In [0]:
# # 上記の方法で調べたファイルのID
# file_id = "1F1jshJfLG40Q4eePNwf256sadf9AeNpB"
# drive_file = drive.CreateFile({'id': file_id})

# # ファイルの取得
# drive_file.GetContentFile("track_D.csv")

In [0]:
!pip3 install torch torchvision



In [0]:
import os
import pickle
import numpy as np
import pandas as pd

In [0]:
input_dir = '../input/'
working_dir = '../working/'
output_dir = '../output/'

## Read data

In [0]:
sleeper_type_dict = {
    1: 'pc',
    2: 'wooden',
    3: 'junction',
    4: 'short',
    5: 'synthetic',
    6: 'synth_junc',
    7: 'symth_short',
    8: 'other'
}

In [0]:
line_name = 'd'
track_df = pd.read_csv('track_{}.csv'.format(line_name.upper()))
col_names_track = ['date', 'kilo', 'lev_l', 'lev_r', 'cur_l', 'cur_r', 'cant', 'width', 'speed']
track_df.columns = col_names_track
track_df['date'] = pd.to_datetime(track_df['date'])
track_df['kilo'] = track_df['kilo'].astype('str')
print('track_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=track_df.shape))
track = track_df

track_D shape: (5601687, 9)


In [0]:
def read_track(line_name='a'):
    track_df = pd.read_csv(os.path.join(input_dir, 'track_{}.csv'.format(line_name.upper())))
    col_names_track = ['date', 'kilo', 'lev_l', 'lev_r', 'cur_l', 'cur_r', 'cant', 'width', 'speed']
    track_df.columns = col_names_track
    track_df['date'] = pd.to_datetime(track_df['date'])
    track_df['kilo'] = track_df['kilo'].astype('str')
    print('track_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=track_df.shape))
    return track_df

In [0]:
def read_equ(line_name='a'):
    equ_df = pd.read_csv(os.path.join(input_dir, 'equipment_{}.csv'.format(line_name.upper())))
    col_names_equ = ['kilo', 'is_ballast', 'is_long', 'sleeper_type', 'is_bridge', 'is_crossing', 'gross_ton', 'radius', 'is_unreliable']
    equ_df.columns = col_names_equ
    equ_df['kilo'] = equ_df['kilo'].astype('str')
    equ_df['sleeper_type'] = equ_df['sleeper_type'].replace(sleeper_type_dict).astype('category')
    print('equ_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=equ_df.shape))
    return equ_df

In [0]:
# degrade data types to save memory
def degrade_dtypes(df):
    for col in df.columns:
        if df[col].dtype=='int64':
            df[col] = df[col].astype('int32')
        if df[col].dtype=='float64':
            df[col] = df[col].astype('float32')
    return df

In [0]:
abcd = 'd'

In [0]:
track = read_track(abcd)
track = degrade_dtypes(track)
equ = read_equ(abcd)
equ = degrade_dtypes(equ)

track_D shape: (5601687, 9)
equ_D shape: (15691, 9)


In [0]:
# sample_submit = pd.read_csv(os.path.join(input_dir, 'sample_submit.csv'), header=None)
index_master = pd.read_csv(os.path.join(input_dir, 'index_master.csv'))
index_master.columns = ['id', 'line_name', 'date', 'kilo']
index_master['date'] = pd.to_datetime(index_master['date'])
index_master['kilo'] = index_master['kilo'].astype('str')

## とりあえず、21×21で切り取る。一行分作成。
- やはり目的変数はdiffにすべき（補修の影響が入ってしまうから）
- 横サイズは30くらいでいいのでは？　縦はこれから考える。
- まず、lev_lで異常値を除き計測誤差を除外、lev_l_diffで異常値を除いて補修時のデータを除外
- 誤差の最小値と最大値を使って、値を0~1に基準化
- 済）欠損値は0埋め
- 済）目的変数が欠損値の場合は学習させない（予測もしない）

In [0]:
track = track.query('date < "2017-05-01"')

In [0]:
def isnot_outlier(track_s, threshold=2.57):
  track_s_stndrzd = (track_s - track_s.mean()) / track_s.std()
  isnot_outlier_s = track_s_stndrzd.abs() < threshold
  return isnot_outlier_s

In [0]:
# 1. drop outliers in lev_l_diff
track_pv = track.pivot(index='date', columns='kilo', values='lev_l')
track_pv_diff = track_pv.diff()
track_diff = pd.melt(track_pv_diff.reset_index(), id_vars='date', value_name='lev_l_diff')
track_diff['isnot_outlier_diff'] = isnot_outlier(track_diff['lev_l_diff'])
track = track.merge(track_diff, how='left', on=['date', 'kilo'])
track['lev_l'] *= track['isnot_outlier_diff']
track['lev_l'] = track['lev_l'].replace(0, np.nan)
track['lev_l_diff'] *= track['isnot_outlier_diff']
track['lev_l_diff'] = track['lev_l_diff'].replace(0, np.nan)


# 2. drop outliers in lev_l
track['isnot_outlier'] = isnot_outlier(track['lev_l'])
track['lev_l'] = track['lev_l'] * track['isnot_outlier']
track['lev_l'] = track['lev_l'].replace(0.0, np.nan)
track['lev_l_diff'] = track['lev_l_diff'] * track['isnot_outlier']
track['lev_l_diff'] = track['lev_l_diff'].replace(0.0, np.nan)

In [0]:
# normalize to max==1 and min=0
track_pv_diff = track.pivot(index='date', columns='kilo', values='lev_l_diff')
track_pv_diff = (track_pv_diff - track['lev_l_diff'].min()) / (track['lev_l_diff'].max() - track['lev_l_diff'].min())

In [0]:
side_wid = 7
side_len = 21 #must be odd
if side_len%2==0:
  raise NameError('side_len must be odd.')
side_len_half = int((side_len-1)/2)

In [0]:
track_pv_diff_tgt = track_pv_diff.iloc[side_wid:, side_len_half:-side_len_half]

In [0]:
div_data = np.empty((track_pv_diff_tgt.size, 1, side_wid, side_len), dtype='float32')
div_label = np.empty((track_pv_diff_tgt.size), dtype='float32')
for row_num in range(track_pv_diff_tgt.shape[0]):
    for col_num in range(track_pv_diff_tgt.shape[1]):
        div_tmp = track_pv_diff.values[row_num:row_num+side_wid, col_num:col_num+side_len]
        div_data[row_num*track_pv_diff_tgt.shape[1]+col_num] = div_tmp.reshape(1, side_wid, side_len)
        div_label[row_num*track_pv_diff_tgt.shape[1]+col_num] = track_pv_diff_tgt.iloc[row_num, col_num]

In [0]:
# process NA
# data: fill na by 0
div_data[np.isnan(div_data)] = 0
# data-label set: if the label is na, drop the set.
div_data = div_data[~np.isnan(div_label)]
div_label = div_label[~np.isnan(div_label)]

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from sklearn import model_selection

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device", device)

torch.manual_seed(0)

batch_size = 100
num_classes = 1
epochs = 4

device cuda


In [0]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(div_data, div_label, test_size=1/7, random_state=0)

ds_train = data.TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
ds_test  = data.TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

In [0]:
dataloader_train = data.DataLoader(dataset=ds_train, batch_size=batch_size, shuffle=True)
dataloader_test = data.DataLoader(dataset=ds_test, batch_size=batch_size, shuffle=False)

In [0]:
class CNNModel (nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
 
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
 
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
 
        self.fc1 = nn.Linear(64*3*10, 128)
        self.fc2 = nn.Linear(128, num_classes)
 
 
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, (2, 2))
        x = self.dropout1(x)
        
        x = x.view(-1, 64*3*10)

        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return F.relu(self.fc2(x))

In [0]:
model = CNNModel().to(device)

In [0]:
print(model)

CNNModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (dropout1): Dropout2d(p=0.25)
  (dropout2): Dropout2d(p=0.5)
  (fc1): Linear(in_features=1920, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [0]:
criterion = nn.L1Loss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [0]:
global_step = 0
 
def train(epoch):
    model.train()
    steps = len(ds_train)//batch_size
    for step, (images, labels) in enumerate(dataloader_train, 1):
        global global_step
        global_step += 1
        images, labels = images.to(device), labels.to(device)
 
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
 
    if epoch % 2 == 0:
        print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch, epochs, step, steps, loss.item()))

In [0]:
def eval(epoch):
    model.eval()
    abs_err_sum = 0
    total = 0
    with torch.no_grad():
        for (images, labels) in dataloader_test:
            images, labels = images.to(device), labels.to(device)
 
            outputs = model(images)
            outputs = outputs.reshape(-1)
            abs_err_sum += abs(outputs - labels).sum()
            total += labels.size(0)
    print("Val Acc : %.4f" % (abs_err_sum/total))

In [0]:
for epoch in range(1, epochs+1):
    train(epoch)#, writer)
    eval(epoch)#, writer)

Val Acc : 0.0780
Epoch [2/4], Step [19484/19483], Loss: 0.0807
Val Acc : 0.0780
Val Acc : 0.0780
Epoch [4/4], Step [19484/19483], Loss: 0.0881
Val Acc : 0.0780
