# Apply CNN for timeseries data

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
input_dir = '../input/'
working_dir = '../working/'
output_dir = '../output/'

## Read data

In [3]:
sleeper_type_dict = {
    1: 'pc',
    2: 'wooden',
    3: 'junction',
    4: 'short',
    5: 'synthetic',
    6: 'synth_junc',
    7: 'symth_short',
    8: 'other'
}

In [4]:
def read_track(line_name='a'):
    track_df = pd.read_csv(os.path.join(input_dir, 'track_fillna_{}.csv'.format(line_name.upper())))
    col_names_track = ['date', 'kilo', 'lev_l', 'lev_r', 'cur_l', 'cur_r', 'cant', 'width', 'speed']
    track_df.columns = col_names_track
    track_df['date'] = pd.to_datetime(track_df['date'])
    track_df['kilo'] = track_df['kilo'].astype('str')
    print('track_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=track_df.shape))
    return track_df

In [5]:
def read_equ(line_name='a'):
    equ_df = pd.read_csv(os.path.join(input_dir, 'equipment_{}.csv'.format(line_name.upper())))
    col_names_equ = ['kilo', 'is_ballast', 'is_long', 'sleeper_type', 'is_bridge', 'is_crossing', 'gross_ton', 'radius', 'is_unreliable']
    equ_df.columns = col_names_equ
    equ_df['kilo'] = equ_df['kilo'].astype('str')
    equ_df['sleeper_type'] = equ_df['sleeper_type'].replace(sleeper_type_dict).astype('category')
    print('equ_{line_name} shape: {shape}'.format(line_name=line_name.upper(), shape=equ_df.shape))
    return equ_df

In [6]:
# degrade data types to save memory
def degrade_dtypes(df):
    for col in df.columns:
        if df[col].dtype=='int64':
            df[col] = df[col].astype('int32')
        if df[col].dtype=='float64':
            df[col] = df[col].astype('float32')
    return df

In [7]:
abcd = 'b'

In [8]:
track = read_track(abcd)
track = degrade_dtypes(track)
equ = read_equ(abcd)
equ = degrade_dtypes(equ)

track_B shape: (7815753, 9)
equ_B shape: (21531, 9)


In [9]:
# sample_submit = pd.read_csv(os.path.join(input_dir, 'sample_submit.csv'), header=None)
index_master = pd.read_csv(os.path.join(input_dir, 'index_master.csv'))
index_master.columns = ['id', 'line_name', 'date', 'kilo']
index_master['date'] = pd.to_datetime(index_master['date'])
index_master['kilo'] = index_master['kilo'].astype('str')

## とりあえず、21×21で切り取る。一行分作成。
- やはり目的変数はdiffにすべき（補修の影響が入ってしまうから）
- 横サイズは30くらいでいいのでは？　縦はこれから考える。
- まず、lev_lで異常値を除き計測誤差を除外、lev_l_diffで異常値を除いて補修時のデータを除外
- 誤差の最小値と最大値を使って、値を0~1に基準化
- 欠損値は0埋め
- 目的変数が欠損値の場合は学習させない（予測もしない）

In [36]:
side_len = 35 #must be odd
side_len_half = int((side_len-1)/2)

In [37]:
track_pv = track.pivot(index='date', columns='kilo', values='lev_l')

In [38]:
# normalize to max==1 and min=0
track_pv = track_pv + 21
track_pv = track_pv / 50

In [39]:
track_pv.head()

kilo,10000,10001,10002,10003,10004,10005,10006,10007,10008,10009,...,31521,31522,31523,31524,31525,31526,31527,31528,31529,31530
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-03,0.4244,0.3682,0.3058,0.28,0.3182,0.4088,0.5046,0.5508,0.5186,0.4222,...,0.4532,0.4104,0.3662,0.3414,0.3466,0.376,0.4122,0.4378,0.4446,0.4382
2017-04-04,0.4244,0.3682,0.3058,0.28,0.3182,0.4088,0.5046,0.5508,0.5186,0.4222,...,0.4532,0.4104,0.3662,0.3414,0.3466,0.376,0.4122,0.4378,0.4446,0.4382
2017-04-05,0.412,0.354,0.2974,0.2834,0.3336,0.43,0.522,0.5566,0.5112,0.4066,...,0.4562,0.4174,0.3734,0.3446,0.3434,0.368,0.404,0.4338,0.4466,0.4432
2017-04-06,0.412,0.368,0.3058,0.2832,0.3276,0.4238,0.519,0.556,0.51,0.4034,...,0.4562,0.4174,0.3734,0.3446,0.3434,0.368,0.404,0.4338,0.4466,0.4432
2017-04-07,0.4208,0.3662,0.3082,0.2878,0.3296,0.4202,0.5122,0.5528,0.5156,0.4168,...,0.4562,0.4174,0.3734,0.3446,0.3434,0.368,0.404,0.4338,0.4466,0.4432


In [40]:
track_pv_tgt = track_pv.iloc[side_len:, side_len_half:-side_len_half]

In [41]:
div_data = np.empty((track_pv_tgt.shape[1], 1, side_len, side_len), dtype='float32')
row_num = 0
for col_num in range(track_pv_tgt.shape[1]):
    div_tmp = track_pv.values[row_num:row_num+side_len, col_num:col_num+side_len]
    div_data[col_num] = div_tmp.reshape(1, side_len, side_len)

In [42]:
div_label = track_pv_tgt.values[0,:]

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from sklearn import model_selection

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device", device)

torch.manual_seed(0)

batch_size = 100
num_classes = 1
epochs = 2

device cpu


In [45]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(div_data, div_label, test_size=1/7, random_state=0)

# x_train = x_train.astype('double')
# x_test = x_test.astype('double')
# y_train = y_train.astype('float')
# y_test = y_test.astype('float')

In [46]:
ds_train = data.TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
ds_test  = data.TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

In [47]:
dataloader_train = data.DataLoader(dataset=ds_train, batch_size=batch_size, shuffle=True)
dataloader_test = data.DataLoader(dataset=ds_test, batch_size=batch_size, shuffle=False)

In [55]:
class CNNModel (nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
 
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
 
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
 
        self.fc1 = nn.Linear(64*17*17, 128)
        self.fc2 = nn.Linear(128, num_classes)
 
 
    def forward(self, x):
#         x = x.double()
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, (2, 2))
        x = self.dropout1(x)
 
        x = x.view(-1, 64*17*17)
 
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        return F.relu(self.fc2(x))

In [56]:
model = CNNModel().to(device)

In [57]:
print(model)

CNNModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (dropout1): Dropout2d(p=0.25)
  (dropout2): Dropout2d(p=0.5)
  (fc1): Linear(in_features=18496, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [58]:
criterion = nn.L1Loss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [59]:
global_step = 0
 
def train(epoch):
    model.train()
    steps = len(ds_train)//batch_size
    for step, (images, labels) in enumerate(dataloader_train, 1):
        global global_step
        global_step += 1
        images, labels = images.to(device), labels.to(device)
 
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
 
        if step % 50 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch, epochs, step, steps, loss.item()))

In [60]:
def eval(epoch):
    model.eval()
    abs_err_sum = 0
    total = 0
    with torch.no_grad():
        for (images, labels) in dataloader_test:
            images, labels = images.to(device), labels.to(device)
 
            outputs = model(images)
            outputs = outputs.reshape(-1)
            abs_err_sum += abs(outputs - labels).sum()
            total += labels.size(0)
    print("Val Acc : %.4f" % (abs_err_sum/total))

In [61]:
for epoch in range(1, epochs+1):
    train(epoch)#, writer)
    eval(epoch)#, writer)

Epoch [1/2], Step [50/184], Loss: 0.0394
Epoch [1/2], Step [100/184], Loss: 0.0290
Epoch [1/2], Step [150/184], Loss: 0.0321
Val Acc : 0.0282
Epoch [2/2], Step [50/184], Loss: 0.0347
Epoch [2/2], Step [100/184], Loss: 0.0291
Epoch [2/2], Step [150/184], Loss: 0.0287
Val Acc : 0.0286
