In [100]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.optim import Adam
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import gym
import random
from time import sleep

%matplotlib inline

# Runtime configuration: compute device, working directory and a date stamp.
ngpu = 1  # number of GPUs to use; 0 forces CPU execution
if torch.cuda.is_available() and ngpu > 0:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
current_path = os.getcwd()  # directory the notebook is executed from
today = datetime.today().strftime("%y%m%d")  # two-digit year, month, day
print("Today: ", today, "\nCurrent path: ", current_path)

Today:  230310 
Current path:  c:\Joko\Project\Bigdata\lstm_pilot


In [17]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), then configures the cuDNN backend flags.

    Args:
        seed: Integer seed applied to each generator.
    """
    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators: CPU first, then every CUDA device.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN: turn off benchmark autotuning and request deterministic mode.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed = 28947
set_seed(seed)

"""
- Progress of code

1. prepare data for training
2. define lstm network architecture
3. specify training options
4. train neural network
5. test network
6. forecast future time steps
   (1) open-loop forecasting
   (2) closed-loop forecasting

"""

In [23]:
"""
Pilot test 하려고 원래 데이터 source에서 data 폴더로 데이터 옮기는 코드 (랜덤하게 넣기위해)
"""
np.random.seed(seed) # 왠진 모르겠는데 seed가 안먹힘. 그냥하자 일단. 이거 할 때 여러번 누르면 데이터 갯수가 늘어나니깐 조심
source_lst = ["Train", "Valid", "Test"]
# source = source_lst[0]
for source in source_lst:
    source_path = current_path + '/source/' + source + '/'
    saved_path = current_path + '/data/' + source + '/'

    file_lst = os.listdir(source_path)
    sample_lst = random.sample(file_lst, int(len(file_lst)/10))

    for sample in sample_lst:
        csv = pd.read_csv(source_path + sample)
        df = pd.DataFrame(csv)
        df.to_csv(saved_path + sample)

현재 데이터셋
Train: 188개
Valid: 39개
Test: 40개

데이터는 하던 Delaware data에서 10%씩 빼온 거임.

In [None]:
# Add the cumulative gas column to `df`. `df` is not defined in this cell; it
# is whatever DataFrame an earlier cell left in scope.
gas_rate_lst = df['Gas_rate'].values
# np.cumsum computes the same running sum as an element-by-element loop, but
# in a single pass instead of re-allocating the array with np.append on every
# step. dtype=float64 matches the array the np.append-based loop produced.
cumgas_rate_lst = np.cumsum(gas_rate_lst, dtype=np.float64)
df['Cumgas_rate'] = cumgas_rate_lst

In [97]:
"""
Data normalization part
Train data를 모두 불러와서 standard normalization을 수행
"""
train_path = current_path + '/data/Train/'
train_lst = os.listdir(train_path)
valid_path = current_path + '/data/Valid/'
valid_lst = os.listdir(valid_path)

input_features = ['Gas_rate', 'Cumgas_rate', 'TotalProdMonths', 'ShutinMonths', 'Refrac']
chcek_scaling_point = 3 # input_features 중에서 어디까지 normalization할 것인지 수기로 정해줘야함. (파이썬 인덱스 생각말고, 그냥 그 위치를 넣어주면 됨)

scaler = StandardScaler() ## ShutinMonths, Refrac은 Scaling 하지 않음. 이미 0 or 1인 데이터임. 나머지는 standard_normalization

df_tot_tmp = pd.DataFrame()
df_tot = pd.DataFrame(columns=input_features)
for file in train_lst:
    df_tmp = pd.read_csv(train_path + file)

    gas_rate_lst = df_tmp['Gas_rate'].values ############### 전처리를 잘못해서 cumgas rate 생성 코드가 중간에 들어가야함
    cumgas_rate_lst = []
    for idx, gas_rate in enumerate(gas_rate_lst):
        if idx == 0:
            cumgas_rate_lst = np.append(cumgas_rate_lst, gas_rate)
        else:
            cumgas = cumgas_rate_lst[idx-1] + gas_rate
            cumgas_rate_lst = np.append(cumgas_rate_lst, cumgas)
    df_tmp['Cumgas_rate'] = cumgas_rate_lst ############### 전처리를 잘못해서 cumgas rate 생성 코드가 중간에 들어가야함

    df_tot_tmp = pd.concat([df_tot_tmp, df_tmp])

df_tot['Gas_rate'] = df_tot_tmp['Gas_rate']
for x in input_features:
    df_tot[x] = df_tot_tmp[x] ## 이게 없으면 열 순서가 자동으로 바뀜

scaler.fit(df_tot.values[:, :chcek_scaling_point]) ############ 조금 많이 헷갈릴 수 있는데, scaling 자체를 안하는 애들이 있어서 했다가 더하는 식으로 데이터를 구성함

In [47]:
def sliding_windows_mutli_features(data, seq_length):
    """Cut a 2-D series into overlapping input windows and next-step targets.

    Window ``i`` is ``data[i:i + seq_length, :]`` and its target is
    ``data[i + seq_length, 0]``, i.e. column 0 holds the label.

    Args:
        data: 2-D array indexed as (row, feature).
        seq_length: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(x, y)`` where ``x`` has shape
        ``(n_windows, seq_length, n_features)`` and ``y`` has shape
        ``(n_windows, 1)``, with ``n_windows = max(len(data) - seq_length, 0)``.
        When there are no windows, both arrays are empty but keep those
        shapes, so results from short series can still be concatenated.
    """
    data = np.asarray(data)
    n_windows = max(data.shape[0] - seq_length, 0)
    if n_windows == 0:
        # Series too short for a single window: return correctly shaped empties
        # instead of a flat shape-(0,) array.
        empty_x = np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0, 1), dtype=data.dtype)
        return empty_x, empty_y

    # For the cascading variant (production column and operating columns taken
    # at different time steps), the operating columns would instead be sliced
    # one step later: data[i + 1:i + seq_length + 1, 1:].
    x = np.stack([data[start:start + seq_length, :] for start in range(n_windows)])
    # Targets are the label column shifted by seq_length; copy so the result
    # does not alias the caller's array.
    y = data[seq_length:, 0].reshape(-1, 1).copy()
    return x, y

In [101]:
seq_length = 3


def _build_windows(dir_path, file_lst):
    """Turn every CSV in ``file_lst`` into scaled sliding-window samples.

    For each file the cumulative gas column is derived, the first
    ``chcek_scaling_point`` feature columns are transformed with the already
    fitted ``scaler``, the remaining columns are appended unscaled, and the
    result is cut into windows of ``seq_length`` rows.

    Args:
        dir_path: Folder path (with trailing separator) containing the files.
        file_lst: File names inside ``dir_path``.

    Returns:
        Tuple ``(X, Y)`` with shapes ``(n_samples, seq_length, n_features)``
        and ``(n_samples, 1)``.
    """
    n_features = len(input_features)
    x_parts, y_parts = [], []
    for file in file_lst:
        df = pd.read_csv(dir_path + file)
        # The stored files do not contain the cumulative column (a
        # preprocessing omission noted by the author), so derive it here.
        df['Cumgas_rate'] = np.cumsum(df['Gas_rate'].values, dtype=np.float64)

        data = df[input_features].values
        data_need_scaling = scaler.transform(data[:, :chcek_scaling_point])
        data_no_need_scaling = data[:, chcek_scaling_point:]
        data_scaled = np.concatenate((data_need_scaling, data_no_need_scaling), axis=1)

        x, y = sliding_windows_mutli_features(data_scaled, seq_length)
        # Reshape per file so that files too short to yield any window still
        # contribute an array with compatible dimensions.
        x_parts.append(np.reshape(x, (-1, seq_length, n_features)))
        y_parts.append(np.reshape(y, (-1, 1)))

    if not x_parts:
        return np.empty((0, seq_length, n_features)), np.empty((0, 1))
    # Concatenate once; concatenating inside the loop re-copies every
    # previously collected sample on each iteration.
    return np.concatenate(x_parts, axis=0), np.concatenate(y_parts, axis=0)


X_train, Y_train = _build_windows(train_path, train_lst)

# Validation goes through the same pipeline as training; the scaler keeps the
# statistics fitted on the training data.
X_valid, Y_valid = _build_windows(valid_path, valid_lst)

# Variable is a deprecated no-op wrapper, so plain float32 tensors are used.
validX = torch.as_tensor(X_valid, dtype=torch.float32)
validY = torch.as_tensor(Y_valid, dtype=torch.float32)

In [102]:
class CustomDataset(Dataset):
    """Map-style dataset pairing input samples with their targets.

    Args:
        x: Indexable collection of input samples.
        y: Indexable collection of targets, aligned with ``x`` by position.
    """

    def __init__(self, x, y):
        self.x_data = x
        self.y_data = y

    def __len__(self):
        """Return the number of samples, taken from the length of ``x``."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of float32 tensors ``(x, y)``."""
        # torch.tensor converts the *values* it is given. The legacy
        # torch.Tensor(...) constructor treats a bare Python int as a size and
        # returns uninitialized memory, which silently corrupts scalar targets.
        # The deprecated Variable wrapper is a no-op and is dropped.
        x = torch.tensor(self.x_data[idx], dtype=torch.float32)
        y = torch.tensor(self.y_data[idx], dtype=torch.float32)
        return x, y

In [103]:
# Mini-batch size used when iterating over the training samples.
batch_size = 32

# Wrap the training arrays and serve them in shuffled mini-batches.
train_dataset = CustomDataset(x=X_train, y=Y_train)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
"""
현재 데이터셋 구축까지 완료함.

이제 모델 구축하고, 학습한 다음에
Train / Valid 성능 확인하는 부분만 남음. 최종 수정일시 : 230310 오후 8시 08분 

"""