In [1]:
import os
import signal
import itertools
from tqdm.notebook import tqdm

import gym
import numpy as np
np.set_printoptions(precision=5)
import pandas as pd
import math

from module.simulator import Simulator

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

torch.manual_seed(77)

<torch._C.Generator at 0x7f63c051d4f0>

In [2]:
### Hyper parameters ###
# PPO clipping half-width: the probability ratio is clamped to
# [1 - clip_range, 1 + clip_range] in PPO.train_net.
clip_range = 0.2
# Discount factor used for the TD target and the advantage recursion.
gamma = 1.0
# Advantage-recursion decay (multiplied with gamma in PPO.train_net).
lam = 1.0
# Base learning rate handed to the Adam optimizer in PPO.__init__.
learning_rate = 0.0005

# Maximum number of environment steps collected before each train_net() call.
# NOTE(review): 24 * 7 * 14 looks like hourly steps over 14 weeks -- confirm.
HORIZON = 24 * 7 * 14
# Number of optimization passes PPO.train_net makes over one collected rollout.
train_iter = 3

# An epoch checkpoint is written every `save_interval` epochs in main().
save_interval = 100

# True -> train
# False -> inference
# Passed to myEnv; when False, main() switches the model to eval mode.
is_train = True

In [3]:
class myEnv(gym.Env):
    """Two-line (A/B) scheduling environment scored by the project ``Simulator``.

    Each step assigns one event per line (``CHECK_1``..``CHECK_4`` or
    ``PROCESS``) to the next row of the submission frame.  When every row is
    filled the simulator scores the plan and a coarse terminal reward (0/1/2)
    is returned; all intermediate rewards are 0.

    The observation is a flat array: the two per-line process timers followed
    by a 28-row x 4-column window of the shortage table.
    """

    def __init__(self, is_train):
        """Load the submission template and pre-compute the shortage table.

        Args:
            is_train: stored on the instance; not otherwise read by this class.
        """
        self.is_train = is_train
        self.simulator = Simulator()

        self.submission_ini = pd.read_csv("data/sample_submission.csv")
        self.submission_ini['time'] = pd.to_datetime(self.submission_ini['time'])
        self.submission_ini[['Event_A', 'Event_B']] = 'STOP'
        self.submission_ini[['MOL_A', 'MOL_B']] = 0

        over_prod, _ = self.simulator.initialize_over_prod()
        # Keep only the negative part (shortage) and flip its sign.
        over_prod[over_prod > 0] = 0
        shortage_ini = -over_prod
        for i in range(4):
            # Replace each positive entry by its difference from the previous
            # positive entry in the same column.
            # NOTE(review): presumably converts a cumulative shortage into
            # per-row increments -- confirm against Simulator.
            temp_arr = shortage_ini[:, i].copy()
            temp_idx = np.where(temp_arr > 0)[0]
            temp_values = temp_arr[temp_idx]
            temp_values[1:] -= temp_values[:-1].copy()
            shortage_ini[temp_idx, i] = temp_values
        # Zero-pad so the look-ahead window in _get_state never runs off the end.
        self.shortage_ini = np.append(shortage_ini, np.zeros((31, 4)), axis=0)

        self.best_mean_score = 0
        self.best_score = 0
        self.mean_score = []

    def _get_state(self):
        """Build the observation for the current ``step_count``."""
        day = self.step_count // 24
        return np.concatenate((np.array(list(self.process_time.values())),
                               self.shortage[day + 3:day + 31].reshape(-1)))

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.submission = self.submission_ini.copy()
        self.shortage = self.shortage_ini.copy()
        self.step_count = 0
        # Column positions of the event columns, used for positional writes in step().
        self._event_col = {line: self.submission.columns.get_loc('Event_' + line)
                           for line in ('A', 'B')}

        self.mask = {'A': np.zeros([5]), 'B': np.zeros([5])}
        self.event_map = {0:'CHECK_1', 1:'CHECK_2', 2:'CHECK_3', 3:'CHECK_4', 4:'PROCESS'}
        self.check_time = {'A': 28, 'B': 28}
        self.process = {'A': 0, 'B': 0}
        self.process_mode = {'A': 0, 'B': 0}
        self.process_time = {'A': 0, 'B': 0}

        return self._get_state()

    def update_mask(self):
        """Recompute, per line, which of the five actions are currently allowed.

        Indices 0-3 are CHECK_1..CHECK_4 and index 4 is PROCESS.
        """
        for line in ['A', 'B']:
            self.mask[line][:] = 0
            if self.process[line] == 0:
                if self.check_time[line] == 28:
                    # No check in progress: any CHECK may start.
                    self.mask[line][:4] = 1
                if self.check_time[line] < 28:
                    # A check is in progress: only the same CHECK may continue.
                    self.mask[line][self.process_mode[line]] = 1
            if self.process[line] == 1:
                self.mask[line][4] = 1
                if self.process_time[line] > 98:
                    # After more than 98 PROCESS steps a new CHECK is allowed too.
                    self.mask[line][:4] = 1

    def save_csv(self, submission, score):
        """Add PRT input to ``submission`` in place and write it to a CSV.

        The file is named ``save_score_<score>.csv``.  Because the PRT columns
        are incremented in place, calling this twice on the same frame adds
        the PRT amounts twice.
        """
        # PRT input
        for line in ['A', 'B']:
            # Last character of the event: a digit for CHECK_n; 'S' (PROCESS)
            # and 'P' (STOP) are blanked and forward-filled with the last digit.
            state = (submission['Event_'+line].str[-1]
                     .replace('S', np.nan).replace('P', np.nan).ffill())
            dates = (submission['time'] - np.timedelta64(23, 'D')).dt.date
            # Select the column before aggregating so non-numeric columns are
            # never summed.
            mol_input = submission.groupby([state, dates])['MOL_'+line].sum()
            for item in state.unique():
                PRT_delta = mol_input.loc[item]
                PRT_delta = PRT_delta.loc[(PRT_delta.index >= submission.iloc[0, 0]) & (PRT_delta.index <= submission.iloc[-1, 0])]
                submission.loc[submission['time'].isin(PRT_delta.index), 'PRT_'+item] += (PRT_delta.values * 1.2).astype(int)
        submission.to_csv(f"save_score_{score:.2f}.csv", index=False)

    def step(self, action_A, action_B, n_epoch):
        """Apply one action per line and advance one row of the submission.

        Args:
            action_A: action index (key of ``event_map``) for line A.
            action_B: action index (key of ``event_map``) for line B.
            n_epoch: zero-based epoch number, used only for periodic reporting.

        Returns:
            ``(state, reward, done, info)``.  ``info`` is
            ``{'saved': bool, 'score': float}``; ``score`` is only filled in
            when a new best result was written to disk.
        """
        event_A = self.event_map[action_A]
        event_B = self.event_map[action_B]

        # Single positional write; chained ``df[col].iloc[i] = v`` triggers
        # SettingWithCopyWarning and is not guaranteed to modify the frame.
        row = self.step_count
        self.submission.iloc[row, self._event_col['A']] = event_A
        self.submission.iloc[row, self._event_col['B']] = event_B

        # Advance the per-line CHECK/PROCESS counters.
        for event, line in zip([event_A, event_B], ['A', 'B']):
            if 'CHECK' in event:
                if self.process[line] == 1:
                    # Leaving PROCESS for a fresh check.
                    self.process[line] = 0
                    self.check_time[line] = 28
                self.check_time[line] -= 1
                self.process_mode[line] = int(event[-1]) - 1
                if self.check_time[line] == 0:
                    # Check finished: PROCESS becomes available.
                    self.process[line] = 1
                    self.process_time[line] = 0
            elif event == 'PROCESS':
                self.process_time[line] += 1
                if self.process_time[line] == 140:
                    # PROCESS is capped at 140 steps; a new check is then required.
                    self.process[line] = 0
                    self.check_time[line] = 28

        info = {'saved': False, 'score': 0}
        self.step_count += 1
        if self.step_count == self.submission.shape[0]:
            done = True
            self.submission, _, score = self.simulator.get_score(self.submission)
            # Coarse reward relative to the best 5-epoch mean seen so far.
            reward = score - self.best_mean_score
            if reward > 0:
                reward = 2
            elif reward > -1:
                reward = 1
            else:
                reward = 0

            if (score > 62) and (score > self.best_score):
                self.save_csv(self.submission, score)
                print('result saved (score : %s)' % score)
                self.best_score = score
                info = {'saved': True, 'score': score}

            self.mean_score.append(score)
            if (n_epoch+1) % 5 == 0:
                # Divide by the number of scores actually collected.
                mean_score = sum(self.mean_score) / len(self.mean_score)
                if mean_score > self.best_mean_score:
                    self.best_mean_score = mean_score
                print('Epoch %s~%s mean score : %s / best mean score : %s' 
                      % (n_epoch-3, n_epoch+1, mean_score, self.best_mean_score))
                self.mean_score = []

        else:
            done = False
            reward = 0

        state = self._get_state()

        return state, reward, done, info

In [4]:
class PPO(nn.Module):
    """Actor-critic network with two policy heads (lines A and B) and a value head.

    Also owns the rollout buffer and the Adam optimizer, and implements the
    clipped-surrogate PPO update in ``train_net``.
    """

    def __init__(self):
        super(PPO, self).__init__()
        self.buffer = []

        input_shape = 4 * 28 + 2  # orders (shortage) for the next 4 weeks & process time
        hidden_size_1 = input_shape * 2
        hidden_size_2 = input_shape * 2
        hidden_size_3 = input_shape
        output_shape = 5  # CHECK 1, 2, 3, 4 & PROCESS

        self.fc1 = nn.Linear(input_shape, hidden_size_1)
        self.fc2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.fc3 = nn.Linear(hidden_size_2, hidden_size_3)

        self.fc_pi_A = nn.Linear(hidden_size_3, output_shape)
        self.fc_pi_B = nn.Linear(hidden_size_3, output_shape)
        self.fc_v  = nn.Linear(hidden_size_3, 1)

        # Incremented by the training loop; drives the learning-rate decay.
        self.epoch = 1
        self.optimizer = optim.Adam(self.parameters(), lr=self._current_lr())

    def _current_lr(self):
        """Learning rate for the current epoch (base rate / (epoch // 200 + 1))."""
        return learning_rate / (self.epoch // 200 + 1)

    def _features(self, x):
        """Shared trunk used by both the policy heads and the value head."""
        # fc1 has no activation, matching the original architecture so that
        # existing checkpoints keep producing the same outputs.
        x = self.fc1(x)
        x = torch.tanh(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x

    def pi(self, x, env=None, update=False, softmax_dim=0):
        """Return action probabilities for lines A and B.

        Args:
            x: a single state (``softmax_dim=0``) or a batch of states
                (``softmax_dim=1``).
            env: environment providing ``mask`` and ``update_mask()``; required
                when ``softmax_dim == 0`` or ``update`` is true.
            update: when true, ``env.update_mask()`` is called first.
            softmax_dim: dimension over which the softmax is taken.

        Returns:
            ``(prob_A, prob_B)``.  With ``softmax_dim == 0`` the probabilities
            are multiplied by the environment's action masks and are NOT
            renormalized; with ``softmax_dim == 1`` they are unmasked.
        """
        if update:
            env.update_mask()
        x = self._features(x)
        prob_A = F.softmax(self.fc_pi_A(x), dim=softmax_dim)
        prob_B = F.softmax(self.fc_pi_B(x), dim=softmax_dim)

        if softmax_dim == 0:
            prob_A = prob_A * torch.tensor(env.mask['A'])
            prob_B = prob_B * torch.tensor(env.mask['B'])

        return prob_A, prob_B

    def v(self, x):
        """Return the state-value estimate for ``x``."""
        return self.fc_v(self._features(x))

    def store(self, transition):
        """Append one transition tuple to the rollout buffer.

        Expected layout: ``(state, action_A, action_B, reward, next_state,
        prob_action_A, prob_action_B, done)``.
        """
        self.buffer.append(transition)

    def train_net(self):
        """Run ``train_iter`` PPO updates on the buffered rollout, then clear it."""
        if not self.buffer:
            return
        batch, self.buffer = self.buffer, []

        # Stack through numpy first: building a tensor from a list of
        # ndarrays is very slow.
        states = torch.tensor(np.array([e[0] for e in batch]), dtype=torch.float)
        actions_A = torch.tensor([[e[1]] for e in batch], dtype=torch.long)
        actions_B = torch.tensor([[e[2]] for e in batch], dtype=torch.long)
        rewards = torch.tensor([[e[3]] for e in batch], dtype=torch.float)
        next_states = torch.tensor(np.array([e[4] for e in batch]), dtype=torch.float)
        prob_actions_A = torch.tensor([[e[5]] for e in batch], dtype=torch.float)
        prob_actions_B = torch.tensor([[e[6]] for e in batch], dtype=torch.float)
        # 1.0 while the episode continues, 0.0 on the terminal transition.
        not_done = torch.tensor([[1.0 - float(e[7])] for e in batch], dtype=torch.float)

        # The optimizer was built when epoch == 1, so the decay has to be
        # re-applied here for it to ever take effect.
        lr = self._current_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = lr

        for _ in range(train_iter):
            td_target = rewards + gamma * self.v(next_states) * not_done
            delta = td_target - self.v(states)
            delta = delta.detach().numpy()

            # Backward recursion: A_t = delta_t + gamma * lam * A_{t+1}.
            advantages = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lam * advantage + delta_t[0]
                advantages.append([advantage])
            advantages.reverse()
            advs = torch.tensor(advantages, dtype=torch.float)

            new_probs_A, new_probs_B = self.pi(states, softmax_dim=1)

            new_prob_actions_A = new_probs_A.gather(1, actions_A)
            new_prob_actions_B = new_probs_B.gather(1, actions_B)

            ratio_A = torch.exp(torch.log(new_prob_actions_A) - torch.log(prob_actions_A))
            ratio_B = torch.exp(torch.log(new_prob_actions_B) - torch.log(prob_actions_B))

            surr1_A = ratio_A * advs
            surr2_A = torch.clamp(ratio_A, 1-clip_range, 1+clip_range) * advs

            surr1_B = ratio_B * advs
            surr2_B = torch.clamp(ratio_B, 1-clip_range, 1+clip_range) * advs

            pi_loss_A = -torch.mean(torch.min(surr1_A, surr2_A))
            pi_loss_B = -torch.mean(torch.min(surr1_B, surr2_B))
            vf_loss = torch.mean(torch.pow(self.v(states) - td_target.detach(), 2))

            loss = pi_loss_A + pi_loss_B + vf_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

In [5]:
class GracefulKiller:
    """Record SIGINT/SIGTERM in a flag instead of interrupting the process.

    After construction, receiving either signal sets ``kill_now`` to True;
    callers are expected to poll the flag at a convenient point.
    """

    def __init__(self):
        self.kill_now = False
        for signum in (signal.SIGINT, signal.SIGTERM):
            signal.signal(signum, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        """Signal handler: only raises the flag."""
        self.kill_now = True

In [6]:
def main():
    """Run the PPO loop: collect rollouts, update the model, checkpoint.

    Loads ``save.pt`` when it exists.  Every ``save_interval`` epochs an epoch
    checkpoint is written and, if a stop was requested (by signal, or
    automatically once epoch index 999 is reached), the user is asked whether
    to terminate.
    """
    env = myEnv(is_train)
    killer = GracefulKiller()

    model = PPO()

    if os.path.exists("save.pt"):
        print("model loaded!")
        model.load_state_dict(torch.load("save.pt")["model"])

    if not is_train:
        model.eval()

    for epoch_idx in tqdm(itertools.count()):
        state = env.reset()
        done = False
        while not done:
            # Collect at most HORIZON transitions, then run one update.
            for _ in range(HORIZON):
                observation = torch.from_numpy(state).float()
                prob_A, prob_B = model.pi(observation, env=env, update=True)

                action_A = Categorical(prob_A).sample().item()
                action_B = Categorical(prob_B).sample().item()
                # Masked softmax values of the sampled actions, recorded as
                # the "old" probabilities for the PPO ratio.
                old_prob_A = prob_A[action_A].item()
                old_prob_B = prob_B[action_B].item()

                next_state, reward, done, info = env.step(action_A, action_B, epoch_idx)
                if info['saved']:
                    torch.save({"model": model.state_dict()}, f"save_score_{info['score']:.2f}.pt")

                transition = (state, action_A, action_B, reward,
                              next_state, old_prob_A, old_prob_B, done)
                model.store(transition)

                state = next_state
                if done:
                    break
            model.train_net()
        model.epoch += 1

        # Request a stop prompt once 1000 epochs have run.
        if epoch_idx == 999:
            killer.kill_now = True

        epochs_done = epoch_idx + 1
        if epochs_done % save_interval == 0:
            torch.save({"model": model.state_dict()}, f"save_epoch_{epochs_done}.pt")
            if killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                killer.kill_now = False

In [7]:
%%time
# Entry point: runs the loop defined in main(); %%time reports the cell's
# total wall/CPU time once the loop exits.
if __name__ == '__main__':
    main()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Epoch 1~5 mean score : 55.759431326328695 / best mean score : 55.759431326328695
Epoch 6~10 mean score : 56.52649619072694 / best mean score : 56.52649619072694
Epoch 11~15 mean score : 55.145271871715714 / best mean score : 56.52649619072694
Epoch 16~20 mean score : 55.77624054440393 / best mean score : 56.52649619072694
Epoch 21~25 mean score : 55.16228323517404 / best mean score : 56.52649619072694
Epoch 26~30 mean score : 56.07618626007386 / best mean score : 56.52649619072694
Epoch 31~35 mean score : 56.83494233480375 / best mean score : 56.83494233480375
Epoch 36~40 mean score : 57.73759667572225 / best mean score : 57.73759667572225
Epoch 41~45 mean score : 57.2215808936362 / best mean score : 57.73759667572225
Epoch 46~50 mean score : 57.250647679337206 / best mean score : 57.73759667572225
Epoch 51~55 mean score : 58.42926690348603 / best mean score : 58.42926690348603
Epoch 56~60 mean score : 58.576292278875826 / best mean score : 58.576292278875826
Epoch 61~65 mean score : 5

In [None]:
def predict():
    """Roll out one greedy episode with the saved model and write the plan to CSV.

    Loads ``save.pt`` when it exists, picks the highest-probability allowed
    action for each line at every step until the environment reports ``done``,
    and saves the scored submission as ``save_score_<score>.csv``.
    """
    is_train = False
    env = myEnv(is_train)
    # No GracefulKiller here: nothing in this function polls its flag, so
    # installing it would only make Ctrl-C / SIGTERM silently ignored.

    model = PPO()

    if os.path.exists("save.pt"):
        print("model loaded!")
        checkpoint = torch.load("save.pt")
        model.load_state_dict(checkpoint["model"])

    model.eval()

    state = env.reset()
    done = False
    info = {'saved': False, 'score': 0}
    with torch.no_grad():
        # Run to the end of the episode rather than a fixed HORIZON, so the
        # submission is always scored before it is saved.
        while not done:
            prob_A, prob_B = model.pi(torch.from_numpy(state).float(), env=env, update=True)
            action_A = int(np.argmax(prob_A.numpy()))
            action_B = int(np.argmax(prob_B.numpy()))

            state, _, done, info = env.step(action_A, action_B, 0)

    if info['saved']:
        # env.step already wrote this result; save_csv adds PRT in place, so
        # calling it again would add the PRT amounts a second time.
        return

    # env.step appends the episode score to env.mean_score; env.best_score is
    # only updated for scores above its save threshold and would otherwise
    # label the file with 0.
    score = env.mean_score[-1] if env.mean_score else env.best_score
    env.save_csv(env.submission, score)

In [None]:
# predict()