In [1]:
import random
import os
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [2]:
# Load the raw train / test tables from CSV (paths are relative to the working directory).
train = pd.read_csv('../../DATA/train.csv')
test  = pd.read_csv('../../DATA/test.csv')

In [3]:

def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also covers machines with more than one GPU; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick (possibly different) algorithms per run,
    # which works against the determinism requested above, so it must be disabled.
    torch.backends.cudnn.benchmark = False

# Preprocessing + Imputation of Data


In [4]:
# Rename verbose column names and convert the timestamp strings to datetimes.

new_column_names = dict(
    [
        ('corporation', 'corp'),
        ('location', 'loc'),
        ('supply(kg)', 'supply'),
        ('price(원/kg)', 'price'),
    ]
)

train = (
    train.rename(columns=new_column_names)
         .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)
test = (
    test.rename(columns=new_column_names)
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

# Composite series key: "<item> <corp> <loc>".
train['newitem'] = train['item'].str.cat([train['corp'], train['loc']], sep=' ')

In [5]:
def get_date_info(data) :
    """Add calendar feature columns derived from ``data['timestamp']`` in place.

    Adds ``year``, ``month``, ``day``, ``weekday``, ``quarter``, ``weekofyear``
    (ISO week), ``dayofyear`` and ``holi``. ``holi`` is 1 where ``weekday >= 6``
    and 0 otherwise.

    Args:
        data: DataFrame with a datetime-typed ``timestamp`` column; it is mutated.

    Returns:
        None.
    """
    stamp = data['timestamp'].dt

    for field in ('year', 'month', 'day', 'weekday', 'quarter'):
        data[field] = getattr(stamp, field)

    data['weekofyear'] = stamp.isocalendar().week
    data['dayofyear'] = stamp.dayofyear  # ordinal day within the year

    # Flag rows whose weekday index is 6 or more.
    data['holi'] = (data['weekday'] >= 6).astype('int64')
    
# Add the calendar feature columns to both frames (get_date_info mutates its argument in place).
get_date_info(train)
get_date_info(test)

In [6]:
# Build the list of days to impute and the price table used for imputation.
# average_price: mean price per (year, month, item, corp, loc), truncated to int.
average_price = train.groupby(['year', 'month', 'item', 'corp','loc'])['price'].mean().astype(int).reset_index()


emptydays  = ['2019-01-01',  '2019-02-05', '2019-02-06', '2019-09-13', '2019-09-14', '2020-01-01', '2020-01-25', '2020-01-27', '2020-10-01', '2020-10-02', '2020-10-03', '2021-01-01', '2021-02-12', '2021-02-13', 
'2021-09-20', '2021-09-21', '2021-09-22', '2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-09-10', '2022-09-12', '2023-01-23', '2023-01-24']
# The values for these 25 days have to be imputed.
# Parse the date strings into datetime objects so they can be compared with the timestamp column.
emptydays = [datetime.strptime(day, "%Y-%m-%d") for day in emptydays]


In [7]:
# Impute prices on the listed holidays with the monthly mean price of the same
# (year, month, item, corp, loc) group.
# The holiday rows are selected once with a vectorised membership test, so the Python loop
# only visits those rows instead of probing every row of `train` with `.loc`.
# Iterating over the actual index labels also removes the assumption of a 0..n-1 index.
holiday_rows = train.index[train['timestamp'].isin(emptydays)]
for i in holiday_rows:
    year, month, item, corp, loc = train.loc[i,'year'],train.loc[i,'month'],train.loc[i,'item'],train.loc[i,'corp'],train.loc[i,'loc']
    newprice = average_price[(average_price['year']==year) & (average_price['month']==month) & (average_price['item']==item) & (average_price['corp']==corp) & (average_price['loc']==loc)]['price']
    # average_price is grouped from `train` itself, so a matching group is expected to exist.
    train.loc[i,'price'] = newprice.values[0]

In [8]:
# Build price_data: a wide price table with one row per item and one column per calendar day.

unique_values = train['newitem'].unique()
# Daily date range covered by the table.
start_date = datetime(2019, 1, 1).date()
end_date = datetime(2023, 3, 3).date()
dates = pd.date_range(start_date, end_date, freq='D')
n_days = len(dates)  # derived from the date range instead of the hard-coded 1523

# The block-wise copy below relies on `train` holding exactly n_days consecutive rows per
# item, in the same order as `unique_values`. Fail loudly if the row count cannot fit that
# layout instead of silently building misaligned series.
if len(train) != len(unique_values) * n_days:
    raise ValueError(
        f'expected {len(unique_values)} items x {n_days} days = '
        f'{len(unique_values) * n_days} rows in train, got {len(train)}'
    )

# Create the frame: an 'item' column followed by one column per date string.
price_data = pd.DataFrame(columns=['item'] + dates.strftime('%Y-%m-%d').tolist())
price_data['item'] = unique_values

# print(len(price_data)) 39
for i in range(len(price_data)):
    # Positional slice converted to a plain array so the assignment never attempts
    # index alignment between the Series labels and the date columns.
    price_data.iloc[i, 1:] = train['price'].iloc[i * n_days:(i + 1) * n_days].to_numpy()
price_data['item'] = price_data['item'].astype(str)
# Item keys look like 'TG A J': product code, corporation letter, location letter.
price_data['corp'] = price_data['item'].str[3:4]

price_data['loca'] = price_data['item'].str[-1:]
price_data['product'] = price_data['item'].str[:2]
price_data['product_loca'] = price_data['product'].str.cat(price_data['loca'], sep=' ')
cols = price_data.columns[-4:]  # the 4 derived columns appended above
price_data = price_data[cols.tolist() + price_data.columns[:-4].tolist()]  # move them to the front

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'product loca' strings with integer class ids; `le` keeps the fitted mapping.
le = LabelEncoder()
price_data['product_loca'] = le.fit_transform(price_data['product_loca'])


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [10]:
# Display the assembled wide price table (metadata columns first, then one column per day).
price_data

Unnamed: 0,corp,loca,product,product_loca,item,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,...,2023-02-22,2023-02-23,2023-02-24,2023-02-25,2023-02-26,2023-02-27,2023-02-28,2023-03-01,2023-03-02,2023-03-03
0,A,J,TG,8,TG A J,1513.0,0.0,1728.0,1408.0,1250.0,...,2813.0,2770.0,2633.0,3155.0,0.0,2657.0,3922.0,3397.0,3195.0,3640.0
1,A,S,TG,9,TG A S,1859.0,0.0,2526.0,2134.0,2075.0,...,3444.0,3481.0,3518.0,4201.0,0.0,4166.0,4009.0,4173.0,4219.0,4089.0
2,B,J,TG,8,TG B J,1231.0,0.0,1692.0,1516.0,1471.0,...,4175.0,6216.0,3558.0,2412.0,0.0,3540.0,3141.0,6382.0,3558.0,3470.0
3,B,S,TG,9,TG B S,1512.0,0.0,1944.0,1815.0,1717.0,...,3202.0,3478.0,3939.0,3677.0,0.0,4057.0,3821.0,4037.0,4004.0,4241.0
4,C,J,TG,8,TG C J,1649.0,0.0,1965.0,1794.0,1773.0,...,4017.0,4585.0,4835.0,5550.0,0.0,5037.0,2643.0,3742.0,3983.0,5175.0
5,C,S,TG,9,TG C S,1517.0,0.0,2078.0,2002.0,1815.0,...,3228.0,3484.0,3773.0,4298.0,0.0,4180.0,4234.0,4357.0,4466.0,4748.0
6,D,J,TG,8,TG D J,1164.0,0.0,1616.0,1337.0,1234.0,...,0.0,0.0,1838.0,1829.0,0.0,0.0,0.0,0.0,0.0,0.0
7,D,S,TG,9,TG D S,1652.0,0.0,2048.0,1757.0,1719.0,...,2539.0,2955.0,3323.0,3321.0,0.0,3409.0,3236.0,4631.0,4114.0,4146.0
8,E,J,TG,8,TG E J,1167.0,0.0,1692.0,1475.0,1392.0,...,3157.0,4793.0,2403.0,1489.0,0.0,2620.0,1673.0,0.0,1904.0,1622.0
9,E,S,TG,9,TG E S,1584.0,0.0,2151.0,1829.0,2002.0,...,3400.0,3349.0,3026.0,2946.0,0.0,3418.0,3141.0,4235.0,3960.0,3791.0


# Before Deep Learning: dataset and training helpers

In [11]:
class CustomDataset(Dataset):
    """Index-based dataset over inputs ``X`` and optional targets ``Y``.

    When ``Y`` is ``None`` each item is a single input tensor; otherwise each
    item is an ``(input, target)`` tuple. Items are built with ``torch.Tensor``.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            # Inference mode: no targets available.
            return features
        return features, torch.Tensor(self.Y[index])

In [12]:
def train(model, optimizer, train_loader, device, epochs=8):
    """Train ``model`` with MSE loss and print the mean training loss per epoch.

    NOTE(review): this definition rebinds the notebook-level name ``train``, which
    earlier cells use for the training DataFrame; re-running those cells after this
    one will fail.

    Args:
        model: ``nn.Module`` to optimise; moved to ``device``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(X, Y)`` tensor batches.
        device: Device the model and batches are moved to.
        epochs: Number of passes over ``train_loader`` (defaults to 8, the
            previously hard-coded value).

    Returns:
        The trained model (the same object that was passed in).
    """
    model.to(device)
    criterion = nn.MSELoss().to(device)
    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = []
        # Pass the loader itself (not iter(loader)) so tqdm can read its length.
        for X, Y in tqdm(train_loader):
            X = X.to(device)
            Y = Y.to(device)
            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, Y)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        # No validation set is evaluated here, hence the empty "Val Loss" field.
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : []')
    return model

In [13]:
def validation(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: ``nn.Module`` to evaluate.
        val_loader: Iterable yielding ``(X, Y)`` tensor batches.
        criterion: Loss callable taking ``(output, target)``.
        device: Device the batches are moved to.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    model.eval()

    with torch.no_grad():
        batch_losses = [
            criterion(model(features.to(device)), targets.to(device)).item()
            for features, targets in tqdm(iter(val_loader))
        ]
    return np.mean(batch_losses)

In [14]:
def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return the stacked predictions.

    The model is put in eval mode for the duration of the call so that layers
    such as Dropout are disabled; without this, a model that was just trained
    (and is therefore still in train mode) produces randomly perturbed
    predictions. The previous train/eval mode is restored afterwards.

    Args:
        model: ``nn.Module`` already located on ``device``.
        test_loader: Iterable yielding input tensor batches.
        device: Device the batches are moved to.

    Returns:
        ``np.ndarray`` with one row per input sample.
    """
    predictions = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Pass the loader itself (not iter(loader)) so tqdm can read its length.
            for X in tqdm(test_loader):
                X = X.to(device)
                output = model(X)
                output = output.cpu().numpy()
                predictions.extend(output)
    finally:
        # Leave the caller's model in the mode it was handed over in.
        model.train(was_training)

    return np.array(predictions)

In [15]:
# Candidate random seeds; note that the training cells below assign seed_list again.
seed_list=[43,42,9402283]
seed_list

[43, 42, 9402283]

# Train 

In [16]:
# The model has to be trained on price_data.
# Grid search over window size, learning rate and batch size (seed fixed to 42).
# seed_list is assigned here but the seed loop below iterates over [42] only.
seed_list = [42,43,92891012]
for trainwindowsize in tqdm([28, 35, 42, 49], desc="Train Window Size"):
    for l_rate in tqdm([0.01, 0.02, 0.001, 0.0015, 0.002], desc="Learning Rate"):
        for batchisze in tqdm([32, 64, 128, 256, 512], desc="Batch Size"):
            for seed in tqdm([42], desc="Seed"):

                # Hyper-parameters of the current grid point.
                CFG = {
                'TRAIN_WINDOW_SIZE':trainwindowsize,
                'PREDICT_SIZE':28,
                'EPOCHS':10,
                'LEARNING_RATE':l_rate,
                'BATCH_SIZE':batchisze,
                'SEED':seed
                }

                seed_everything(CFG['SEED'])
                # Single-iteration loop; `shop` is not referenced in the body.
                for shop in (range(1)):
                    train_data= price_data
                    # Columns from position 5 onward (the per-day price columns).
                    numeric_cols = train_data.columns[5:]
                    def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
                        """Slice every row of ``data`` into sliding (input, target) windows.

                        The first 5 columns of ``data`` are skipped; the remaining values of
                        each row are cut into windows of ``train_size + predict_size``
                        consecutive values, advancing one position at a time.

                        Args:
                            data: DataFrame whose columns from position 5 onward hold the series.
                            train_size: Number of leading values per window used as input.
                            predict_size: Number of trailing values per window used as target.

                        Returns:
                            Tuple ``(input_data, target_data)`` with shapes
                            ``(rows * windows, train_size, 1)`` and ``(rows * windows, predict_size)``.
                        """
                        n_series = len(data)
                        span = train_size + predict_size  # e.g. 35 + 28 = 63
                        # Window start positions per row: series length (columns minus the
                        # 5 skipped ones) minus the window span, plus one.
                        per_series = len(data.columns) - span + 1 - 5

                        inputs = np.empty((n_series * per_series, train_size, 1))
                        targets = np.empty((n_series * per_series, predict_size))

                        for row in range(n_series):
                            series = np.array(data.iloc[row, 5:])  # drop the first 5 columns
                            base = row * per_series
                            for start in range(len(series) - span + 1):
                                chunk = series[start:start + span]
                                inputs[base + start] = chunk[:train_size].reshape(-1, 1)
                                targets[base + start] = chunk[train_size:]

                        return inputs, targets

                    def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
                        """Build one inference input per row from its last ``train_size`` values.

                        Args:
                            data: DataFrame whose trailing columns hold the series values.
                            train_size: Number of trailing columns taken from each row.

                        Returns:
                            Array of shape ``(len(data), train_size, 1)``.
                        """
                        n_series = len(data)
                        model_inputs = np.empty((n_series, train_size, 1))

                        for row in range(n_series):
                            recent = data.iloc[row, -train_size:]
                            model_inputs[row] = np.array(recent).reshape(-1, 1)

                        return model_inputs




                    # print(f"seed is {seed}")
                    # Window the price table into training pairs and build the inference inputs.
                    # Both helpers were defined just above, so their default sizes come from the current CFG.
                    train_input, train_target = make_train_data(train_data)
                    test_input = make_predict_data(train_data)
                    data_len = len(train_input)

                    # Shuffled mini-batches over all training windows.
                    train_dataset = CustomDataset(train_input, train_target)
                    train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)
                   
                    class BaseModel(nn.Module):
                        """Bidirectional LSTM followed by a two-layer MLP head.

                        The head reads the LSTM output at the last time step (width
                        ``2 * hidden_size`` because the LSTM is bidirectional) and maps it to
                        ``output_size`` values; a final ReLU keeps the outputs non-negative.
                        """

                        def __init__(self, input_size=1, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
                            super().__init__()
                            self.hidden_size = hidden_size
                            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
                            half_width = hidden_size // 2
                            self.fc = nn.Sequential(
                                nn.Linear(hidden_size * 2, half_width),
                                nn.ReLU(),
                                nn.Dropout(),
                                nn.Linear(half_width, output_size),
                            )

                            self.actv = nn.ReLU()

                        def init_hidden(self, batch_size, device):
                            """Return zero-filled ``(h0, c0)`` states, one slice per LSTM direction."""
                            state_shape = (2, batch_size, self.hidden_size)
                            return (torch.zeros(state_shape, device=device),
                                    torch.zeros(state_shape, device=device))

                        def forward(self, x):
                            """Map a batch-first input sequence ``x`` to the model outputs."""
                            initial_state = self.init_hidden(x.size(0), x.device)
                            sequence_out, _ = self.lstm(x, initial_state)
                            final_step = sequence_out[:, -1, :]
                            prediction = self.actv(self.fc(final_step))
                            # squeeze(1) only drops the axis when output_size is 1.
                            return prediction.squeeze(1)

                    def train(model, optimizer, train_loader, device):
                        """Train ``model`` with MSE loss for ``CFG['EPOCHS']`` epochs.

                        Args:
                            model: ``nn.Module`` to optimise; moved to ``device``.
                            optimizer: Optimizer bound to ``model``'s parameters.
                            train_loader: Iterable yielding ``(X, Y)`` tensor batches.
                            device: Device the model and batches are moved to.

                        Returns:
                            Tuple ``(model, last_loss)`` where ``last_loss`` is the mean batch
                            loss of the final epoch (0 if no epoch was run).
                        """
                        model.to(device)
                        criterion = nn.MSELoss().to(device)
                        last_loss = 0
                        n_epochs = CFG['EPOCHS']

                        for epoch in range(1, n_epochs + 1):
                            model.train()
                            batch_losses = []
                            for X, Y in train_loader:
                                X, Y = X.to(device), Y.to(device)
                                optimizer.zero_grad()
                                loss = criterion(model(X), Y)
                                loss.backward()
                                optimizer.step()

                                batch_losses.append(loss.item())

                            # Only the final epoch's mean loss is reported back to the caller.
                            if epoch == n_epochs:
                                last_loss = np.mean(batch_losses)
                        return model, last_loss
                   
                    # Fresh model and Adam optimizer for this grid point.
                    model = BaseModel()
                    optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
                    # train() here is the function defined just above; it returns (model, final-epoch mean loss).
                    infer_model , fin_loss  = train(model, optimizer, train_loader, device)
                    # Log: final training loss, learning rate, batch size, window size.
                    print(fin_loss ,l_rate, batchisze,trainwindowsize)
                    # Predict from the last TRAIN_WINDOW_SIZE values of every series (no targets).
                    test_dataset = CustomDataset(test_input, None)
                    test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
                    pred = inference(infer_model, test_loader, device)
                   
                    # print(pred.shape)
                    # print(pred)

Train Window Size:   0%|          | 0/4 [00:00<?, ?it/s]

Learning Rate:   0%|          | 0/5 [00:00<?, ?it/s]

Batch Size:   0%|          | 0/5 [00:00<?, ?it/s]

Seed:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [17]:
# The model has to be trained on price_data.
# Recorded result line in the order printed below (loss, learning rate, batch size, window size):
# 1701193.5035430838 0.002 128 49
# seed_list is assigned here but the seed loop below iterates over [42] only.
seed_list = [42,43,92891012]

# Single configuration: window 49, learning rate 0.002, batch size 128, seed 42.
for trainwindowsize in [49]:

    for l_rate in [0.002]:

        for batchisze in [128] :



            for seed in [42]:

                # Hyper-parameters of this run.
                CFG = {
                'TRAIN_WINDOW_SIZE':trainwindowsize, 
                'PREDICT_SIZE':28, 
                'EPOCHS':10,
                'LEARNING_RATE':l_rate,
                'BATCH_SIZE':batchisze,
                'SEED':seed
                }

                seed_everything(CFG['SEED'])
                # Single-iteration loop; `shop` is not referenced in the body.
                for shop in (range(1)):
                    train_data= price_data
                    # Columns from position 5 onward (the per-day price columns).
                    numeric_cols = train_data.columns[5:]
                    def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
                        """Slice every row of ``data`` into sliding (input, target) windows.

                        The first 5 columns of ``data`` are skipped; the remaining values of
                        each row are cut into windows of ``train_size + predict_size``
                        consecutive values, advancing one position at a time.

                        Args:
                            data: DataFrame whose columns from position 5 onward hold the series.
                            train_size: Number of leading values per window used as input.
                            predict_size: Number of trailing values per window used as target.

                        Returns:
                            Tuple ``(input_data, target_data)`` with shapes
                            ``(rows * windows, train_size, 1)`` and ``(rows * windows, predict_size)``.
                        """
                        n_series = len(data)
                        span = train_size + predict_size  # e.g. 35 + 28 = 63
                        # Window start positions per row: series length (columns minus the
                        # 5 skipped ones) minus the window span, plus one.
                        per_series = len(data.columns) - span + 1 - 5

                        inputs = np.empty((n_series * per_series, train_size, 1))
                        targets = np.empty((n_series * per_series, predict_size))

                        for row in range(n_series):
                            series = np.array(data.iloc[row, 5:])  # drop the first 5 columns
                            base = row * per_series
                            for start in range(len(series) - span + 1):
                                chunk = series[start:start + span]
                                inputs[base + start] = chunk[:train_size].reshape(-1, 1)
                                targets[base + start] = chunk[train_size:]

                        return inputs, targets

                    def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
                        """Build one inference input per row from its last ``train_size`` values.

                        Args:
                            data: DataFrame whose trailing columns hold the series values.
                            train_size: Number of trailing columns taken from each row.

                        Returns:
                            Array of shape ``(len(data), train_size, 1)``.
                        """
                        n_series = len(data)
                        model_inputs = np.empty((n_series, train_size, 1))

                        for row in range(n_series):
                            recent = data.iloc[row, -train_size:]
                            model_inputs[row] = np.array(recent).reshape(-1, 1)

                        return model_inputs




                    # print(f"seed is {seed}")
                    # Window the price table into training pairs and build the inference inputs.
                    # Both helpers were defined just above, so their default sizes come from the current CFG.
                    train_input, train_target = make_train_data(train_data)
                    test_input = make_predict_data(train_data)
                    data_len = len(train_input)

                    # Shuffled mini-batches over all training windows.
                    train_dataset = CustomDataset(train_input, train_target)
                    train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)
                    
                    class BaseModel(nn.Module):
                        """Bidirectional LSTM followed by a two-layer MLP head.

                        The head reads the LSTM output at the last time step (width
                        ``2 * hidden_size`` because the LSTM is bidirectional) and maps it to
                        ``output_size`` values; a final ReLU keeps the outputs non-negative.
                        """

                        def __init__(self, input_size=1, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
                            super().__init__()
                            self.hidden_size = hidden_size
                            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
                            half_width = hidden_size // 2
                            self.fc = nn.Sequential(
                                nn.Linear(hidden_size * 2, half_width),
                                nn.ReLU(),
                                nn.Dropout(),
                                nn.Linear(half_width, output_size),
                            )

                            self.actv = nn.ReLU()

                        def init_hidden(self, batch_size, device):
                            """Return zero-filled ``(h0, c0)`` states, one slice per LSTM direction."""
                            state_shape = (2, batch_size, self.hidden_size)
                            return (torch.zeros(state_shape, device=device),
                                    torch.zeros(state_shape, device=device))

                        def forward(self, x):
                            """Map a batch-first input sequence ``x`` to the model outputs."""
                            initial_state = self.init_hidden(x.size(0), x.device)
                            sequence_out, _ = self.lstm(x, initial_state)
                            final_step = sequence_out[:, -1, :]
                            prediction = self.actv(self.fc(final_step))
                            # squeeze(1) only drops the axis when output_size is 1.
                            return prediction.squeeze(1)

                    def train(model, optimizer, train_loader, device):
                        """Train ``model`` with MSE loss for ``CFG['EPOCHS']`` epochs.

                        Args:
                            model: ``nn.Module`` to optimise; moved to ``device``.
                            optimizer: Optimizer bound to ``model``'s parameters.
                            train_loader: Iterable yielding ``(X, Y)`` tensor batches.
                            device: Device the model and batches are moved to.

                        Returns:
                            Tuple ``(model, last_loss)`` where ``last_loss`` is the mean batch
                            loss of the final epoch (0 if no epoch was run).
                        """
                        model.to(device)
                        criterion = nn.MSELoss().to(device)
                        last_loss = 0
                        n_epochs = CFG['EPOCHS']

                        for epoch in range(1, n_epochs + 1):
                            model.train()
                            batch_losses = []
                            for X, Y in train_loader:
                                X, Y = X.to(device), Y.to(device)
                                optimizer.zero_grad()
                                loss = criterion(model(X), Y)
                                loss.backward()
                                optimizer.step()

                                batch_losses.append(loss.item())

                            # Only the final epoch's mean loss is reported back to the caller.
                            if epoch == n_epochs:
                                last_loss = np.mean(batch_losses)
                        return model, last_loss
                    
                    # Fresh model and Adam optimizer for this run.
                    model = BaseModel()
                    optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
                    # train() here is the function defined just above; it returns (model, final-epoch mean loss).
                    infer_model , fin_loss  = train(model, optimizer, train_loader, device)
                    # Log: final training loss, learning rate, batch size, window size.
                    print(fin_loss ,l_rate, batchisze,trainwindowsize)
                    # Predict from the last TRAIN_WINDOW_SIZE values of every series (no targets).
                    test_dataset = CustomDataset(test_input, None)
                    test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
                    pred = inference(infer_model, test_loader, device)
                    
                    # print(pred.shape)
                    # print(pred)


       

1701193.5035430838 0.002 128 49


  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# pred.shape
# Flatten the per-series predictions into a single column (row-major, so each series'
# PREDICT_SIZE values stay contiguous).
pred = pred.reshape(-1 , 1)
pred.shape

(1092, 1)

In [20]:
# Load the submission template (columns: ID, answer).
submit = pd.read_csv('./../../DATA/sample_submission.csv')
submit

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [22]:
# Write the flattened predictions into the template.
# Assumes the row order of `pred` matches the ID order of the template; TODO confirm.
submit['answer'] = pred 

In [24]:
# Clamp non-positive predictions to 0.
submit.loc[submit['answer'] <= 0, 'answer'] = 0
# Zero every row whose index is 1 modulo 7 (the first such row is ID ..._20230305).
# Presumably these are the weekly no-trade days; this relies on each item occupying a
# multiple of 7 consecutive rows starting at 20230304. Verify against the template.
submit.loc[submit.index % 7 == 1, 'answer'] = 0

In [26]:
# Save the submission file without the index column.
submit.to_csv('../../DATA/SUBMIT/lstm1.csv', index=False)