In [1]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scipy!=1.9.2,>=1.4
  Downloading scipy-1.11.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.5/36.5 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting patsy>=0.5.2
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.8/233.8 kB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, patsy, statsmodels
  Attempting uninstall: scipy
    Found existing installation: scipy 1.9.2
    Uninstalling scipy-1.9.2:
      Successfully uninstalled scipy-1.9.2
Successfully installed patsy-0.5.3 scipy-1.11.2 statsmodels-0.1

In [13]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.seasonal import seasonal_decompose

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [15]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [16]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])

    def __len__(self):
        return len(self.X)

In [17]:
def train(model, optimizer, train_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        #val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : []')

        # if best_loss > val_loss:
        #     best_loss = val_loss
        #     best_model = model
        #     print('Model Saved')
    return model

In [18]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []

    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)

            output = model(X)
            loss = criterion(output, Y)

            val_loss.append(loss.item())
    return np.mean(val_loss)

In [19]:
def inference(model, test_loader, device):
    predictions = []

    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)

            output = model(X)

            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()

            predictions.extend(output)

    return np.array(predictions)

In [20]:
# 0 값을 보간하는 함수 정의
def improved_fill_zeros_with_median_and_selective_interpolation_v6(data):
    # 0이 아닌 값들만 추출
    non_zero_data = [value for value in data if value != 0]
    
    # 모든 값이 0인 경우, 원본 데이터 반환
    if len(non_zero_data) == 0: 
        return data
    
    # 0이 아닌 값들의 중앙값 계산
    median_value = np.median(non_zero_data)
    filled_data = data.copy()
    
    # 0이 아닌 값들의 인덱스를 찾음
    non_zero_indices = np.where(data != 0)[0]
    
    # 배열의 시작 부분에 연속된 0 값들을 중앙값과 첫 번째 0이 아닌 값으로 선형 보간
    if non_zero_indices[0] != 0:
        filled_data[:non_zero_indices[0]] = np.linspace(median_value, data[non_zero_indices[0]], non_zero_indices[0]+1)[:-1]
    
    # 배열의 끝 부분에 연속된 0 값들을 마지막 0이 아닌 값과 중앙값으로 선형 보간
    if non_zero_indices[-1] != len(data) - 1:
        filled_data[non_zero_indices[-1]+1:] = np.linspace(data[non_zero_indices[-1]], median_value, len(data)-non_zero_indices[-1])[:-1]
    
    # 0이 아닌 값들 사이의 0 값들을 선형 보간
    for i in range(len(non_zero_indices) - 1):
        start, end = non_zero_indices[i], non_zero_indices[i+1]
        filled_data[start+1:end] = np.linspace(data[start], data[end], end-start+1)[1:-1]
    
    return filled_data

def add_noise_to_interpolated_values_v2(original_data, processed_data, std_dev=0.5):
    """Add Gaussian noise to the interpolated values in the processed data."""
    noise = np.where(original_data == 0, np.random.normal(0, std_dev, len(original_data)), 0)
    noisy_processed_data = processed_data + noise
    return noisy_processed_data

def add_bootstrap_noise_to_interpolated_values(original_data, processed_data):
    """Add bootstrap noise to the interpolated values in the processed data."""
    non_zero_data = [value for value in original_data if value != 0]
    if len(non_zero_data) == 0:
        return processed_data
    noise_values = np.random.choice(non_zero_data, size=len(original_data), replace=True) - np.median(non_zero_data)
    noisy_processed_data = np.where(original_data == 0, processed_data + noise_values, processed_data)
    return noisy_processed_data


In [21]:
seed_list=list(range(41, 42))
seed_list

[41]

In [23]:
# seed_list=list(range(41, 41+20))
seed_list=list(range(41, 42))
tqdm
for seed in seed_list:
    CFG = {
    'TRAIN_WINDOW_SIZE':35, # 90일치로 학습
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':8,
    'LEARNING_RATE':0.001,
    'BATCH_SIZE':1024,
    'SEED':seed
    }

    seed_everything(CFG['SEED']) # Seed 고정

#     train_data = pd.read_csv('./train.csv').drop(columns=['ID', '제품'])
#     zero_ratios = (train_data.iloc[:,6:] == 0).mean(axis=1) *100
#     ids_with_low_zeros = zero_ratios[zero_ratios <=70].index
#     train_data_copy = train_data.copy()
#     for id in tqdm(ids_with_low_zeros):
#         sales_series = train_data_copy.loc[id, train_data_copy.columns[6:]]

#         if sales_series.iloc[0] == 0:
#             sales_series = sales_series.replace({0:np.nan}).bfill()

#         if sales_series.iloc[-1] == 0:
#             sales_series = sales_series.replace({0:np.nan}).ffill()

#         interpolated_sales = sales_series.interpolate()

#         train_data_copy.loc[id, train_data_copy.columns[6:]] = interpolated_sales.values

#     train_data = train_data_copy.copy()

    train_data = pd.read_csv('train.csv') #.drop(columns=['ID', '제품'])
        
    for i in range(len(train_data)):
        value = list(train_data.iloc[i][6:])
        interpolated_sales_data = improved_fill_zeros_with_median_and_selective_interpolation_v6(np.array(value))
        noisy_interpolated_sales_data = add_noise_to_interpolated_values_v2(np.array(value), interpolated_sales_data)
        #noisy_interpolated_sales_data = add_bootstrap_noise_to_interpolated_values(np.array(value), interpolated_sales_data)

        #train_data.iloc[i, 6:] = noisy_interpolated_sales_data
        train_data.iloc[i, 6:] = interpolated_sales_data

    # 숫자형 변수들의 min-max scaling을 수행하는 코드입니다.
    numeric_cols = train_data.columns[6:]
    # 칵 column의 min 및 max 계산
    min_values = train_data[numeric_cols].min(axis=1)
    max_values = train_data[numeric_cols].max(axis=1)
    # 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
    ranges = max_values - min_values
    ranges[ranges == 0] = 1
    # min-max scaling 수행
    train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
    # max와 min 값을 dictionary 형태로 저장
    scale_min_dict = min_values.to_dict()
    scale_max_dict = max_values.to_dict()

    def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
        '''
        학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
        data : 일별 판매량
        train_size : 학습에 활용할 기간
        predict_size : 추론할 기간
        '''
        num_rows = len(data)
        window_size = train_size + predict_size

        # encode_info를 포함하지 않으므로 shape 변경
        input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, 1))
        target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))

        for i in tqdm(range(num_rows)):
            sales_data = np.array(data.iloc[i, 6:])  # 첫 4개 열을 제외하고 가져옵니다.

            for j in range(len(sales_data) - window_size + 1):
                window = sales_data[j : j + window_size]
                input_data[i * (len(data.columns) - window_size + 1) + j] = window[:train_size].reshape(-1, 1)
                target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]

        return input_data, target_data
    
    def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
        '''
        평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
        data : 일별 판매량
        train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
        '''
        num_rows = len(data)

        # encode_info를 포함하지 않으므로 shape 변경
        input_data = np.empty((num_rows, train_size, 1))

        for i in tqdm(range(num_rows)):
            sales_data = np.array(data.iloc[i, -train_size:])
            input_data[i] = sales_data.reshape(-1, 1)  # shape을 (train_size, 1)로 변경

        return input_data
    
    train_input, train_target = make_train_data(train_data)
    test_input = make_predict_data(train_data)
    data_len = len(train_input)

    train_dataset = CustomDataset(train_input, train_target)
    train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)
    '''
    class BaseModel(nn.Module):
        def __init__(self, input_size=1, hidden_size=512, output_size=CFG['PREDICT_SIZE']):
            super(BaseModel, self).__init__()
            self.hidden_size = hidden_size
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2, hidden_size//2),
                nn.ReLU(),
                nn.Dropout(),
                nn.Linear(hidden_size//2, output_size)
            )

            self.actv = nn.ReLU()

        def forward(self, x):
            # x shape: (B, TRAIN_WINDOW_SIZE, 5)
            batch_size = x.size(0)
            hidden = self.init_hidden(batch_size, x.device)

            # LSTM layer
            lstm_out, hidden = self.lstm(x, hidden)

            # Only use the last output sequence
            last_output = lstm_out[:, -1, :]

            # Fully connected layer
            output = self.actv(self.fc(last_output))

            return output.squeeze(1)

        def init_hidden(self, batch_size, device):
            # Initialize hidden state and cell state
            return (torch.zeros(2, batch_size, self.hidden_size, device=device),
                    torch.zeros(2, batch_size, self.hidden_size, device=device))
    '''
    class BaseModel(nn.Module):
        def __init__(self, input_size=1, hidden_size1=1024, hidden_size2=512, output_size=21):
            super(BaseModel, self).__init__()
            
            # First LSTM layer
            self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True, bidirectional=True)
            
            # Second LSTM layer
            self.lstm2 = nn.LSTM(hidden_size1*2, hidden_size2, batch_first=True, bidirectional=True)
            
            # Fully connected layer
            self.fc = nn.Sequential(
                nn.Linear(hidden_size2 * 2, hidden_size2 // 2),
                nn.ReLU(),
                nn.Linear(hidden_size2 // 2, output_size)
            )

            self.actv = nn.ReLU()

        def forward(self, x):
            batch_size = x.size(0)
            
            # Initialize hidden states
            h0_1, c0_1 = self.init_hidden(batch_size, x.device, self.lstm1.hidden_size)
            h0_2, c0_2 = self.init_hidden(batch_size, x.device, self.lstm2.hidden_size)
            
            # First LSTM layer
            lstm_out1, _ = self.lstm1(x, (h0_1, c0_1))
            
            # Second LSTM layer
            lstm_out2, _ = self.lstm2(lstm_out1, (h0_2, c0_2))
            
            # Only use the last output sequence
            last_output = lstm_out2[:, -1, :]
            
            # Fully connected layer
            output = self.actv(self.fc(last_output))
            
            return output.squeeze(1)

        def init_hidden(self, batch_size, device, hidden_size):
            # Initialize hidden state and cell state for LSTM
            return (torch.zeros(2, batch_size, hidden_size, device=device),
                    torch.zeros(2, batch_size, hidden_size, device=device))



    def train(model, optimizer, train_loader, device):
        model.to(device)
        criterion = nn.MSELoss().to(device)
        best_loss = 9999999
        best_model = None

        for epoch in range(1, CFG['EPOCHS']+1):
            model.train()
            train_loss = []
            train_mae = []
            for X, Y in tqdm(iter(train_loader)):
                X = X.to(device)
                Y = Y.to(device)

                optimizer.zero_grad()

                output = model(X)
                loss = criterion(output, Y)

                loss.backward()
                optimizer.step()

                train_loss.append(loss.item())

            #val_loss = validation(model, val_loader, criterion, device)
            print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : []')

            # if best_loss > val_loss:
            #     best_loss = val_loss
            #     best_model = model
            #     print('Model Saved')
        return model

    model = BaseModel()
    optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
    infer_model = train(model, optimizer, train_loader, device)

    test_dataset = CustomDataset(test_input, None)
    test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

    pred = inference(model, test_loader, device)

    # 추론 결과를 inverse scaling
    for idx in range(len(pred)):
        pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]

    # 결과 후처리
    pred = np.round(pred, 0).astype(int)

    submit = pd.read_csv('sample_submission.csv')
    submit.iloc[:,1:] = pred
    os.makedirs('./seed_new_interpolate', exist_ok=True)
    submit.to_csv(f'bilstm_nofeature_35_imp3_noiseX_ChangeHyper_{seed}.csv',index=False)

100%|██████████| 15890/15890 [00:25<00:00, 627.68it/s]
100%|██████████| 15890/15890 [00:06<00:00, 2283.07it/s]
 50%|████▉     | 3161/6363 [23:22<23:58,  2.23it/s]

In [None]:
# sub_list = sorted(os.listdir('./seed'))
# sub_list

In [None]:
# submit = pd.read_csv('./sample_submission.csv')
# for i in sub_list[1:]:
#     df = pd.read_csv('./seed/' + i)
#     submit[submit.columns[1:]] += df[df.columns[1:]]
    
# submit[submit.columns[1:]] = submit[submit.columns[1:]] / len(sub_list[1:])
# submit[submit.columns[1:]] = np.round(submit[submit.columns[1:]])
# submit = submit.astype('int')
# submit.to_csv('./bilstm_nofeat_35_impute_seed_ensemble.csv', index=False)

In [None]:
# submit = pd.read_csv('./sample_submission.csv')
# for i in sub_list[1:7]:
#     df = pd.read_csv('./seed/' + i)
#     submit[submit.columns[1:]] += df[df.columns[1:]]
    
# submit[submit.columns[1:]] = submit[submit.columns[1:]] / len(sub_list[1:7])
# submit[submit.columns[1:]] = np.round(submit[submit.columns[1:]])
# submit = submit.astype('int')
# submit.to_csv('./bilstm_nofeat_35_impute_seed_ensemble_6.csv', index=False)