## Imports

In [None]:
import random
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from joblib import Parallel, delayed

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

## hyperparameter setting

In [None]:
# Run-wide settings shared by the cells below.
CFG = {
    'TRAIN_WINDOW_SIZE': 90,  # default `train_size` (rows per input window)
    'PREDICT_SIZE': 21,  # default `predict_size` (rows per target window)
    'EPOCHS': 10,  # not referenced in this part of the file
    'SEED': 41,  # value handed to seed_everything
}

In [None]:
def seed_everything(seed):
    """Seed the random number generators used in this file.

    Exports ``PYTHONHASHSEED`` to the environment and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # Fix the random seeds

## loading data

In [None]:
# Read the preprocessed training table, discard the '제품' column and
# replace missing values with 0.
train1 = pd.read_csv('preprocess_train_data.csv')
train1 = train1.drop(columns=['제품'])
train1 = train1.fillna(0)
train1

In [None]:
# Record each ID's minimum and maximum '판매량' as currently stored in train1.
groups = train1.groupby('ID')

scale_min_dict, scale_max_dict = {}, {}

for name, group in groups:
    sales = group['판매량']
    scale_min_dict[name], scale_max_dict[name] = sales.min(), sales.max()

In [None]:
scaler = MinMaxScaler()


def scale_series(s):
    """Min-max scale one Series, refitting the module-level ``scaler`` on it.

    Args:
        s: Series of values to rescale.

    Returns:
        A Series of the scaled values carrying the same index as ``s``.
    """
    column = s.values.reshape(-1, 1)
    scaled = scaler.fit_transform(column)
    return pd.Series(scaled.flatten(), index=s.index)


# '판매량' is scaled separately within each ID.
sales_by_id = train1.groupby('ID')['판매량']
train1['판매량'] = sales_by_id.transform(scale_series)

# '개당판매금액' is scaled once over the whole table with a fresh scaler.
scaler = MinMaxScaler()
unit_price = train1['개당판매금액'].values.reshape(-1, 1)
train1['개당판매금액'] = scaler.fit_transform(unit_price)
train1.head()

In [None]:
# Move '판매량' to the last column position; the windowing functions below
# read the target positionally with iloc[:, -1:].
target = train1.pop('판매량')
train1['판매량'] = target

## data preprocessing

In [None]:
# Label-encode the categorical columns in place; the single encoder is
# refit on each column in turn.
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

for col in categorical_columns:
    train1[col] = label_encoder.fit_transform(train1[col])

In [None]:
def process_group(group, train_size, predict_size):
    """Cut one ID's rows into sliding (input, target) windows.

    Every window of ``train_size`` consecutive rows becomes one input sample
    and the ``predict_size`` rows that follow it become its target. Each
    input sample holds the group's non-target columns followed by three
    appended columns, in this order: ``'mean'``, ``'std'`` and ``'판매량'``.
    ``'mean'``/``'std'`` are the rolling mean and standard deviation of the
    target over exactly the rows of that input window, so no target row
    contributes to the input features.

    Args:
        group: DataFrame with the rows of a single ID. The target column
            ``'판매량'`` must be the last column; it is read positionally.
        train_size: Number of rows in each input window.
        predict_size: Number of rows in each target window.

    Returns:
        A tuple ``(input_data, target_data)`` of equally long lists.
        ``input_data[i]`` is a 2-D array with ``train_size`` rows;
        ``target_data[i]`` is a ``(predict_size, 1)`` array. Both lists are
        empty when the group has fewer than ``train_size + predict_size``
        rows.
    """
    group = group.reset_index(drop=True)
    window_size = train_size + predict_size

    # Kept as a one-column frame so target windows stay 2-D.
    sale_data = group.iloc[:, -1:]
    rolling = sale_data.rolling(window=train_size)
    means = rolling.mean().values.flatten()
    stds = rolling.std().values.flatten()

    features = group.drop(columns=['판매량'])

    input_data, target_data = [], []
    # "+ 1" keeps the final complete window (a group of exactly
    # window_size rows yields one sample).
    for start in range(len(group) - window_size + 1):
        stop = start + train_size
        # The rolling value at position stop - 1 covers rows [start, stop),
        # i.e. exactly the input window.
        window = features.iloc[start:stop].assign(**{
            'mean': means[stop - 1],
            'std': stds[stop - 1],
            '판매량': sale_data.iloc[start:stop, 0].values,
        })
        input_data.append(window.values)
        target_data.append(sale_data.iloc[stop:start + window_size].values)
    return input_data, target_data

def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    """Build the training arrays by windowing every ID group.

    Each ``'ID'`` group of ``data`` is passed to ``process_group`` through
    joblib (``n_jobs=-1``) and the per-group windows are concatenated in
    group order.

    Args:
        data: DataFrame with an ``'ID'`` column, in the layout expected by
            ``process_group``.
        train_size: Number of rows in each input window.
        predict_size: Number of rows in each target window.

    Returns:
        A tuple ``(inputs, targets)``. ``inputs`` is the array of all input
        windows; ``targets`` has shape ``(n_windows, predict_size)``.
    """
    grouped = data.groupby('ID')

    results = Parallel(n_jobs=-1)(
        delayed(process_group)(group, train_size, predict_size)
        for _, group in tqdm(grouped, desc='Processing Groups')
    )

    input_data, target_data = [], []
    for group_inputs, group_targets in results:
        input_data.extend(group_inputs)
        target_data.extend(group_targets)

    # Each target window is (predict_size, 1). Drop only that trailing unit
    # axis: a bare np.squeeze would also remove the sample axis when there
    # is a single window, or the horizon axis when predict_size == 1.
    targets = np.array(target_data).reshape(len(target_data), predict_size)
    return np.array(input_data), targets

In [None]:
# Window the full table into training inputs and targets (default sizes from CFG).
train1_input, train1_target = make_train_data(train1)

In [None]:
def make_test_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    """Build one inference window per ID from the most recent rows.

    For every ``'ID'`` group the last ``train_size`` rows are returned with
    the non-target columns first, followed by ``'mean'``, ``'std'`` and
    ``'판매량'``. ``'mean'``/``'std'`` are the rolling mean and standard
    deviation of the target over those same last ``train_size`` rows.

    Args:
        data: DataFrame with an ``'ID'`` column and the target ``'판매량'``
            as its last column (the target is read positionally for the
            rolling statistics).
        train_size: Number of rows in each returned window.
        predict_size: Unused; kept so existing callers keep working.

    Returns:
        Array stacking one ``train_size``-row window per ID, in group order.
    """
    test_data = []
    data_group = data.groupby('ID')

    for _, group in tqdm(data_group):
        sale_data = group.iloc[:, -1:]
        rolling = sale_data.rolling(window=train_size)
        # The last rolling value covers exactly the final train_size rows,
        # i.e. the rows returned as this group's window.
        last_mean = rolling.mean().values.flatten()[-1]
        last_std = rolling.std().values.flatten()[-1]

        recent = group.tail(train_size)
        # assign() builds a new frame, so the groupby chunk is never
        # mutated; '판매량' is re-added last to fix the column order.
        window = recent.drop(columns=['판매량']).assign(**{
            'mean': last_mean,
            'std': last_std,
            '판매량': recent['판매량'].values,
        })

        test_data.append(window.values)
    return np.array(test_data)

In [None]:
# Build one inference window per ID from the same table used for training.
test_input = make_test_data(train1)

In [None]:
# Notebook display: shapes of the three arrays, as a quick sanity check.
train1_input.shape, train1_target.shape, test_input.shape

## Save arrays as .npy files

In [None]:
# np.save does not create missing parent directories, so make sure the
# output folder exists before writing; otherwise the run fails only after
# all the windowing work is done.
os.makedirs('./data/new_data', exist_ok=True)
np.save('./data/new_data/train1_input_mean_stds.npy', train1_input)
np.save('./data/new_data/train1_target_mean_stds.npy', train1_target)
np.save('./data/new_data/test_input_mean_stds.npy', test_input)