In [None]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
seed = 42


def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` with the same value.

    NOTE: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so the
    assignment below is only picked up by processes launched afterwards.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.
    """
    random.seed(seed)
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    np.random.seed(seed)


# Fix the seeds once, before any data is loaded or any model is built.
seed_everything(seed)

In [None]:
# Every input CSV lives in the same folder; keep the location in one place so
# the notebook can be pointed at another directory with a single edit.
DATA_DIR = '/content/drive/MyDrive/lgaimers3'

p_info = pd.read_csv(f'{DATA_DIR}/product_info.csv')
brand = pd.read_csv(f'{DATA_DIR}/brand_keyword_cnt.csv')
sales = pd.read_csv(f'{DATA_DIR}/sales.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')

# Preview the wide training table: six identifier columns followed by one
# column per calendar day.
train.head()

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,2,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,B002-00002-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,B002-00003-00001,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Special calendar dates as 'YYYY-MM-DD' strings, matched against the raw 'date'
# values in make_dataset.
# NOTE(review): presumably South Korean public holidays for 2022-2023 - confirm.
hols = ['2022-01-01', '2022-01-31', '2022-02-01', '2022-02-02', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15',
        '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25', '2023-01-01', '2023-01-21', '2023-01-22',
        '2023-01-23', '2023-01-24', '2023-03-01']

def make_dataset(df):
    """Add calendar, rolling, difference, lag and label columns to one daily series.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per day, in chronological order, with the columns 'date',
        'volume', 'price' and 'brand'. 'date' values are compared as-is against
        the strings in ``hols``, so they are expected to be 'YYYY-MM-DD' strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with these extra columns:

        * 'day', 'month', 'year', 'week' - calendar parts of 'date'
          ('day' is the weekday number, Monday = 0).
        * 'holiday' - 1 when the day is a Saturday/Sunday or listed in ``hols``,
          otherwise 0.
        * '{7,14}_vol_{avg,max,min,max_min}' - rolling statistics of 'volume'.
        * 'diff_{1..7}_{volume,price,brand}' - i-day differences.
        * 'lagging_{1..7}_volume' - 'volume' shifted i days into the past.
        * 'label_{1..21}' - 'volume' i days into the future (NaN where the
          series ends before that day).
    """
    new_df = df.copy()
    date = pd.to_datetime(new_df['date'])

    new_df['day'] = date.dt.weekday
    new_df['month'] = date.dt.month
    new_df['year'] = date.dt.year
    new_df['week'] = date.dt.isocalendar().week.astype(np.uint8)

    # A day counts as a holiday when it is a weekend day OR a listed date.
    # Previously the weekend flag was written to 'holiday' and then immediately
    # overwritten by the listed-date flag, so weekends were never marked.
    listed_dates = set(hols)
    is_weekend = new_df['day'] >= 5
    is_listed = new_df['date'].map(lambda x: x in listed_dates).astype(bool)
    new_df['holiday'] = (is_weekend | is_listed).astype(int)

    # Rolling statistics of the volume; the first window_size - 1 rows are NaN.
    for window_size in [7, 14]:
        window = new_df['volume'].rolling(window=window_size)
        new_df[f'{window_size}_vol_avg'] = window.mean()
        new_df[f'{window_size}_vol_max'] = window.max()
        new_df[f'{window_size}_vol_min'] = window.min()
        new_df[f'{window_size}_vol_max_min'] = new_df[f'{window_size}_vol_max'] - new_df[f'{window_size}_vol_min']

    # Change of each raw signal over the last i days.
    for i in range(1, 8):
        new_df[[f'diff_{i}_volume', f'diff_{i}_price', f'diff_{i}_brand']] = new_df[['volume', 'price', 'brand']].diff(i)

    # Volume observed i days earlier.
    for i in range(1, 8):
        new_df[f'lagging_{i}_volume'] = new_df['volume'].shift(i)

    # One target column per forecast horizon: the volume i days ahead.
    for i in range(1, 22):
        new_df[f'label_{i}'] = new_df['volume'].shift(-i)

    return new_df

In [None]:
from tqdm import tqdm

total_preds = pd.DataFrame()

# Forecast every product independently. For each ID one LightGBM model is fitted
# per horizon (label_1 ... label_21) on that product's own history, and the 21
# predictions are written into the matching row of the submission frame.
for product_id in tqdm(train['ID'].unique()):
    tmp = train[train['ID'] == product_id]
    tmp_sales = sales[sales['ID'] == product_id]
    tmp_brand = brand[brand['브랜드'] == tmp['브랜드'].values[0]]

    # Turn the wide one-row-per-product layout into one row per day.
    # In train the daily columns start at position 6 (after the six id columns);
    # assumes sales shares that layout and brand has one id column - TODO confirm.
    new_df = pd.DataFrame(tmp.iloc[:, 6:].T)
    new_df['sales'] = tmp_sales.iloc[:, 6:].T
    new_df['brand'] = tmp_brand.iloc[:, 1:].T
    new_df = new_df.reset_index()
    new_df.columns = ['date', 'volume', 'price', 'brand']

    new_df = make_dataset(new_df)

    new_df = new_df.drop(columns=['date'])

    # Skip the first 13 rows (the 14-day rolling features are NaN there) and
    # hold out the last observed day as the row to forecast from.
    train_df = new_df.iloc[13:-1].reset_index(drop=True)
    test_df = new_df.iloc[-1:].reset_index(drop=True)

    label_cols = train_df.filter(regex='label').columns
    train_x = train_df.drop(columns=label_cols)
    test_x = test_df[train_x.columns]
    last_volume = test_x['volume'].to_numpy()

    preds = []

    for label in label_cols:
        train_y = train_df[label]

        # Rows near the end of the history have no value this far ahead.
        has_target = train_y.notna()
        x_train = train_x[has_target]

        # Fit on the CHANGE relative to the current volume. The prediction below
        # adds the last observed volume back, so fitting on the absolute future
        # volume (as before) would count the current level twice.
        y_train = train_y[has_target] - x_train['volume']

        model = LGBMRegressor(random_state=seed)

        model.fit(x_train, y_train)
        preds.append(last_volume + model.predict(test_x))

    # Sales cannot be negative and the submission expects whole units.
    preds = np.round(np.clip(np.array(preds), 0, None)).reshape(-1,)
    sub.loc[sub['ID'] == product_id, '2023-04-05':] = preds

100%|██████████| 15890/15890 [4:34:40<00:00,  1.04s/it]


In [None]:
# Write the completed submission without the index column, then preview the
# first rows as the cell output.
submission_path = './submit.csv'
sub.to_csv(submission_path, index=False)
sub.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
1,1,0,1,1,1,1,1,1,1,2,...,1,1,1,1,1,0,1,1,1,1
2,2,0,1,1,6,4,1,2,4,1,...,4,1,1,1,1,0,1,2,0,0
3,3,0,1,5,6,11,14,10,11,0,...,2,1,0,2,1,0,0,0,0,1
4,4,0,0,0,0,0,1,0,0,1,...,1,0,0,1,2,5,1,3,0,0


In [None]:
# Sanity check: total number of missing cells left in the submission.
sub.isna().sum().sum()

0

In [None]:
# Sanity check: total predicted volume over the products whose entire training
# history sums to zero.
# NOTE(review): the train index labels are used as row positions in `sub`, which
# relies on both frames having a default index and the same row order - confirm.
never_sold = train.iloc[:, 6:].sum(axis=1).eq(0)
ids = train.index[never_sold.to_numpy()]
sub.iloc[ids, 1:].sum(axis=1).sum()

0