----
# I. Data preprocessing

## 1. Read dataset

In [14]:
import pandas as pd

# Load the three CSV files (relative paths) in one pass:
# training data, test data, and the sample submission template.
train, test, submission = map(
    pd.read_csv,
    (
        "../dataset/train.csv",
        "../dataset/test.csv",
        "../dataset/sample_submission.csv",
    ),
)

## 2. Missing values handling

### a) `train.csv`

In [15]:
def means(df):
    """Return the mean of each `Promotion1` .. `Promotion5` column of `df`.

    The result is a dict keyed by column name, which is the shape
    `DataFrame.fillna(value=...)` accepts for per-column fill values.
    """
    columns = (f"Promotion{i}" for i in range(1, 6))
    return {column: df[column].mean() for column in columns}


def medians(df):
    """Return the median of each `Promotion1` .. `Promotion5` column of `df`.

    The result is a dict keyed by column name, which is the shape
    `DataFrame.fillna(value=...)` accepts for per-column fill values.
    """
    columns = (f"Promotion{i}" for i in range(1, 6))
    return {column: df[column].median() for column in columns}


# Replace every missing value in train (in all columns) with 0.
train = train.fillna(value=0)

### b) `test.csv`

In [16]:
# Fill missing Promotion1..5 values in test with that column's mean over test;
# columns not named in the dict are left as they are.
# NOTE(review): train is filled with 0 instead - confirm the asymmetry is intended.
test = test.fillna(means(test))

## 3. Data manipulation

### a) `Date`

In [17]:
import datetime as dt

def date_to_week(date):
    """Convert a `day/month/year` string into a week offset from 2010-02-05.

    Returns the number of whole 7-day periods between 5 Feb 2010 and `date`.
    Floor division is used, so dates before the origin give negative values.
    """
    day, month, year = (int(part) for part in date.split('/'))
    elapsed = dt.datetime(year, month, day) - dt.datetime(2010, 2, 5)
    return elapsed.days // 7


def date_split(date):
    """Split a `/`-separated date string into a tuple of ints, in the order written."""
    return tuple(int(part) for part in date.split('/'))


# Derive Week / Day / Month / Year columns from the raw `Date` string,
# applying the same steps to train and test.
for frame in (train, test):
    frame['Week'] = frame['Date'].apply(date_to_week)
    # date_split returns the date components in written order; one column per position.
    for position, column in enumerate(("Day", "Month", "Year")):
        frame[column] = frame["Date"].apply(
            lambda x, position=position: date_split(x)[position]
        )

### b) `IsHoliday`

In [18]:
# Convert `IsHoliday` to Python ints on both train and test
# (presumably a True/False flag in the CSV - confirm).
for frame in (train, test):
    frame["IsHoliday"] = frame["IsHoliday"].apply(int)

### c) `Promotion1`, ... , `Promotion5`

In [19]:
# Scaling
from sklearn.preprocessing import RobustScaler


promos = ['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']

# Fit the scaler on the training promotions only, then apply that same
# fitted transform to train and test so both share one scale.
scaler = RobustScaler().fit(train[promos])
for frame in (train, test):
    frame[promos] = scaler.transform(frame[promos])

### d) Log-transform the target `Weekly_Sales`

In [20]:
import numpy as np


# Put the target on a log scale: log1p(x) = log(1 + x).
# Not idempotent: re-running this cell applies the transform a second time.
train["Weekly_Sales"] = train["Weekly_Sales"].pipe(np.log1p)

## 4. Remove unused features

In [21]:
# Columns dropped from both frames before modeling.
removes = ['id','Date','Temperature']

train, test = (frame.drop(columns=removes) for frame in (train, test))

## 5. Divide by Store-trends

In [22]:
def divide(df, stores):
    """Return the rows of `df` whose `Store` value is listed in `stores`.

    Rows are grouped in the order the stores are given, and keep their
    original relative order and index labels within each store. A store
    listed more than once contributes its rows more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `Store` column.
    stores : iterable
        Store identifiers to select.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified. When `stores` is empty the
        result has no rows but keeps the columns of `df`, so later column
        selection still works.
    """
    # Collect the per-store slices first and concatenate once: growing a frame
    # with pd.concat inside the loop re-copies all accumulated rows each time.
    parts = [df[df["Store"] == store] for store in stores]
    if not parts:
        # pd.concat rejects an empty list; return an empty frame with df's schema.
        return df.iloc[0:0].copy()
    return pd.concat(parts)


# def remove(df, stores):
#     for store in stores:
#         df = df.drop(df[df["Store"] == store].index)
#     return df
    

def store_one_hot(df):
    """Return `df` with the `Store` column replaced by one-hot indicator columns.

    All other columns are kept; the indicator columns are produced by
    `pd.get_dummies` with its default naming.
    """
    encoded = pd.get_dummies(df, columns=["Store"])
    return encoded


# Store 36 gets its own small feature set (calendar features only).
trainA = divide(train, [36])[['Day','Month','Week','Weekly_Sales']]
testA = divide(test, [36])[['Day','Month','Week']]

# All-store frames: one-hot encode `Store`, then drop `Week`.
# NOTE(review): train and test are encoded separately, so their Store_* columns
# only line up if both contain the same set of stores - confirm.
train = store_one_hot(train).drop(columns=['Week'])
test = store_one_hot(test).drop(columns=['Week'])

----
# II. Modeling

## 1. Split `train.csv` into features and target (`Weekly_Sales`)

In [23]:
def split_pred(df):
    """Split `df` into features and target.

    Returns a `(x_df, y_df)` pair: `x_df` is `df` without the `Weekly_Sales`
    column, `y_df` is the `Weekly_Sales` column itself.
    """
    target = "Weekly_Sales"
    return df.drop(columns=[target]), df[target]
    

# Features/target for the all-store frame and for the store-specific frame.
(x_train, y_train), (x_trainA, y_trainA) = map(split_pred, (train, trainA))

## 2. Modeling for each store

In [24]:
# Maps a model name to its array of test-set predictions; the submission cell
# writes one CSV per entry.
predictions = dict()

### a) XGBoost

In [25]:
import time
import xgboost as xgb
from sklearn.linear_model import LinearRegression as linear


start_t = time.time()

# All-store model: gradient-boosted trees trained on the one-hot encoded frame.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=4,
    n_estimators=1000,
)
model.fit(x_train, y_train)
# The target was log1p-transformed during preprocessing; expm1 inverts that.
y_pred = np.expm1(model.predict(test))


# Store-specific model: a linear fit on the Day/Month/Week features in trainA.
model = linear()
model.fit(x_trainA, y_trainA)
pred = np.expm1(model.predict(testA))
print(pred)


# Overwrite the all-store predictions for exactly the rows testA was taken from.
# testA keeps the index labels of its source rows, so their positions in `test`
# are looked up instead of relying on the former hard-coded slice `35*4:36*4`,
# which was only correct when test held 4 rows per store in store order.
store_rows = test.index.get_indexer(testA.index)
if (store_rows < 0).any():
    raise ValueError("testA contains rows that are not present in test")
y_pred[store_rows] = pred


predictions["XGBoost"] = y_pred

print(f"[XGBoost] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{y_pred[0]}, {y_pred[1]}, ... , {y_pred[-1]}")

[290243.55912224 285468.64658678 280772.28777095 276153.19036063]
[XGBoost] => time: 2.13(sec)
1682026.125, 1520418.875, ... , 789814.4375


----
# III. Submission

In [26]:
import os


def name(integer):
    """Return `integer` as a string, left-padded with zeros to at least two characters."""
    text = str(integer)
    return text.zfill(2)


savetime = dt.datetime.now()
# `folder` is the date (year-month-day) and `sub_folder` the time of day; the
# latter is used as the file-name prefix. Its separator is the full-width colon
# U+FF1A, not ASCII ':' (presumably so the name is valid on filesystems that
# reject ':' - confirm).
folder = "-".join(name(part) for part in (savetime.year, savetime.month, savetime.day))
sub_folder = '：'.join(
    name(part) for part in (savetime.hour, savetime.minute, savetime.second)
)

# Write one CSV per model, reusing the sample-submission frame as the template.
for model, values in predictions.items():
    submission["Weekly_Sales"] = values
    os.makedirs(f"../dataset/submissions/{folder}", exist_ok=True)
    submission.to_csv(f"../dataset/submissions/{folder}/{sub_folder+' ('+model+')'}.csv", index=False)
print(f"[Done!] Create {len(predictions)} csv file(s)")

[Done!] Create 1 csv file(s)
