----
# I. Data preprocessing

## 1. Read dataset

In [26]:
import pandas as pd

train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
submission = pd.read_csv("dataset/sample_submission.csv")

## 2. Missing values handling

### a) `train.csv`

In [27]:
def means(df):
    """Return a dict mapping `Promotion1`..`Promotion5` to each column's mean in `df`."""
    promo_cols = [f"Promotion{n}" for n in range(1, 6)]
    return {col: df[col].mean() for col in promo_cols}


def medians(df):
    """Return a dict mapping `Promotion1`..`Promotion5` to each column's median in `df`."""
    promo_cols = [f"Promotion{n}" for n in range(1, 6)]
    return {col: df[col].median() for col in promo_cols}


# Replace every missing value in the training frame with 0.
train = train.fillna(value=0)

### b) `test.csv`

In [28]:
# Fill missing promotion values in the test frame with the mean of the same
# column, computed on the test frame itself.
promotion_means = means(test)
test = test.fillna(value=promotion_means)

## 3. Data manipulation

### a) `Date`

In [29]:
import datetime as dt

def date_to_week(date):
    """Return the number of whole weeks between 2010-02-05 and `date`.

    `date` is a "day/month/year" string. Dates before 2010-02-05 give a
    negative result (floor division).
    """
    day, month, year = (int(part) for part in date.split('/'))
    elapsed = dt.datetime(year, month, day) - dt.datetime(2010, 2, 5)
    return elapsed.days // 7

def date_split(date):
    """Split a '/'-separated date string into a tuple of its integer fields."""
    return tuple(int(field) for field in date.split('/'))


# Derive the same calendar features for both frames: the week index returned
# by date_to_week plus the three integer fields of the date string.
for frame in (train, test):
    frame['Week'] = frame['Date'].apply(date_to_week)
    date_parts = frame["Date"].apply(date_split)
    frame["Day"] = date_parts.apply(lambda parts: parts[0])
    frame["Month"] = date_parts.apply(lambda parts: parts[1])
    frame["Year"] = date_parts.apply(lambda parts: parts[2])

### b) `IsHoliday`

In [30]:
# Convert the holiday flag to an integer in both the train and test frames.
for frame in (train, test):
    frame["IsHoliday"] = frame["IsHoliday"].apply(int)

### c) `Promotion1`, ... , `Promotion5`

In [31]:
# Scaling
from sklearn.preprocessing import RobustScaler


promos = [f"Promotion{i}" for i in range(1, 6)]

# Fit the scaler on the training data only, then apply that same fitted
# scaling to the promotion columns of both frames.
scaler = RobustScaler().fit(train[promos])
for frame in (train, test):
    frame[promos] = scaler.transform(frame[promos])

### d) Log-scaling the target (`Weekly_Sales`)

In [32]:
import numpy as np


# Model log(1 + sales) instead of raw sales; predictions are mapped back
# with np.expm1 in the modeling section.
log_sales = np.log1p(train["Weekly_Sales"])
train["Weekly_Sales"] = log_sales

## 4. Remove unused features

In [33]:
# Columns that are not used as model inputs.
removes = ['id', 'Date', 'Temperature']

train, test = (frame.drop(columns=removes) for frame in (train, test))

## 5. Divide by Store-trends

In [34]:
def divide(df, stores):
    """Return the rows of `df` that belong to the given stores.

    Rows are grouped in the order the ids appear in `stores`; within one
    store the original row order and index labels are kept. The result is
    built from copies, so modifying it does not touch `df`.

    Args:
        df: DataFrame with a `Store` column.
        stores: iterable of store ids to select.

    Returns:
        A new DataFrame; an empty DataFrame when `stores` is empty.
    """
    parts = [df[df["Store"] == store].copy() for store in stores]
    if not parts:
        # pd.concat raises on an empty list; keep the original empty result.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop: linear
    # instead of quadratic, and no concat against an empty seed frame.
    return pd.concat(parts)


def remove(df, stores):
    """Return a copy of `df` without the rows whose `Store` is in `stores`.

    Rows are filtered by their `Store` value rather than dropped by index
    label, so rows of other stores survive even when index labels repeat.
    Row order and index labels of the remaining rows are unchanged.

    Args:
        df: DataFrame with a `Store` column.
        stores: iterable of store ids to exclude.

    Returns:
        A new DataFrame; `df` itself is not modified.
    """
    keep = ~df["Store"].isin(list(stores))
    return df[keep].copy()
    

def store_one_hot(df):
    """Return `df` with the `Store` column replaced by one-hot `Store_<id>` columns."""
    encoded = pd.get_dummies(df, columns=["Store"])
    return encoded
#####################################

# Column layouts of the per-store datasets: store 36 (A) keeps only the date
# features, the other separated stores (B-F) additionally keep IsHoliday.
date_cols = ['Day', 'Month', 'Week']
holiday_cols = date_cols + ['IsHoliday']
target_cols = date_cols + ['Weekly_Sales']
target_holiday_cols = target_cols + ['IsHoliday']

# Each pair of lines moves one store into its own frames and leaves the
# remaining stores in `train` / `test`. Both right-hand expressions are
# evaluated before either name is rebound.

# A: store 36
trainA, train = divide(train, [36])[target_cols], remove(train, [36])
testA, test = divide(test, [36])[date_cols], remove(test, [36])

# B: store 38
trainB, train = divide(train, [38])[target_holiday_cols], remove(train, [38])
testB, test = divide(test, [38])[holiday_cols], remove(test, [38])

# C: store 33
trainC, train = divide(train, [33])[target_holiday_cols], remove(train, [33])
testC, test = divide(test, [33])[holiday_cols], remove(test, [33])

# D: store 42
trainD, train = divide(train, [42])[target_holiday_cols], remove(train, [42])
testD, test = divide(test, [42])[holiday_cols], remove(test, [42])

# E: store 43
trainE, train = divide(train, [43])[target_holiday_cols], remove(train, [43])
testE, test = divide(test, [43])[holiday_cols], remove(test, [43])

# F: store 44
trainF, train = divide(train, [44])[target_holiday_cols], remove(train, [44])
testF, test = divide(test, [44])[holiday_cols], remove(test, [44])

# One-hot encode the store id for the stores that stay in the shared frames.
train = store_one_hot(train)
test = store_one_hot(test)

----
# II. Modeling

## 1. Split the `train.csv` data into input features and the prediction target

In [35]:
def split_pred(df):
    """Split `df` into (features, target), where the target is `Weekly_Sales`.

    Returns a tuple of a DataFrame holding every other column and the
    `Weekly_Sales` Series.
    """
    target = df["Weekly_Sales"]
    features = df.drop(columns=["Weekly_Sales"])
    return features, target
    

# Split the shared frame and each per-store frame into features and target.
(
    (x_train, y_train),
    (x_trainA, y_trainA),
    (x_trainB, y_trainB),
    (x_trainC, y_trainC),
    (x_trainD, y_trainD),
    (x_trainE, y_trainE),
    (x_trainF, y_trainF),
) = map(split_pred, (train, trainA, trainB, trainC, trainD, trainE, trainF))

## 2. Modeling for each store

In [36]:
# Maps a model label to its full sequence of predictions; the submission
# cell writes one CSV per entry.
predictions = dict()

### a) XGBoost

In [37]:
import time
import xgboost as xgb
from sklearn.linear_model import LinearRegression as linear


start_t = time.time()

# Number of test rows per store assumed by the positional layout in `unify`.
# NOTE(review): this relies on the test set being ordered by store id with
# exactly this many rows for every store - confirm against test.csv.
ROWS_PER_STORE = 4


def fit_predict(estimator, x, y, x_test):
    """Fit `estimator` on (x, y) and return its predictions for `x_test`.

    The target was log1p-transformed during preprocessing, so predictions
    are mapped back to the original scale with expm1.
    """
    estimator.fit(x, y)
    return np.expm1(estimator.predict(x_test))


def unify(pred, members, rows_per_store=ROWS_PER_STORE):
    """Copy per-store predictions into their slots of the global `y_pred`.

    Args:
        pred: `rows_per_store` consecutive predictions for each store in
            `members`, in the same order as `members`.
        members: store ids; store `k` occupies rows
            `(k-1)*rows_per_store` up to (excluding) `k*rows_per_store`.
        rows_per_store: number of rows each store owns in `y_pred`.

    Raises:
        ValueError: if `pred` has the wrong length or a store's slot lies
            outside `y_pred`. Without these checks a mismatched slice
            assignment could silently misplace or resize the output.
    """
    pred = np.asarray(pred)
    expected = rows_per_store * len(members)
    if len(pred) != expected:
        raise ValueError(
            f"expected {expected} predictions for stores {list(members)}, got {len(pred)}"
        )
    for i, k in enumerate(members):
        lo, hi = (k - 1) * rows_per_store, k * rows_per_store
        if lo < 0 or hi > len(y_pred):
            raise ValueError(f"store {k} does not fit into {len(y_pred)} output rows")
        y_pred[lo:hi] = pred[i * rows_per_store:(i + 1) * rows_per_store]


# Shared model for every store that was not separated out.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=4,
    n_estimators=1000,
)
pred = fit_predict(model, x_train, y_train, test)

# Separated stores: A and B use a linear model, C-F a default XGBoost model.
predA = fit_predict(linear(), x_trainA, y_trainA, testA)
print(predA)

predB = fit_predict(linear(), x_trainB, y_trainB, testB)
print(predB)

predC = fit_predict(xgb.XGBRegressor(), x_trainC, y_trainC, testC)
print(predC)

predD = fit_predict(xgb.XGBRegressor(), x_trainD, y_trainD, testD)
print(predD)

predE = fit_predict(xgb.XGBRegressor(), x_trainE, y_trainE, testE)
print(predE)

predF = fit_predict(xgb.XGBRegressor(), x_trainF, y_trainF, testF)
print(predF)


# One slot per submission row. NaN marks "not filled yet", so a legitimate
# prediction of 0 can never be mistaken for an empty slot.
y_pred = np.full(len(submission), np.nan)
unify(predA, [36])
unify(predB, [38])
unify(predC, [33])
unify(predD, [42])
unify(predE, [43])
unify(predF, [44])

# Every remaining slot belongs to the shared model, in row order.
remaining = np.isnan(y_pred)
if int(remaining.sum()) != len(pred):
    raise ValueError(
        f"{int(remaining.sum())} unfilled rows but {len(pred)} shared-model predictions"
    )
y_pred[remaining] = pred

# Store a plain list, matching what the submission cell has always received.
y_pred = y_pred.tolist()
predictions["XGBoost"] = y_pred

print(f"[XGBoost] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{y_pred[0]}, {y_pred[1]}, ... , {y_pred[-1]}")

[290243.55912224 285468.64658678 280772.28777095 276153.19036063]
[465268.39673659 449458.90492457 434186.60733401 419433.25052358]
[262282.03 279813.06 258359.08 243765.47]
[596982.9  597954.4  528412.94 511566.62]
[637221.94 630554.1  618927.4  588358.44]
[350392.9  346539.66 341187.88 349898.03]
[XGBoost] => time: 2.26(sec)
1658464.75, 1550350.25, ... , 769050.125


----
# III. Submission

In [38]:
import os


def name(integer):
    """Return `integer` as a string, left-padded with zeros to at least two characters."""
    text = str(integer)
    return text.zfill(2)


savetime = dt.datetime.now()
# Folder is the date (YYYY-MM-DD); the file prefix is the time of day.
# NOTE(review): the separator is a full-width colon, presumably because a
# plain ':' is not allowed in file names on some systems - confirm.
folder = "-".join(name(part) for part in (savetime.year, savetime.month, savetime.day))
sub_folder = '：'.join(name(part) for part in (savetime.hour, savetime.minute, savetime.second))

# Write one CSV per entry in `predictions`, named after the model label.
for model, values in predictions.items():
    submission["Weekly_Sales"] = values
    out_dir = f"dataset/submissions/{folder}"
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(f"{out_dir}/{sub_folder} ({model}).csv", index=False)
print(f"[Done!] Create {len(predictions)} csv file(s)")

[Done!] Create 1 csv file(s)
