----
# I. Data preprocessing

## 1. Read dataset

In [49]:
import pandas as pd

train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
submission = pd.read_csv("dataset/sample_submission.csv")

## 2. Missing values handling

### a) `train.csv`

In [50]:
def means(df):
    res = {}
    for i in range(1, 6):
        res[f"Promotion{i}"] = df[f"Promotion{i}"].mean()
    print(res)
    return res


def medians(df):
    res = {}
    for i in range(1, 6):
        res[f"Promotion{i}"] = df[f"Promotion{i}"].median()
    print(res)
    return res


train = train.fillna(0)
#train = train.fillna(value=means(train))
#train = train.fillna(value=medians(train))

train

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales
0,1,1,05/02/2010,42.31,2.572,0.00,0.00,0.00,0.00,0.00,8.106,False,1643690.90
1,2,1,12/02/2010,38.51,2.548,0.00,0.00,0.00,0.00,0.00,8.106,True,1641957.44
2,3,1,19/02/2010,39.93,2.514,0.00,0.00,0.00,0.00,0.00,8.106,False,1611968.17
3,4,1,26/02/2010,46.63,2.561,0.00,0.00,0.00,0.00,0.00,8.106,False,1409727.59
4,5,1,05/03/2010,46.50,2.625,0.00,0.00,0.00,0.00,0.00,8.106,False,1554806.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,6251,45,31/08/2012,75.09,3.867,23641.30,6.00,92.93,6988.31,3992.13,8.684,False,734297.87
6251,6252,45,07/09/2012,75.70,3.911,11024.45,12.80,52.63,1854.77,2055.70,8.684,True,766512.66
6252,6253,45,14/09/2012,67.87,3.948,11407.95,0.00,4.30,3421.72,5268.92,8.684,False,702238.27
6253,6254,45,21/09/2012,65.32,4.038,8452.20,92.28,63.24,2376.38,8670.40,8.684,False,723086.20


### b) `test.csv`

In [51]:
#test = test.fillna(0)
test = test.fillna(value=means(test))
#test = test.fillna(value=medians(test)) - Almost Same with means()

{'Promotion1': 3308.837584269663, 'Promotion2': 64.67199999999998, 'Promotion3': 30.557888198757773, 'Promotion4': 1284.3400684931505, 'Promotion5': 3451.4237777777807}


## 3. Data manipulation

### a) `Date`

In [52]:
import datetime as dt

def date_to_week(date):
    day, month, year = map(int, date.split('/'))
    t = dt.datetime(year, month, day) - dt.datetime(2010, 2, 5)
    return t.days // 7


def date_split(date):
    return tuple(map(int, date.split('/')))


# train
#train["Week"] = train["Date"].apply(date_to_week)
train["Day"] = train["Date"].apply(lambda x: date_split(x)[0])
train["Month"] = train["Date"].apply(lambda x: date_split(x)[1])
train["Year"] = train["Date"].apply(lambda x: date_split(x)[2])


# test
#test["Week"] = test["Date"].apply(date_to_week)
test["Day"] = test["Date"].apply(lambda x: date_split(x)[0])
test["Month"] = test["Date"].apply(lambda x: date_split(x)[1])
test["Year"] = test["Date"].apply(lambda x: date_split(x)[2])

### b) `IsHoliday`

In [53]:
# trian
train["IsHoliday"] = train["IsHoliday"].apply(int)

# test
test["IsHoliday"] = test["IsHoliday"].apply(int)

### c) `Store`

In [54]:
# train
train = pd.get_dummies(data=train, columns=["Store"])

# test
test = pd.get_dummies(data=test, columns=["Store"])

### d) `Promotion1`, ... , `Promotion5`

In [55]:
# Scaling
from sklearn.preprocessing import RobustScaler


promos = ['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']

scaler = RobustScaler()
scaler.fit(train[promos])
train[promos] = scaler.transform(train[promos])
test[promos] = scaler.transform(test[promos])

### e) `Weekly_Sales` 

In [56]:
import numpy as np

train["Weekly_Sales"] = np.log1p(train["Weekly_Sales"])

## 4. Add new features

In [57]:
social_features = ["Temperature"]
dummy = train[social_features].copy()

## 4. Remove not using features

In [9]:
train = train.drop(columns=['id', 'Date'])
train = train.drop(columns=social_features)

test = test.drop(columns=['id', 'Date'])
test = test.drop(columns=social_features)

In [10]:
train

Unnamed: 0,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales,Day,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,2.572,0.000000,0.0,0.000000,0.000000,0.000000,8.106,0,14.312455,5,...,0,0,0,0,0,0,0,0,0,0
1,2.548,0.000000,0.0,0.000000,0.000000,0.000000,8.106,1,14.311400,12,...,0,0,0,0,0,0,0,0,0,0
2,2.514,0.000000,0.0,0.000000,0.000000,0.000000,8.106,0,14.292967,19,...,0,0,0,0,0,0,0,0,0,0
3,2.561,0.000000,0.0,0.000000,0.000000,0.000000,8.106,0,14.158908,26,...,0,0,0,0,0,0,0,0,0,0
4,2.625,0.000000,0.0,0.000000,0.000000,0.000000,8.106,0,14.256862,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,3.867,12.335020,60.0,33.248658,36.566951,2.226726,8.684,0,13.506671,31,...,0,0,0,0,0,0,0,0,0,1
6251,3.911,5.752087,128.0,18.830054,9.705248,1.146626,8.684,1,13.549608,7,...,0,0,0,0,0,0,0,0,0,1
6252,3.948,5.952181,0.0,1.538462,17.904453,2.938893,8.684,0,13.462029,14,...,0,0,0,0,0,0,0,0,0,1
6253,4.038,4.409997,922.8,22.626118,12.434619,4.836166,8.684,0,13.491285,21,...,0,0,0,0,0,0,0,0,0,1


In [11]:
test

Unnamed: 0,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Day,Month,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,3.617,4.214698,646.72,6.518784,18.928523,2.022584,6.573,0,5,10,...,0,0,0,0,0,0,0,0,0,0
1,3.601,1.088480,646.72,2.901610,3.151902,3.305649,6.573,0,12,10,...,0,0,0,0,0,0,0,0,0,0
2,3.594,0.495842,646.72,1.763864,0.419915,1.290059,6.573,0,19,10,...,0,0,0,0,0,0,0,0,0,0
3,3.506,1.349186,317.50,2.146691,5.531683,0.727907,6.573,0,26,10,...,0,0,0,0,0,0,0,0,0,0
4,3.617,3.150245,646.72,3.592129,15.840982,2.149345,6.170,0,5,10,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,3.755,0.020166,26.10,0.350626,6.720423,0.255318,5.217,0,26,10,...,0,0,0,0,0,0,0,0,1,0
176,3.985,2.633173,646.72,6.733453,11.791272,1.305208,8.667,0,5,10,...,0,0,0,0,0,0,0,0,0,1
177,4.000,1.020703,646.72,2.822898,3.135995,2.225839,8.667,0,12,10,...,0,0,0,0,0,0,0,0,0,1
178,3.969,1.045612,646.72,1.137746,2.290461,0.857580,8.667,0,19,10,...,0,0,0,0,0,0,0,0,0,1


----
# II. Modeling

## 1. Divide `train.csv` into training data and for predicting data

In [12]:
x_train = train.drop(columns=["Weekly_Sales"])
y_train = train["Weekly_Sales"]

## 2. Choose a suitable model

In [13]:
predictions = {}

### a) XGBoost

In [17]:
import time
import xgboost as xgb
from matplotlib import pyplot as plt


start_t = time.time()


model = xgb.XGBRegressor(
        objective='reg:squarederror',
        max_depth = 4,#int(5.80783356446797),
        learning_rate = 0.14127347721138503,
        n_estimators = 1000,#int(555.642549346457),
        min_child_weight = 1.7299171552396175,
        subsample = 0.5246698466409756,
        colsample_bytree = 0.8635613858768121, nthread = -1)

#model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.1, max_depth = 4, n_estimators = 1000)

model.fit(x_train, y_train)
#plt.rcParams["figure.figsize"] = (15,15)
#xgb.plot_importance(model)
#plt.show()

prediction = np.expm1(model.predict(test))
predictions["XGBoost"] = prediction

print(f"[XGBoost] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ... , {prediction[-1]}")

[XGBoost] => time: 2.2(sec)
1682026.125, 1520418.875, ... , 789814.4375


----
# III. Submission

In [15]:
import os

def name(integer):
    return str(integer).zfill(2)
    
savetime = dt.datetime.now()
folder = "-".join(map(name, [savetime.year, savetime.month, savetime.day]))
sub_folder = name(savetime.hour) + '：' + name(savetime.minute) + '：' + name(savetime.second)

for model in predictions:
    submission["Weekly_Sales"] = predictions[model]
    os.makedirs(f"dataset/submissions/{folder}", exist_ok=True)
    submission.to_csv(f"dataset/submissions/{folder}/{sub_folder+' ('+model+')'}.csv", index=False)