In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

In [134]:
data = pd.read_csv('data/train.csv', parse_dates=["date"], index_col="id")
stores = pd.read_csv('data/stores.csv')
oil = pd.read_csv('data/oil.csv', parse_dates=["date"], index_col="date")
holidays = pd.read_csv('data/holidays_events.csv', parse_dates=["date"])
transactions = pd.read_csv('data/transactions.csv', parse_dates=["date"])

In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000888 entries, 0 to 3000887
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   store_nbr    int64         
 2   family       object        
 3   sales        float64       
 4   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 137.4+ MB


# Baseline Models

In [54]:
pipe = make_pipeline(StandardScaler(), LinearRegression())

X = data[["date", "store_nbr", "onpromotion"]]
X.date = X.date.astype(np.int64)
y = np.log(data["sales"] + 1)

pipe.fit(X, y)
mean_squared_error(y, pipe.predict(X)) ** 0.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.date = X.date.astype(np.int64)


2.521559304103492

In [120]:
fam = pd.get_dummies(data.family).astype(np.int8)
dow = pd.get_dummies(data.date.dt.dayofweek, prefix="dow", drop_first=True).astype(np.int8)
moy = pd.get_dummies(data.date.dt.month, prefix="moy", drop_first=True).astype(np.int8)
year = data.date.dt.year.rename("year")
store = pd.get_dummies(data.store_nbr, prefix="store", drop_first=True).astype(np.int8)

X = pd.concat([fam, dow, moy, year, store], axis=1)
X["promo"] = data.onpromotion.astype(pd.SparseDtype(int, 0))

X.head()

Unnamed: 0_level_0,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,store_46,store_47,store_48,store_49,store_50,store_51,store_52,store_53,store_54,promo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000888 entries, 0 to 3000887
Columns: 105 entries, AUTOMOTIVE to promo
dtypes: Sparse[int64, 0](104), int32(1)
memory usage: 170.0 MB


In [84]:
lin_pipe = make_pipeline(StandardScaler(), LinearRegression())
lin_pipe.fit(X, y)
mean_squared_error(y, lin_pipe.predict(X)) ** 0.5



1.2805653570193123

In [85]:
{k: v for k, v in zip(X.columns, lin_pipe[1].coef_)}

{'AUTOMOTIVE': 41834550738.34978,
 'BABY CARE': 41834550738.08374,
 'BEAUTY': 41834550738.27025,
 'BEVERAGES': 41834550739.23952,
 'BOOKS': 41834550738.07938,
 'BREAD/BAKERY': 41834550739.0064,
 'CELEBRATION': 41834550738.3084,
 'CLEANING': 41834550739.1454,
 'DAIRY': 41834550739.04938,
 'DELI': 41834550738.91988,
 'EGGS': 41834550738.84508,
 'FROZEN FOODS': 41834550738.78463,
 'GROCERY I': 41834550739.30778,
 'GROCERY II': 41834550738.484886,
 'HARDWARE': 41834550738.16815,
 'HOME AND KITCHEN I': 41834550738.42145,
 'HOME AND KITCHEN II': 41834550738.39581,
 'HOME APPLIANCES': 41834550738.11501,
 'HOME CARE': 41834550738.62818,
 'LADIESWEAR': 41834550738.248856,
 'LAWN AND GARDEN': 41834550738.26497,
 'LINGERIE': 41834550738.355515,
 'LIQUOR,WINE,BEER': 41834550738.650246,
 'MAGAZINES': 41834550738.19885,
 'MEATS': 41834550738.942215,
 'PERSONAL CARE': 41834550738.93134,
 'PET SUPPLIES': 41834550738.219666,
 'PLAYERS AND ELECTRONICS': 41834550738.277054,
 'POULTRY': 41834550738.93394,

In [81]:
pipe = make_pipeline(MinMaxScaler(), XGBRegressor())
pipe.fit(X, y)
mean_squared_error(y, pipe.predict(X)) ** 0.5



0.7801639258117617

# Feature Augmentation

In [None]:
class Dummify:
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.get_dummies(X, columns=self.cols, drop_first=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1703 entries, 0 to 1702
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype        
---  ------      --------------  -----        
 0   date        1703 non-null   datetime64[s]
 1   dcoilwtico  1703 non-null   float64      
dtypes: datetime64[s](1), float64(1)
memory usage: 26.7 KB


  oil = oil.join(oil_raw, on="date", how="left").fillna(method="bfill")


In [139]:
less.join(oil, on="date")

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion,dcoilwtico
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1090782,2014-09-06,15,AUTOMOTIVE,6.00,0,
1471740,2015-04-08,53,CELEBRATION,0.00,0,50.44
2942208,2017-07-14,12,PLAYERS AND ELECTRONICS,3.00,0,46.53
1080613,2014-08-31,29,POULTRY,0.00,0,
896300,2014-05-19,8,LAWN AND GARDEN,16.00,0,102.95
...,...,...,...,...,...,...
2126104,2016-04-11,14,GROCERY II,9.00,0,40.46
1761862,2015-09-18,43,PERSONAL CARE,167.00,0,44.71
859255,2014-04-29,19,BABY CARE,0.00,0,101.56
2127711,2016-04-12,1,BEVERAGES,1745.00,35,42.12


In [140]:
oil.index.min()

Timestamp('2013-01-01 00:00:00')