In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('cut_market_info.csv')
data

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,month,dcoilwtico,city,state,type,cluster,family,class,perishable,holiday_type,locale,locale_name,description,transferred
0,2016-10-26,6,1489881,6.000,1.0,2016-10,48.75,Quito,Pichincha,D,13,PRODUCE,2018,1,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
1,2014-06-02,9,953609,7.000,0.0,2014-06,103.07,Quito,Pichincha,B,6,CLEANING,3046,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
2,2017-06-18,21,1946155,5.000,0.0,2017-06,44.73,Santo Domingo,Santo Domingo de los Tsachilas,B,6,DELI,2650,1,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
3,2016-07-09,46,119023,6.000,0.0,2016-07,45.37,Quito,Pichincha,A,14,CLEANING,3026,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
4,2016-08-04,43,760319,7.707,0.0,2016-08,41.92,Esmeraldas,Esmeraldas,E,10,POULTRY,2416,1,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191917,2015-08-25,17,220432,10.000,0.0,2015-08,39.15,Quito,Pichincha,C,12,GROCERY I,1080,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
191918,2014-07-03,30,890375,5.000,0.0,2014-07,104.76,Guayaquil,Guayas,C,3,GROCERY I,1002,0,Holiday,Local,El Carmen,Cantonizacion de El Carmen,Holiday
191919,2014-08-13,44,795610,38.000,0.0,2014-08,97.57,Quito,Pichincha,A,5,FROZEN FOODS,2220,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay
191920,2013-10-01,34,213788,3.000,,2013-10,102.09,Guayaquil,Guayas,B,6,GROCERY I,1096,0,WorkingDay,WorkingDay,WorkingDay,WorkingDay,WorkingDay


In [6]:
X = data.drop('unit_sales', axis = 1)
y = data['unit_sales']

#msle
y = y.apply(np.log1p)

X_train, X_test = X[X['date']<'2017-06-01'], X[X['date']>='2017-06-01']
y_train, y_test = y[y.index.isin(X_train.index)], y[y.index.isin(X_test.index)]

# уберем дату

X = X.drop('date', axis = 1)
X_train = X_train.drop('date', axis = 1)
X_test = X_test.drop('date', axis = 1)

object_cols = ['store_nbr', 'item_nbr', 'onpromotion', 'month',
               'city', 'state', 'type', 'cluster', 'family',
               'class', 'perishable', 'holiday_type', 'locale',
               'locale_name', 'description', 'transferred']

X[object_cols] = X[object_cols].astype(object)
X_train[object_cols] = X_train[object_cols].astype(object)
X_test[object_cols] = X_test[object_cols].astype(object)

In [7]:
from sklearn.model_selection import TimeSeriesSplit
splitter = TimeSeriesSplit(n_splits = 3)

In [8]:
!pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.8.1


In [9]:
# Запишем колонки для трансформации
cols_for_ohe = [x for x in X_train.columns if X_train[x].nunique()<5]
cols_for_mte = [x for x in X_train.columns if X_train[x].nunique()>=5]
numeric_cols = list(X_train.select_dtypes(exclude='object').columns)


# Запишем индексы этих колонок
cols_for_ohe_idx = [list(X_train.columns).index(col) for col in cols_for_ohe]
cols_for_mte_idx = [list(X_train.columns).index(col) for col in cols_for_mte]
numeric_cols_idx = [list(X_train.columns).index(col) for col in numeric_cols]

In [10]:
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Трансформации
t = [('OneHotEncoder',  OneHotEncoder(), cols_for_ohe_idx),
    ('MeanTargetEncoder',  TargetEncoder(), cols_for_mte_idx),
    ('StandardScaler',  StandardScaler(), numeric_cols_idx)]

col_transform = ColumnTransformer(transformers = t)

In [12]:
col_transform.fit(X_train, y_train)

In [13]:
col_transform.transform(X_train)

array([[ 1.        ,  0.        ,  0.        , ...,  1.74964584,
         1.74964584, -0.56429425],
       [ 0.        ,  1.        ,  0.        , ...,  1.74964584,
         1.74964584,  1.67987008],
       [ 0.        ,  1.        ,  0.        , ...,  1.74964584,
         1.74964584, -0.70393481],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  1.74964584,
         1.74964584,  1.45264431],
       [ 0.        ,  0.        ,  1.        , ...,  1.74964584,
         1.74964584,  1.63938258],
       [ 0.        ,  1.        ,  0.        , ...,  1.74964584,
         1.74964584, -0.63204884]])

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

pipe = Pipeline([('custom_transformer', col_transform),
                ('decision_tree', DecisionTreeRegressor())])
pipe.fit(X_train, y_train)

In [16]:
train_preds = pipe.predict(X_train)
test_preds = pipe.predict(X_test)

print(f'Качество на трейне: {round(np.mean((y_train - train_preds)**2),3)}')
print(f'Качество на тесте: {round(np.mean((y_test - test_preds)**2),3)}')

Качество на трейне: 0.0
Качество на тесте: 0.844
