# Predict Future Sales in Russia

- https://www.kaggle.com/c/competitive-data-science-predict-future-sales/

### Data files
- item_categories.csv
- items.csv
- sales_train.csv
- sample_submission.csv
- shops.csv
- test.csv

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
input_prefix = 'data/'
#input_prefix = 'https://github.com/hjort/ai-labs/raw/master/jupyter/sales-russia/data/'

bzfile = '.bz2' # ''

In [57]:
dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
#nrows=300, # somente primeiras linhas!
df_train = pd.read_csv(\
    input_prefix + 'sales_train.csv' + bzfile,
    parse_dates=['date'], date_parser=dateparse,
    dtype={'date_block_num': np.int8, 'shop_id': np.int8, 'item_id': np.int16, \
           'item_price': np.float32, 'item_cnt_day': np.int16},
    )
print('shape:', df_train.shape)
df_train.head()

shape: (2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1
1,2013-01-03,0,25,2552,899.0,1
2,2013-01-05,0,25,2552,899.0,-1
3,2013-01-06,0,25,2554,1709.05,1
4,2013-01-15,0,25,2555,1099.0,1


In [36]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int8
shop_id           int8
item_id           int16
item_price        float32
item_cnt_day      int16
dtypes: float32(1), int16(2), int8(2), object(1)
memory usage: 39.2+ MB


In [37]:
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935800.0,2935800.0,2935800.0,2935800.0,2935800.0
mean,14.57,33.002,10197.0,890.85,1.2426
std,9.423,16.227,6324.3,1729.8,2.6188
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [22]:
df_train['item_price'] = df_train['item_price'].apply(lambda x: int(x * 1000)).astype(int)
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,500.0,500.0,500.0,500.0,500.0
mean,0.0,25.068,2807.1,91738.492,1.236
std,0.0,1.5205,1013.2699,79569.6557,0.9768
min,0.0,25.0,1205.0,5800.0,-1.0
25%,0.0,25.0,2439.0,34900.0,1.0
50%,0.0,25.0,2835.0,59900.0,1.0
75%,0.0,25.0,3175.0,121150.0,1.0
max,0.0,59.0,22154.0,549000.0,13.0


In [38]:
df_train['item_price2'] = df_train['item_price'] * 1000
df_train[df_train['item_price2'] - df_train['item_price2'].astype(int) > 0].head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_price2
438,02.01.2013,0,25,3186,499.1667,3,499170.0
530,09.01.2013,0,25,1471,2598.8333,3,2598800.0
2879,10.01.2013,0,24,19790,1038.8,1,1038800.0
4334,07.01.2013,0,25,13417,652.3333,3,652330.0
5647,22.01.2013,0,25,8093,1349.1666,3,1349200.0


In [46]:
#%time df_train['item_price2'] = df_train['item_price'].apply(lambda x: np.ceil(x)).astype(int)
%time df_train['item_price2'] = (np.ceil(df_train['item_price'])).astype(int)
df_train[df_train['item_price2'] - df_train['item_price'] > 0].head()

CPU times: user 76 ms, sys: 56 ms, total: 132 ms
Wall time: 206 ms


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_price2
3,06.01.2013,0,25,2554,1709.05,1,1710
26,06.01.2013,0,25,2548,1708.95,1,1709
34,12.01.2013,0,25,2715,898.5,1,899
80,25.01.2013,0,25,2857,1998.5,1,1999
88,20.01.2013,0,25,2798,448.5,1,449


## Sales (testing)

In [59]:
df_test = pd.read_csv(
    input_prefix + 'test.csv' + bzfile, #nrows=10000,
    index_col='ID')
print('shape:', df_test.shape)
df_test.head()

shape: (214200, 2)


Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [60]:
#TODO: verificar o que está sendo esperado (acumulado mensal por loja e produto?)
# item_cnt_day: number of products sold. You are predicting a monthly amount of this measure

In [61]:
for col in ['shop_id', 'item_id']:
    df_test[col] = df_test[col].astype('category')

In [62]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 2 columns):
shop_id    214200 non-null category
item_id    214200 non-null category
dtypes: category(2)
memory usage: 2.4 MB


In [63]:
df_test.describe()

Unnamed: 0,shop_id,item_id
count,214200,214200
unique,42,5100
top,59,22167
freq,5100,42


In [64]:
# criar dataframe sem quantidade de itens
df_zeroed = df_test.copy()
df_zeroed['key'] = 0
df_zeroed = pd.merge(df_zeroed,
                     pd.DataFrame({'date_block_num': np.arange(0, 34), 'key': np.zeros(34, dtype=int)}),
                     how='left', on='key').\
                set_index(['shop_id', 'item_id', 'date_block_num']).\
                drop(['key'], axis=1)
df_zeroed.head()

shop_id,item_id,date_block_num
5,5037,0
5,5037,1
5,5037,2
5,5037,3
5,5037,4


In [65]:
df_zeroed.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Empty DataFrame

## Sales (training)

In [75]:
dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
df_train = pd.read_csv(\
    input_prefix + 'sales_train.csv' + bzfile,
    #nrows=300, # somente primeiras linhas!
    dtype={'date_block_num': np.int8, 'shop_id': np.int8, 'item_id': np.int16,
           'item_price': np.float32, 'item_cnt_day': np.float32},
    parse_dates=['date'], date_parser=dateparse)
print('shape:', df_train.shape)
df_train.head()

shape: (2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [76]:
#for col in ('shop_id', 'item_id'):
#    df_train[col] = df_train[col].astype('category')

In [77]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              datetime64[ns]
date_block_num    int8
shop_id           int8
item_id           int16
item_price        float32
item_cnt_day      float32
dtypes: datetime64[ns](1), float32(2), int16(1), int8(2)
memory usage: 56.0 MB


In [78]:
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935800.0,2935800.0,2935800.0,2935800.0,2935800.0
mean,14.57,33.002,10197.0,890.85,1.2426
std,9.423,16.227,6324.3,1729.8,2.6188
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [79]:
# calcular quantidade total de vendas para cada produto e loja por mês
df_sumitems = (df_train.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].sum()).to_frame()
df_sumitems.columns = ['item_cnt_month']
df_sumitems.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month
shop_id,item_id,date_block_num,Unnamed: 3_level_1
0,30,1,31.0
0,31,1,11.0
0,32,0,6.0
0,32,1,10.0
0,33,0,3.0


In [80]:
del(df_train)

In [81]:
df_zeroed.info()
df_zeroed.index.names

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Empty DataFrame

FrozenList(['shop_id', 'item_id', 'date_block_num'])

In [82]:
df_zeroed.head()

shop_id,item_id,date_block_num
5,5037,0
5,5037,1
5,5037,2
5,5037,3
5,5037,4


In [83]:
df_sumitems.info()
df_sumitems.index.names

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1609124 entries, (0, 30, 1) to (59, 22167, 17)
Data columns (total 1 columns):
item_cnt_month    1609124 non-null float32
dtypes: float32(1)
memory usage: 12.4 MB


FrozenList(['shop_id', 'item_id', 'date_block_num'])

In [84]:
# mesclar dataframes e zerar valores nulos
df_train2 = pd.merge(df_zeroed,
                     df_sumitems,
                     how='left', left_index=True, right_index=True).\
                fillna(0.0)
df_train2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month
shop_id,item_id,date_block_num,Unnamed: 3_level_1
5,5037,0,0.0
5,5037,1,0.0
5,5037,2,0.0
5,5037,3,0.0
5,5037,4,0.0


In [85]:
df_train2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Data columns (total 1 columns):
item_cnt_month    float32
dtypes: float32(1)
memory usage: 311.7 MB


In [86]:
del(df_sumitems)

In [87]:
#df_shops['city_code'] = df_shops['city_code'].astype('category')
df_shops['shop_id'] = df_shops['shop_id'].astype('category')

KeyError: 'shop_id'

In [88]:
df_shops.reset_index(inplace=True)

In [89]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 4 columns):
shop_id      60 non-null int64
shop_name    60 non-null object
city_name    60 non-null object
city_code    60 non-null category
dtypes: category(1), int64(1), object(2)
memory usage: 2.3+ KB


In [90]:
df_train2.reset_index().head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
shop_id           5 non-null category
item_id           5 non-null category
date_block_num    5 non-null int64
item_cnt_month    5 non-null float32
dtypes: category(2), float32(1), int64(1)
memory usage: 169.3 KB


In [91]:
# join items + categories:item_category_id => group_code, subgroup_code
df_train3 = pd.merge(df_train2.reset_index(),
                     df_shops[['shop_id', 'city_code']],
                     how='left', on='shop_id')
df_train3.head()

ValueError: You are trying to merge on category and int64 columns. If you wish to proceed you should use pd.concat

In [75]:
del(df_train2)
#del(df_shops)

In [90]:
df_train3['item_id'] = df_train3['item_id'].astype('int64')

In [91]:
df_train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7282800 entries, 0 to 7282799
Data columns (total 5 columns):
shop_id           int64
item_id           int64
date_block_num    int64
item_cnt_month    float64
city_code         category
dtypes: category(1), float64(1), int64(3)
memory usage: 284.8 MB


In [92]:
df_train3.describe(include='category')

Unnamed: 0,city_code
count,7282800
unique,28
top,15
freq,1213800


In [93]:
df_items2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 9 columns):
item_name             22170 non-null object
item_category_id      22170 non-null category
subject_name          22170 non-null object
subject_code          22170 non-null category
item_category_name    22170 non-null object
group_name            22170 non-null object
subgroup_name         22170 non-null object
group_code            22170 non-null category
subgroup_code         22170 non-null category
dtypes: category(4), object(5)
memory usage: 884.6+ KB


In [94]:
df_items2.reset_index().head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 10 columns):
item_id               5 non-null int64
item_name             5 non-null object
item_category_id      5 non-null category
subject_name          5 non-null object
subject_code          5 non-null category
item_category_name    5 non-null object
group_name            5 non-null object
subgroup_name         5 non-null object
group_code            5 non-null category
subgroup_code         5 non-null category
dtypes: category(4), int64(1), object(5)
memory usage: 170.3+ KB


In [95]:
# join sales + items:item_id => subject_code, (group_code, subgroup_code)
df_train4 = pd.merge(df_train3,
                     df_items2.drop(['item_name', 'subject_name', 'item_category_name',
                                     'group_name', 'subgroup_name'], axis=1),
                     how='left', on='item_id')
df_train4.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_code,item_category_id,subject_code,group_code,subgroup_code
0,5,5037,0,0.0,5,19,793,6,13
1,5,5037,1,0.0,5,19,793,6,13
2,5,5037,2,0.0,5,19,793,6,13
3,5,5037,3,0.0,5,19,793,6,13
4,5,5037,4,0.0,5,19,793,6,13


In [96]:
del(df_train3)
#del(df_items2)

In [97]:
for col in ['shop_id', 'item_id']:
    df_train4[col] = df_train4[col].astype('category')

In [98]:
# criar dataframe com possíveis meses
df_dates = pd.DataFrame({'date': pd.date_range(start='2013-01-01', periods=35, freq='M')})
df_dates.index.names = ['date_block_num']
df_dates.head()

Unnamed: 0_level_0,date
date_block_num,Unnamed: 1_level_1
0,2013-01-31
1,2013-02-28
2,2013-03-31
3,2013-04-30
4,2013-05-31


In [99]:
df_dates.tail()

Unnamed: 0_level_0,date
date_block_num,Unnamed: 1_level_1
30,2015-07-31
31,2015-08-31
32,2015-09-30
33,2015-10-31
34,2015-11-30


In [100]:
# criar colunas ano e mês
df_dates['year'] = df_dates['date'].dt.year
df_dates['month'] = df_dates['date'].dt.month
df_dates.head()

Unnamed: 0_level_0,date,year,month
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2013-01-31,2013,1
1,2013-02-28,2013,2
2,2013-03-31,2013,3
3,2013-04-30,2013,4
4,2013-05-31,2013,5


In [None]:
#TODO: baixar calendário de feriados na Rússia:
#      https://www.google.com/search?q=holidays+calendar+in+russia+format%3Acsv
#TODO: criar colunas indicando feriado
#TODO: criar colunas indicando véspera ou pós-feriado

In [101]:
df_train5 = pd.merge(df_train4,
                     df_dates.drop(['date'], axis=1),
                     how='left', on='date_block_num')
df_train5.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_code,item_category_id,subject_code,group_code,subgroup_code,year,month
0,5,5037,0,0.0,5,19,793,6,13,2013,1
1,5,5037,1,0.0,5,19,793,6,13,2013,2
2,5,5037,2,0.0,5,19,793,6,13,2013,3
3,5,5037,3,0.0,5,19,793,6,13,2013,4
4,5,5037,4,0.0,5,19,793,6,13,2013,5


In [102]:
#del(df_dates)

In [104]:
df_train5.info()

MemoryError: 

In [None]:
#TODO: treino: todos os meses, exceto o último / teste: último mês
#TODO: avaliar diversos algoritmos de regressão e escolher o de melhor escore (métrica: MSE)

In [None]:
#TODO: usando o melhor algoritmo avaliado, treinar novamente usando todos os meses disponíveis
#TODO: montar dataframe contendo lojas e itens presentes na base de testes (test.csv)
#TODO: usar todas as datas do mês seguinte (01/11/2015 a 30/11/2015)
#TODO: incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
#TODO: incrementar dataframe produzindo campos extras de datas (year, month, day, dow, woy)

In [None]:
#TODO: submeter o dataframe de entrada ao modelo treinado a fim de obter previsões
#TODO: calcular somas das quantidades de produtos agrupadas por loja e produto
#TODO: fazer JOIN da base de testes com esse dataframe final, produzindo o arquivo de submissão final

In [105]:
df_train_final = df_train5
df_train_final.columns

Index(['shop_id', 'item_id', 'date_block_num', 'item_cnt_month', 'city_code',
       'item_category_id', 'subject_code', 'group_code', 'subgroup_code',
       'year', 'month'],
      dtype='object')

In [106]:
# gravar dados de treino em arquivo
%time df_train_final.to_csv('train-data.csv', index_label='id')

In [107]:
data = df_train_final
#data = df_train_final.sample(frac=1.0).head(20000)

In [108]:
# definir dados de entrada
X = data.drop(['item_cnt_month'], axis=1) # tudo, exceto a coluna alvo
y = data['item_cnt_month'] # apenas a coluna alvo

print('Forma dos dados originais:', X.shape, y.shape)

MemoryError: 

## Treinamento dos modelos preditivos

In [None]:
MSE = 'neg_mean_squared_error'

models = {}

parallel_jobs = 2#8 # paralelização

In [None]:
from datetime import datetime

# avalia o desempenho do modelo, retornando o valor do MSE
def evaluate_model_cv(name, model, X=X, y=y):
    start = datetime.now()
    kfold = KFold(n_splits=10, random_state=42)
    results = cross_val_score(model, X, y, cv=kfold, scoring=MSE, verbose=1, n_jobs=parallel_jobs)
    end = datetime.now()
    elapsed = int((end - start).total_seconds() * 1000)
    score = (-1) * results.mean()
    stddev = results.std()
    models[name] = {'model': model, 'score': score, 'stddev': stddev, 'elapsed': elapsed}
    print(model, '\nScore: %.2f (+/- %.2f) [%5s ms]' % (score, stddev, elapsed))
    return score, stddev, elapsed

In [None]:
# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X=X, y=y):
  print('\nFine Tuning Model:')
  print(model, "\nparams:", params)
  kfold = KFold(n_splits=10, random_state=42)
  grid = GridSearchCV(estimator=model, param_grid=params, scoring=MSE, cv=kfold, verbose=1, n_jobs=parallel_jobs)
  grid.fit(X, y)
  print('\nGrid Best Score: %.2f' % (grid.best_score_ * (-1)))
  print('Best Params:', grid.best_params_)
  return grid

### Avaliação e ajuste fino de cada modelo preditivo

- https://scikit-learn.org/stable/modules/classes.html

In [None]:
model = KNeighborsRegressor(n_jobs=-1, n_neighbors=11, weights='distance')
%time evaluate_model_cv('KNN', model)

#n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’,
#metric_params=None, n_jobs=None

params = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance']
}
#fine_tune_model(model, params)

In [None]:
model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=100, max_depth=3)
%time evaluate_model_cv('RF', model)

#n_estimators=’warn’, criterion=’mse’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
#verbose=0, warm_start=False

params = {
    'n_estimators': [5, 10, 25, 50, 75, 100],
    'max_depth': [None, 3, 5, 7, 9, 11, 13]
}
#fine_tune_model(model, params)

In [None]:
model = GradientBoostingRegressor(random_state=42,
    learning_rate=0.05, n_estimators=100, max_depth=4, max_features=0.85)
%time evaluate_model_cv('GB', model)

#loss=’ls’, learning_rate=0.1, n_estimators=100, subsample=1.0, criterion=’friedman_mse’, min_samples_split=2,
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, 
#min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
#max_leaf_nodes=None, warm_start=False, presort=’auto’, validation_fraction=0.1, n_iter_no_change=None, 
#tol=0.0001

params = dict(
    n_estimators=[100, 250, 500],
    max_features=[0.75, 0.85, 1.0],
    max_depth=[4, 8, 12],
    learning_rate=[0.05, 0.1, 0.15],
    #subsample=[0.4, 0.6, 0.8]
)
#fine_tune_model(model, params)

In [None]:
model = ExtraTreesRegressor(random_state=42, n_jobs=-1, n_estimators=75, max_features=0.85)
%time evaluate_model_cv('ET', model)

#n_estimators=’warn’, criterion=’mse’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0,
#warm_start=False

params = dict(
    n_estimators=[50, 75, 100, 200],
    max_features=['auto', 0.75, 0.85, 1.0]
)
#fine_tune_model(model, params)

In [None]:
model = BaggingRegressor(random_state=42, n_jobs=-1, base_estimator=DecisionTreeRegressor(), max_features=0.5, n_estimators=200)
%time evaluate_model_cv('BG', model)

#base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, 
#bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0

params = dict(
    n_estimators=[50, 75, 100, 200],
    max_features=[0.5, 0.75, 1.0]
)
#fine_tune_model(model, params)

In [None]:
model = AdaBoostRegressor(random_state=42, n_estimators=100, base_estimator=DecisionTreeRegressor())
%time evaluate_model_cv('ABDT', model)

# base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm=’SAMME.R’, random_state=None

params = dict(
    n_estimators=[50, 75, 100, 200]
)
#fine_tune_model(model, params)

## Comparação final entre os algoritmos

In [None]:
results = []
names = []
scores = []
stddevs = []
times = []

best_model = None
highest_score = None

for name, model in models.items():
    score, stddev, elapsed = model['score'], model['stddev'], model['elapsed']
    results.append((score, stddev))
    names.append(name)
    scores.append(score)
    stddevs.append(stddev)
    times.append(elapsed)
    
    if not highest_score or score < highest_score:
        best_model = model['model']
        highest_score = score

In [None]:
results_df = pd.DataFrame({'Model Name': names, 'Score': scores, 'Std Dev': stddevs, 'Time (ms)': times})
results_df.sort_values(by=['Score', 'Time (ms)'], ascending=[True, False])

In [None]:
print('Best Model:\n\n%s' % (best_model))

In [None]:
model = best_model
model.fit(X, y)

In [None]:
X.head()

In [None]:
del(X)
del(y)

## ....

In [109]:
df_test.head()

Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [110]:
df_test2 = df_test.copy()

In [111]:
# definir campo fixo
df_test2['date_block_num'] = 34 # Novembro/2015
df_test2.head()

Unnamed: 0_level_0,shop_id,item_id,date_block_num
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,5037,34
1,5,5320,34
2,5,5233,34
3,5,5232,34
4,5,5268,34


In [112]:
# incrementar colunas de data
df_test3 = pd.merge(df_test2,
                    df_dates.drop(['date'], axis=1),
                    how='left', on='date_block_num')
df_test3.head()

Unnamed: 0,shop_id,item_id,date_block_num,year,month
0,5,5037,34,2015,11
1,5,5320,34,2015,11
2,5,5233,34,2015,11
3,5,5232,34,2015,11
4,5,5268,34,2015,11


In [115]:
del(df_test2)

In [114]:
# incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
# incluir coluna com código da cidade
df_test4 = pd.merge(df_test3,
                    df_shops[['shop_id', 'city_code']],
                    how='left', on='shop_id')
df_test4.head()
#del(df_test2)

Unnamed: 0,shop_id,item_id,date_block_num,year,month,city_code
0,5,5037,34,2015,11,5
1,5,5320,34,2015,11,5
2,5,5233,34,2015,11,5
3,5,5232,34,2015,11,5
4,5,5268,34,2015,11,5


In [116]:
del(df_test3)

In [120]:
df_test4['item_id'] = df_test4['item_id'].astype('int64')

In [121]:
df_test5 = pd.merge(df_test4,
                     df_items2.drop(['item_name', 'subject_name', 'item_category_name',
                                     'group_name', 'subgroup_name'], axis=1),
                     how='left', on='item_id')
df_test5.head()

Unnamed: 0,shop_id,item_id,date_block_num,year,month,city_code,item_category_id,subject_code,group_code,subgroup_code
0,5,5037,34,2015,11,5,19,793,6,13
1,5,5320,34,2015,11,5,55,818,13,6
2,5,5233,34,2015,11,5,19,786,6,13
3,5,5232,34,2015,11,5,23,786,6,19
4,5,5268,34,2015,11,5,20,786,6,14


In [122]:
del(df_test4)

In [123]:
for col in ['shop_id', 'item_id']:
    df_test5[col] = df_test5[col].astype('category')

In [124]:
df_test_final = df_test5
df_test_final.columns

Index(['shop_id', 'item_id', 'date_block_num', 'year', 'month', 'city_code',
       'item_category_id', 'subject_code', 'group_code', 'subgroup_code'],
      dtype='object')

In [125]:
df_test_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 10 columns):
shop_id             214200 non-null category
item_id             214200 non-null category
date_block_num      214200 non-null int64
year                214200 non-null int64
month               214200 non-null int64
city_code           214200 non-null category
item_category_id    214200 non-null category
subject_code        214200 non-null category
group_code          214200 non-null category
subgroup_code       214200 non-null category
dtypes: category(7), int64(3)
memory usage: 8.7 MB


In [126]:
# colocar as colunas na mesma ordem dos dados de treino
cols = df_train_final.columns.drop(['item_cnt_month'])
cols

Index(['shop_id', 'item_id', 'date_block_num', 'city_code', 'item_category_id',
       'subject_code', 'group_code', 'subgroup_code', 'year', 'month'],
      dtype='object')

In [127]:
# selecionar atributos para o modelo
X_pred = df_test_final[cols]
#X_pred = df_test4.drop(['ID'], axis=1)

In [128]:
# gravar dados de teste em arquivo
X_pred.to_csv('test-data.csv', index_label='id')

In [None]:
# realizar a previsão
y_pred = best_model.predict(X_pred)

In [None]:
X_pred.info()
X_pred.head()

In [None]:
y_pred[:20]

In [None]:
df_final = pd.DataFrame(data=y_pred, columns=['item_cnt_month'], index=df_test4.index)
df_final.index.names = ['ID']
df_final.head()

In [None]:
df_final.info()

In [None]:
!test -d submissions/ || mkdir submissions/

In [None]:
df_final.to_csv('submissions/submission.csv')

## Sample submission

In [None]:
df_sample = pd.read_csv(input_prefix + 'sample_submission.csv', index_col='ID')
print('shape:', df_sample.shape)
df_sample.head()

In [None]:
df_sample.info()

In [None]:
del(df_sample)