# Train test split

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

def rmsle(predicted, expected):
    """Root mean squared logarithmic error between two sets of values.

    Computes ``sqrt(mean((log(1 + predicted) - log(1 + expected)) ** 2))``.
    The metric is symmetric in its two arguments, so the order in which the
    predictions and the ground truth are passed does not affect the result.

    Both inputs are converted to plain NumPy arrays first, so they are compared
    element by element in positional order. Without this, two pandas Series
    with different indexes would be aligned by label, turning unmatched rows
    into NaN that ``Series.sum`` silently skips and yielding a misleadingly
    low score.

    Parameters
    ----------
    predicted, expected : array-like
        Values of matching (or broadcastable) shape; lists, NumPy arrays and
        pandas Series are accepted. Every value must be greater than -1 for
        the result to be finite.

    Returns
    -------
    float
        The RMSLE; ``nan`` for empty input.
    """
    # log1p(x) is log(1 + x), but stays accurate for values close to zero.
    log_predicted = np.log1p(np.asarray(predicted, dtype=float))
    log_expected = np.log1p(np.asarray(expected, dtype=float))
    squared_error = np.square(log_predicted - log_expected)
    return np.sqrt(np.mean(squared_error))

# rmsle(np.full(6, 3.0), np.full(6, 2.0))

## Load datasets

In [2]:
%time df = pd.read_csv('../data/train.csv')
df_train = df[df.Semana < 9]
df_test = df[df.Semana == 9]
del df
print('train size: %d, Test size: %d' % (len(df_train), len(df_test)))

Wall time: 55 s
train size: 63771751, Test size: 10408713


In [3]:
# Identifier columns used as model inputs.
FEATURE_COLUMNS = [
    'Agencia_ID',
    'Canal_ID',
    'Ruta_SAK',
    'Cliente_ID',
    'Producto_ID',
]
# Column holding the value to predict.
OUTPUT_COLUMN = 'Demanda_uni_equil'

# Training inputs and target taken from the training split.
X_train = df_train.loc[:, FEATURE_COLUMNS]
y_train = df_train.loc[:, OUTPUT_COLUMN]

## Mean / median baseline solutions

In [10]:
# Constant baseline: every held-out row is predicted as the training-set median.
predictions = np.full(shape=len(df_test), fill_value=y_train.median())
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.864640


In [11]:
# Constant baseline: every held-out row is predicted as the training-set mean.
predictions = np.full(shape=len(df_test), fill_value=y_train.mean())
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.983407


### Median by Agencia_ID

In [6]:
# Group baseline: predict each held-out row with the training median of its Agencia_ID.
# Rows whose Agencia_ID has no training median fall back to the overall training median.
overall_median = y_train.median()
median_by_agency = df_train.groupby('Agencia_ID')[OUTPUT_COLUMN].median()
predictions = df_test['Agencia_ID'].map(median_by_agency).fillna(overall_median)
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.796566


### Median by Canal_ID

In [7]:
# Group baseline: predict each held-out row with the training median of its Canal_ID.
# Rows whose Canal_ID has no training median fall back to the overall training median.
overall_median = y_train.median()
median_by_channel = df_train.groupby('Canal_ID')[OUTPUT_COLUMN].median()
predictions = df_test['Canal_ID'].map(median_by_channel).fillna(overall_median)
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.798390


### Median by Ruta_SAK

In [8]:
# Group baseline: predict each held-out row with the training median of its Ruta_SAK.
# Rows whose Ruta_SAK has no training median fall back to the overall training median.
overall_median = y_train.median()
median_by_route = df_train.groupby('Ruta_SAK')[OUTPUT_COLUMN].median()
predictions = df_test['Ruta_SAK'].map(median_by_route).fillna(overall_median)
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.797138


### Median by Cliente_ID

In [9]:
# Group baseline: predict each held-out row with the training median of its Cliente_ID.
# Rows whose Cliente_ID has no training median fall back to the overall training median.
overall_median = y_train.median()
median_by_client = df_train.groupby('Cliente_ID')[OUTPUT_COLUMN].median()
predictions = df_test['Cliente_ID'].map(median_by_client).fillna(overall_median)
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.735369


### Median by Producto_ID

In [17]:
# Group baseline: predict each held-out row with the training median of its Producto_ID.
# Rows whose Producto_ID has no training median fall back to the overall training median.
overall_median = y_train.median()
median_by_product = df_train.groupby('Producto_ID')[OUTPUT_COLUMN].median()
predictions = df_test['Producto_ID'].map(median_by_product).fillna(overall_median)
print('RMSLE: %f' % rmsle(predictions, df_test[OUTPUT_COLUMN]))

RMSLE: 0.692983


### Median by product and client

In [13]:
# TODO: work in progress — baseline using the median per (Producto_ID, Cliente_ID) pair.
# Everything below is disabled scratch code; nothing in this cell executes.
# The last line is an unfinished filter (dangling `or`) and would not parse if uncommented.
# mean = y_train.median()
# df_train.groupby(['Producto_ID', 'Cliente_ID'])[OUTPUT_COLUMN].median()
# df_train[df_train['Cliente_ID'] == 681747 or][OUTPUT_COLUMN]

## Decision tree

In [5]:
FEATURE_COLUMNS = ['Cliente_ID', 'Producto_ID']
model = DecisionTreeRegressor()
%time model.fit(df_train[FEATURE_COLUMNS], y_train)
predictions = model.predict(df_test[FEATURE_COLUMNS])
print('RMSLE: %f' % rmsle(df_test[OUTPUT_COLUMN], predictions))

Wall time: 7min 27s
RMSLE: 0.541440


## SVM

In [None]:
svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
%time svr_model = svr.fit(df_train[FEATURE_COLUMNS], y_train)
predictions = svr_model.predict(X)
print('RMSLE: %f' % rmsle(df_test[OUTPUT_COLUMN], predictions))