# Sample solutions

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

try:
    # scikit-learn >= 0.18 ships train_test_split in model_selection, and the
    # cross_validation module was removed in 0.20. Binding it under the legacy
    # name keeps every `cross_validation.*` call site in the notebook working.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

## Load datasets

In [2]:
# Read the raw training and test CSVs into DataFrames (train first, then test).
TRAIN_CSV = '../data/train.csv'
TEST_CSV = '../data/test.csv'
df_train = pd.read_csv(filepath_or_buffer=TRAIN_CSV)
df_test = pd.read_csv(filepath_or_buffer=TEST_CSV)

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line
# after each report.
print('train dataset')
df_train.info()
print('test dataset')
df_test.info()

train dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74180464 entries, 0 to 74180463
Data columns (total 11 columns):
Semana               int64
Agencia_ID           int64
Canal_ID             int64
Ruta_SAK             int64
Cliente_ID           int64
Producto_ID          int64
Venta_uni_hoy        int64
Venta_hoy            float64
Dev_uni_proxima      int64
Dev_proxima          float64
Demanda_uni_equil    int64
dtypes: float64(2), int64(9)
memory usage: 6.1 GB
None
test dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6999251 entries, 0 to 6999250
Data columns (total 7 columns):
id             int64
Semana         int64
Agencia_ID     int64
Canal_ID       int64
Ruta_SAK       int64
Cliente_ID     int64
Producto_ID    int64
dtypes: int64(7)
memory usage: 373.8 MB
None


## Loss function (RMSLE)

In [None]:
def rmsle(predicted, expected):
    """Return the Root Mean Squared Logarithmic Error between two value sets.

    Computes sqrt(mean((log(1 + predicted) - log(1 + expected)) ** 2)).

    Parameters
    ----------
    predicted : array-like
        Predicted values. Lists, tuples, NumPy arrays and pandas Series are
        accepted; values must be greater than -1.
    expected : array-like
        Ground-truth values, same length as ``predicted``; values must be
        greater than -1.

    Returns
    -------
    float
        The RMSLE. ``nan`` for empty input.

    Notes
    -----
    Inputs are converted to NumPy arrays, so the two sequences are compared
    element by element in positional order (pandas index labels are ignored).
    """
    predicted = np.asarray(predicted, dtype=float)
    expected = np.asarray(expected, dtype=float)
    # log1p(x) is log(1 + x) without the precision loss of forming 1 + x
    # explicitly when x is close to zero.
    log_diff = np.log1p(predicted) - np.log1p(expected)
    # Averaging over every compared element (rather than dividing a sum by
    # len(expected)) stays correct if the inputs broadcast against each other.
    return np.sqrt(np.mean(np.square(log_diff)))

## Select only the columns present in both the training and test sets

In [11]:
# Feature columns: the ones that appear in both train.csv and test.csv.
COLUMNS = ['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID']
X = df_train[COLUMNS]
Y = df_train['Demanda_uni_equil']
# Hold out 20% of the training rows for local validation. The fixed
# random_state makes the shuffle (and therefore the split) repeatable across
# kernel restarts.
(X_train, X_test, y_train, y_test) = cross_validation.train_test_split(
    X, Y, test_size=0.2, random_state=42)
# Same feature columns taken from the test set, used for the final predictions.
X2 = df_test[COLUMNS]

## Median solution

In [28]:
# Constant baseline: every test row is assigned the median demand observed in
# the training set.
demand = df_train['Demanda_uni_equil'].median()
solution = df_test[['id']].assign(Demanda_uni_equil=demand)
# Cast both columns to int and write the gzip-compressed submission file.
solution.astype(int).to_csv(
    path_or_buf='../data/solution.csv.gz',
    compression='gzip',
    index=False,
)

In [30]:
# Sanity check: reload the submission file from disk and preview its first five rows.
pd.read_csv('../data/solution.csv.gz').head(n=5)

Unnamed: 0,id,Demanda_uni_equil
0,0,3
1,1,3
2,2,3
3,3,3
4,4,3


## Decision tree regressor

In [8]:
# NOTE: this cell fits a single DecisionTreeRegressor on the full training set
# (X, Y), not the train/validation split.
# random_state pins the estimator's internal randomness so that reruns build
# the same tree and write the same submission.
model = DecisionTreeRegressor(random_state=42)
model.fit(X, Y)
Y2 = model.predict(X2)
solution = df_test[['id']].copy()
solution['Demanda_uni_equil'] = Y2
# Round to the nearest unit before the integer cast: a bare astype(int)
# truncates toward zero and would turn a prediction such as 3.9 into 3.
# This writes to the same path as the median baseline and overwrites that file.
solution.round().astype(int).to_csv('../data/solution.csv.gz', index=False, compression='gzip')
solution.head()

Unnamed: 0,id,Demanda_uni_equil
0,0,4.0
1,1,1.0
2,2,2.0
3,3,1.0
4,4,1.0
