# Train test split

In [1]:
import numpy as np
import pandas as pd
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; drop this import once nothing in the notebook still uses it.
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

def rmsle(predicted, expected):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(1 + predicted) - log(1 + expected)) ** 2))``.
    The metric is symmetric in its two arguments.

    Parameters
    ----------
    predicted : array-like of numbers
        Predicted values. Accepts lists, NumPy arrays, and pandas Series.
    expected : array-like of numbers
        Ground-truth values, broadcast-compatible with ``predicted``.

    Returns
    -------
    numpy.float64
        The RMSLE. ``nan`` when the inputs are empty or when any value is
        below -1 (the logarithm is undefined there).

    Raises
    ------
    ValueError
        If the two inputs cannot be broadcast against each other.

    Notes
    -----
    Inputs are converted to plain arrays, so values are compared by position
    and any pandas index is ignored.
    """
    predicted = np.asarray(predicted, dtype=float)
    expected = np.asarray(expected, dtype=float)
    # log1p(x) equals log(1 + x) but stays accurate when x is close to zero.
    log_diff = np.log1p(predicted) - np.log1p(expected)
    # Average over every element rather than dividing by len(expected), which
    # is only the element count for equal-length 1-D inputs.
    return np.sqrt(np.mean(np.square(log_diff)))

# rmsle(np.full(6, 3.0), np.full(6, 2.0))

## Load datasets

In [2]:
%time df_train = pd.read_csv('../data/train.csv')
df_train.info()

Wall time: 59.9 s
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74180464 entries, 0 to 74180463
Data columns (total 11 columns):
Semana               int64
Agencia_ID           int64
Canal_ID             int64
Ruta_SAK             int64
Cliente_ID           int64
Producto_ID          int64
Venta_uni_hoy        int64
Venta_hoy            float64
Dev_uni_proxima      int64
Dev_proxima          float64
Demanda_uni_equil    int64
dtypes: float64(2), int64(9)
memory usage: 6.1 GB


## Split into train and test sets

In [3]:
# Feature columns fed to the models; the target is the 'Demanda_uni_equil' column.
COLUMNS = ['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID']
X = df_train[COLUMNS]
Y = df_train['Demanda_uni_equil']
# Hold out 10% of the rows for evaluation. train_test_split comes from
# sklearn.model_selection (the sklearn.cross_validation module was removed in
# scikit-learn 0.20). The fixed random_state makes the split, and therefore
# every score computed from it, reproducible on Restart & Run All.
(X_train, X_test, y_train, y_test) = train_test_split(X, Y, test_size=0.1, random_state=42)
# Every row must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X)

## Median solution

In [5]:
# Baseline: predict the training-set median target for every held-out row.
predictions = np.full(len(y_test), y_train.median())
# rmsle is declared as rmsle(predicted, expected); pass the arguments in that
# order. The score is unchanged because the metric is symmetric.
print('RMSLE: %f' % rmsle(predictions, y_test))

RMSLE: 0.861096


## Decision tree

In [None]:
model = DecisionTreeRegressor()
%time model.fit(X_train, y_train)
predictions = model.predict(X_test)
print('RMSLE: %f' % rmsle(y_test, predictions))