In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
data = pd.read_csv('DATA/c13_9_2_prep.csv')
data.drop(['Unnamed: 0'], axis=1,inplace = True)
data.head()

Unnamed: 0,ACC,CAMP,LVL,AWARD,LOA,STAFF,PREVSTAFF,BALS,GAS,PAS,BLCKDSLS,RETURNS
0,392,0,3,751.14,101,21,22,46982,46982,2246,44736.0,0.0
1,393,0,2,416.42,101,18,19,26603,26603,1712,24891.0,-2435.0
2,394,0,3,618.44,101,17,16,38605,38605,1765,36840.0,0.0
3,395,0,5,1221.7,101,25,27,122660,122660,1823,72924.0,-155.0
4,396,0,3,662.57,101,11,11,41806,41806,2422,39384.0,-953.0


In [4]:
data2 = data.copy()
data2['prev_level'] = data2.groupby(['ACC'])['LVL'].shift()
data2['prev_level_diff'] = data2.groupby(['ACC'])['prev_level'].diff()
data2 = data2.dropna()
data2.head()

Unnamed: 0,ACC,CAMP,LVL,AWARD,LOA,STAFF,PREVSTAFF,BALS,GAS,PAS,BLCKDSLS,RETURNS,prev_level,prev_level_diff
97340,0,1,0,57892.12,9,1079,1099,0,0,0,3476609.02,-42319.11,0.0,0.0
97341,0,1,0,57892.12,9,1079,1099,0,0,0,3476609.02,-42319.11,0.0,0.0
97342,1,1,0,104330.99,251,2287,2378,0,0,0,6263169.83,-281276.24,0.0,0.0
97343,1,1,0,104330.99,251,2287,2378,0,0,0,6263169.83,-281276.24,0.0,0.0
97344,2,1,0,60358.35,241,1626,1719,0,0,0,3622668.15,-68372.87,0.0,0.0


In [5]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [6]:
%%time
mean_error = []
for camp in range(3,13):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]
    
    p = val['prev_level'].values
    
    error = rmsle(val['LVL'].values, p)
    print('CAMP %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

CAMP 3 - Error 0.37869
CAMP 4 - Error 0.35622
CAMP 5 - Error 0.34900
CAMP 6 - Error 0.36981
CAMP 7 - Error 0.34832
CAMP 8 - Error 0.40694
CAMP 9 - Error 0.33812
CAMP 10 - Error 0.37532
CAMP 11 - Error 0.38535
CAMP 12 - Error 0.35772
Mean Error = 0.36655
Wall time: 261 ms


In [9]:
%%time
mean_error = []
for camp in range(3,16):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]
    
    xtr, xts = train.drop(['LVL'], axis=1), val.drop(['LVL'], axis=1)
    ytr, yts = train['LVL'].values, val['LVL'].values
    
    mdl = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
    mdl.fit(xtr, np.log1p(ytr))
    
    p = np.expm1(mdl.predict(xts))
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 3 - Error 0.04406
Week 4 - Error 0.04821
Week 5 - Error 0.04546
Week 6 - Error 0.04516
Week 7 - Error 0.03456
Week 8 - Error 0.03620
Week 9 - Error 0.03425
Week 10 - Error 0.04191
Week 11 - Error 0.04251
Week 12 - Error 0.03294
Week 13 - Error 0.02501
Week 14 - Error 0.03316
Week 15 - Error 0.03368
Mean Error = 0.03824
Wall time: 1min 52s


In [10]:
%%time
mean_error = []
for camp in range(3,16):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]
    
    xtr, xts = train.drop(['LVL'], axis=1), val.drop(['LVL'], axis=1)
    ytr, yts = train['LVL'].values, val['LVL'].values
    
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    
    p = mdl.predict(xts)
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 3 - Error 0.03820
Week 4 - Error 0.02839
Week 5 - Error 0.02969
Week 6 - Error 0.03794
Week 7 - Error 0.02899
Week 8 - Error 0.02208
Week 9 - Error 0.01882
Week 10 - Error 0.03198
Week 11 - Error 0.02326
Week 12 - Error 0.01815
Week 13 - Error 0.01684
Week 14 - Error 0.02065
Week 15 - Error 0.02343
Mean Error = 0.02603
Wall time: 59min 57s
