In [55]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [56]:
# Read the prepared dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved — confirm).
data = pd.read_csv('DATA/c13_9_prep.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,LEVEL,AWARD,LOA,STAFF,PREVSTAFF,RETURNS,ID,CAMP
0,3,694.86,104,20,23,0.0,392,0
1,3,655.97,104,11,12,0.0,393,0
2,5,1197.7,104,27,24,-129.0,394,0
3,0,28.45,104,2,1,0.0,395,0
4,5,1461.05,104,37,34,0.0,396,0


In [57]:
# One-step lag features of LEVEL, computed separately for every ID:
#   prev_level      - LEVEL of the preceding row belonging to the same ID
#   prev_level_diff - change in prev_level between consecutive rows of that ID
# dropna() then removes every row with a missing value, which includes the
# leading rows of each ID for which these features cannot be computed.
# NOTE(review): shift()/diff() follow the frame's existing row order, and the
# displayed head shows one ID with several rows in the same CAMP, so
# 'prev_level' may come from the same campaign as the target — confirm that
# this is intended before treating it as a strictly historical feature.
data2 = (
    data
    .assign(prev_level=lambda df: df.groupby(['ID'])['LEVEL'].shift())
    .assign(prev_level_diff=lambda df: df.groupby(['ID'])['prev_level'].diff())
    .dropna()
)
data2.head()

Unnamed: 0,LEVEL,AWARD,LOA,STAFF,PREVSTAFF,RETURNS,ID,CAMP,prev_level,prev_level_diff
90125,0,75761.46,12,1236,1200,-22999.26,0,1,0.0,0.0
90126,0,75761.46,12,1236,1200,-22999.26,0,1,0.0,0.0
90127,0,137812.11,254,2487,2453,-385459.12,1,1,0.0,0.0
90128,0,137812.11,254,2487,2453,-385459.12,1,1,0.0,0.0
90129,0,93934.84,244,1985,1803,-78393.03,2,1,0.0,0.0


In [58]:
def rmsle(ytrue, ypred):
    """Return the root mean squared logarithmic error of ``ypred`` vs ``ytrue``.

    The value is the square root of scikit-learn's
    ``mean_squared_log_error(ytrue, ypred)``.
    """
    msle = mean_squared_log_error(ytrue, ypred)
    return np.sqrt(msle)

In [64]:
# Naive baseline: use the lagged target ('prev_level') as the prediction for
# LEVEL and score it per campaign with RMSLE. Nothing is fitted, so no
# training split is built for these folds.
# NOTE(review): this loop scores CAMP 3-7 while the random-forest cells score
# up to CAMP 12, so the mean errors cover different campaigns — confirm intent.
mean_error = []
for camp in range(3,8):
    val = data2[data2['CAMP'] == camp]

    # The baseline "prediction" is simply the previous LEVEL of the same ID.
    p = val['prev_level'].values

    error = rmsle(val['LEVEL'].values, p)
    print('CAMP %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

CAMP 3 - Error 0.36981
CAMP 4 - Error 0.34832
CAMP 5 - Error 0.40694
CAMP 6 - Error 0.33812
CAMP 7 - Error 0.37527
Mean Error = 0.36769


In [61]:
%%time
# Walk-forward validation of a random forest on the one-lag feature set:
# for every campaign, fit on all rows from earlier campaigns and score the
# predictions for that campaign with RMSLE.
mean_error = []
for camp in range(3,13):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]

    # Every column except the target LEVEL is used as a feature.
    xtr = train.drop(columns=['LEVEL'])
    xts = val.drop(columns=['LEVEL'])
    ytr = train['LEVEL'].values
    yts = val['LEVEL'].values

    # random_state is fixed so repeated runs build the same forest.
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)

    p = mdl.predict(xts)

    error = rmsle(yts, p)
    # Folds are campaigns (CAMP values), so they are labelled 'CAMP' like the
    # other validation loops in this notebook rather than 'Week'.
    print('CAMP %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 3 - Error 0.26457
Week 4 - Error 0.27246
Week 5 - Error 0.31232
Week 6 - Error 0.26438
Week 7 - Error 0.25800
Week 8 - Error 0.31326
Week 9 - Error 0.27062
Week 10 - Error 0.26360
Week 11 - Error 0.27015
Week 12 - Error 0.27397
Mean Error = 0.27633
Wall time: 46min 22s


In [62]:
%%time
# Lag features of LEVEL up to six rows back, computed separately for every ID.
# For each lag two columns are appended, in this order:
#   <name>       - LEVEL shifted by that many rows within the ID
#   <name>_diff  - row-to-row change of that shifted column within the ID
# Lag 1 is named 'prev_level'; lag k > 1 is named 'prev-<k-1>_level'.
data4 = data.copy()
for lag in range(1, 7):
    lag_col = 'prev_level' if lag == 1 else 'prev-%d_level' % (lag - 1)
    data4[lag_col] = data4.groupby(['ID'])['LEVEL'].shift(lag)
    data4[lag_col + '_diff'] = data4.groupby(['ID'])[lag_col].diff()
# Keep only rows for which every column, including all lag features, is present.
data4 = data4.dropna()
data4.head()

Wall time: 1min 16s


In [63]:
%%time
# Walk-forward validation of a random forest on the six-lag feature set:
# for every campaign, fit on all rows from earlier campaigns and score the
# predictions for that campaign with RMSLE.
# NOTE(review): the recorded run printed 0.00000 for CAMP 4-6 and 1.60694 for
# CAMP 7; presumably the lag/dropna step leaves very little usable history in
# the early campaigns, so those folds may distort the mean — confirm.
mean_error = []
for camp in range(4,13):
    train = data4[data4['CAMP'] < camp]
    val = data4[data4['CAMP'] == camp]

    # Every column except the target LEVEL is used as a feature.
    xtr = train.drop(columns=['LEVEL'])
    xts = val.drop(columns=['LEVEL'])
    ytr = train['LEVEL'].values
    yts = val['LEVEL'].values

    # random_state is fixed so repeated runs build the same forest.
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    p = mdl.predict(xts)

    error = rmsle(yts, p)
    print('CAMP %d - Error %.5f' % (camp, error))
    mean_error.append(error)

print('Mean Error = %.5f' % np.mean(mean_error))

CAMP 4 - Error 0.00000
CAMP 5 - Error 0.00000
CAMP 6 - Error 0.00000
CAMP 7 - Error 1.60694
CAMP 8 - Error 0.35833
CAMP 9 - Error 0.31676
CAMP 10 - Error 0.31521
CAMP 11 - Error 0.31769
CAMP 12 - Error 0.32197
Mean Error = 0.35966
Wall time: 12min 37s
