In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [3]:
# Read the prepared dataset and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV -- confirm).
data = pd.read_csv('DATA/c13_12_2_prep.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,ACC,CAMP,LVL,AWARD,LOA,STAFF,PREVSTAFF,BALS,GAS,PAS,...,DL_T10,DL_T11,DL_T12,DL_T13,DL_T14,DL_T15,DL_T16,DL_T17,DL_T18,DL_T19
0,0,0,3,751.14,101,21,22,46982,46982,2246,...,0,0,0,0,0,0,0,0,0,0
1,1,0,2,416.42,101,18,19,26603,26603,1712,...,0,0,0,0,0,0,0,0,0,0
2,2,0,3,618.44,101,17,16,38605,38605,1765,...,0,0,0,0,0,0,0,0,0,0
3,3,0,5,1221.7,101,25,27,122660,122660,1823,...,0,0,0,0,0,0,0,0,0,0
4,4,0,3,662.57,101,11,11,41806,41806,2422,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Build lag features per ACC group on a fresh frame (``data`` is left untouched):
#   prev_level      -- LVL taken from the previous row of the same ACC
#   prev_level_diff -- change in prev_level between consecutive rows of the same ACC
# Rows are lagged in their existing order, so this relies on the frame already
# being ordered by campaign within each ACC -- TODO confirm.
# dropna() then removes every row that has a NaN in any column, which includes
# the rows of each ACC where the lag features cannot be computed yet.
data2 = (
    data
    .assign(
        prev_level=lambda df: df.groupby(['ACC'])['LVL'].shift(),
        prev_level_diff=lambda df: df.groupby(['ACC'])['prev_level'].diff(),
    )
    .dropna()
)
data2.head()

Unnamed: 0,ACC,CAMP,LVL,AWARD,LOA,STAFF,PREVSTAFF,BALS,GAS,PAS,...,DL_T12,DL_T13,DL_T14,DL_T15,DL_T16,DL_T17,DL_T18,DL_T19,prev_level,prev_level_diff
96595,0,2,3,764.03,103,23,22,49798,49798,4598,...,0,0,0,0,0,0,0,0,3.0,0.0
96596,2,2,2,470.91,103,12,13,30254,30254,2572,...,0,0,0,0,0,0,0,0,2.0,-1.0
96597,3,2,5,937.03,103,24,26,100180,100180,2394,...,0,0,0,0,0,0,0,0,5.0,0.0
96598,4,2,0,25.42,103,1,10,7006,7006,5518,...,0,0,0,0,0,0,0,0,3.0,0.0
96599,5,2,5,1295.65,103,34,35,150198,150198,4734,...,0,0,0,0,0,0,0,0,5.0,0.0


In [5]:
def rmsle(ytrue, ypred):
    """Return the root mean squared logarithmic error of ``ypred`` against ``ytrue``.

    The value is the square root of scikit-learn's ``mean_squared_log_error``
    for the same pair of inputs.
    """
    msle = mean_squared_log_error(ytrue, ypred)
    return np.sqrt(msle)

In [6]:
%%time
mean_error = []
for camp in range(3,16):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]
    
    p = val['prev_level'].values
    
    error = rmsle(val['LVL'].values, p)
    print('CAMP %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

CAMP 3 - Error 0.38241
CAMP 4 - Error 0.35978
CAMP 5 - Error 0.35246
CAMP 6 - Error 0.37341
CAMP 7 - Error 0.35148
CAMP 8 - Error 0.41029
CAMP 9 - Error 0.34077
CAMP 10 - Error 0.37830
CAMP 11 - Error 0.38836
CAMP 12 - Error 0.36049
Mean Error = 0.36978
Wall time: 494 ms


In [7]:
%%time
mean_error = []
for camp in range(3,16):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]
    
    xtr, xts = train.drop(['LVL'], axis=1), val.drop(['LVL'], axis=1)
    ytr, yts = train['LVL'].values, val['LVL'].values
    
    mdl = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
    mdl.fit(xtr, np.log1p(ytr))
    
    p = np.expm1(mdl.predict(xts))
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 3 - Error 0.04006
Week 4 - Error 0.04548
Week 5 - Error 0.04158
Week 6 - Error 0.03912
Week 7 - Error 0.02775
Week 8 - Error 0.02989
Week 9 - Error 0.03350
Week 10 - Error 0.03438
Week 11 - Error 0.03085
Week 12 - Error 0.02999
Week 13 - Error 0.01961
Week 14 - Error 0.02861
Week 15 - Error 0.02983
Mean Error = 0.03313
Wall time: 11min 24s


In [8]:
%%time
mean_error = []
for camp in range(3,16):
    train = data2[data2['CAMP'] < camp]
    val = data2[data2['CAMP'] == camp]
    
    xtr, xts = train.drop(['LVL'], axis=1), val.drop(['LVL'], axis=1)
    ytr, yts = train['LVL'].values, val['LVL'].values
    
    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    
    p = mdl.predict(xts)
    
    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (camp, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 3 - Error 0.03405
Week 4 - Error 0.02320
Week 5 - Error 0.02344
Week 6 - Error 0.02940
Week 7 - Error 0.01953
Week 8 - Error 0.01668
Week 9 - Error 0.00882
Week 10 - Error 0.02111
Week 11 - Error 0.01779
Week 12 - Error 0.01140
Week 13 - Error 0.01027
Week 14 - Error 0.01500
Week 15 - Error 0.01779
Mean Error = 0.01911
Wall time: 1h 54min 46s
