# Data load in and transformation

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
from sklearn.cross_validation import train_test_split

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200





train = pd.read_csv("../Data/train.csv")
test = pd.read_csv("../Data/test.csv")
loss = train["loss"]
train_id = train["id"]
test_id = test['id']

del train["loss"]
del train["id"]
del test["id"]

# remove NZV
catFeature = []
for i in range(1,116):
    catFeature.append("cat%i" % i)

ratio = []

for col in catFeature:
    max_ratio = train[col].value_counts().max()/len(train[col])
    ratio.append(max_ratio)

rmFeature = np.array(catFeature)[np.array(ratio)>=1]

for i in rmFeature:
    del train[i]
    del test[i]    

# reduce the number of categories in some cat variable
train["cat109"] = train["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)
test["cat109"] = test["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)

cat_110_domi = train.cat110.value_counts().index[(train.cat110.value_counts()/len(train.cat109))>0.01].values
train["cat110"] = train["cat110"].apply(lambda x: "other" if x not in cat_110_domi else x)
test["cat109"] = test["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)
cat_113_domi = train.cat113.value_counts().index[(train.cat113.value_counts()/len(train.cat113))>0.005].values
train["cat113"] = train["cat113"].apply(lambda x: "other" if x not in cat_113_domi else x)
test["cat109"] = test["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)

# assign train to X
X = train

# Categorize the continuous variables
breakpoint_normal = [-0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint2 = [-0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.0]
breakpoint4 = [-0.01,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint5 = [-0.01,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint8 = [-0.01,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint14 = [-0.01,0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.0]
breakpoint_special = [breakpoint2,breakpoint4,breakpoint5,breakpoint8,breakpoint14]

columns_normal_cut = ['cont1', 'cont3', 'cont6', 'cont7', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',]
columns_special_cut = ['cont2','cont4','cont5','cont8','cont14']

# normal cut
for i in columns_normal_cut:
    X[i] = pd.cut(X[i],breakpoint_normal)
    test[i] = pd.cut(test[i],breakpoint_normal)

# special cut
k = 0
for i in columns_special_cut:
    X[i] = pd.cut(X[i],breakpoint_special[k])
    test[i] = pd.cut(test[i],breakpoint_special[k])

test.index = range(188318,188318+125546)
ALL = pd.concat([X,test],axis=0)

# transform the cats to dummy
obj = ALL.dtypes.index[ALL.dtypes.apply(lambda x: x in ["object","category"])]
for i in obj:
    new_cat = pd.get_dummies(ALL[i],prefix=i,drop_first=True)
    del ALL[i]
    ALL = pd.concat([ALL,new_cat],axis = 1)

X = ALL.ix[:188317,:]
test = ALL.ix[188318:,:]
del ALL

# check the shape

In [21]:
assert X.shape[1] == test.shape[1]

# Model

In [22]:
# build random forest
from sklearn.ensemble import RandomForestRegressor

rft = RandomForestRegressor(n_estimators=350, max_features="sqrt", criterion='mse',
                            n_jobs=-1, random_state=42, verbose=3)

In [23]:
y_log = np.log(loss)

In [24]:
# fit model
rft.fit(X,y_log)
# rft_pred = rft.predict(X_test)

building tree 1 of 350
building tree 2 of 350
building tree 3 of 350
building tree 4 of 350
building tree 5 of 350
building tree 6 of 350
building tree 7 of 350
building tree 8 of 350
building tree 9 of 350
building tree 10 of 350
building tree 11 of 350
building tree 12 of 350
building tree 13 of 350
building tree 14 of 350
building tree 15 of 350
building tree 16 of 350
building tree 17 of 350
building tree 18 of 350
building tree 19 of 350
building tree 20 of 350
building tree 21 of 350
building tree 22 of 350
building tree 23 of 350
building tree 24 of 350


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.5s


building tree 25 of 350
building tree 26 of 350
building tree 27 of 350
building tree 28 of 350
building tree 29 of 350
building tree 30 of 350
building tree 31 of 350
building tree 32 of 350
building tree 33 of 350
building tree 34 of 350
building tree 35 of 350
building tree 36 of 350
building tree 37 of 350
building tree 38 of 350
building tree 39 of 350
building tree 40 of 350
building tree 41 of 350
building tree 42 of 350
building tree 43 of 350
building tree 44 of 350
building tree 45 of 350
building tree 46 of 350
building tree 47 of 350
building tree 48 of 350
building tree 49 of 350
building tree 50 of 350
building tree 51 of 350
building tree 52 of 350
building tree 53 of 350
building tree 54 of 350
building tree 55 of 350
building tree 56 of 350
building tree 57 of 350
building tree 58 of 350
building tree 59 of 350
building tree 60 of 350
building tree 61 of 350
building tree 62 of 350
building tree 63 of 350
building tree 64 of 350
building tree 65 of 350
building tree 66

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.1min


building tree 121 of 350
building tree 122 of 350
building tree 123 of 350
building tree 124 of 350
building tree 125 of 350
building tree 126 of 350
building tree 127 of 350
building tree 128 of 350
building tree 129 of 350
building tree 130 of 350
building tree 131 of 350
building tree 132 of 350
building tree 133 of 350
building tree 134 of 350
building tree 135 of 350
building tree 136 of 350
building tree 137 of 350
building tree 138 of 350
building tree 139 of 350
building tree 140 of 350
building tree 141 of 350
building tree 142 of 350
building tree 143 of 350
building tree 144 of 350
building tree 145 of 350
building tree 146 of 350
building tree 147 of 350
building tree 148 of 350
building tree 149 of 350
building tree 150 of 350
building tree 151 of 350
building tree 152 of 350
building tree 153 of 350
building tree 154 of 350
building tree 155 of 350
building tree 156 of 350
building tree 157 of 350
building tree 158 of 350
building tree 159 of 350
building tree 160 of 350


[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.5min


building tree 282 of 350
building tree 283 of 350
building tree 284 of 350
building tree 285 of 350
building tree 286 of 350
building tree 287 of 350
building tree 288 of 350
building tree 289 of 350
building tree 290 of 350
building tree 291 of 350
building tree 292 of 350
building tree 293 of 350
building tree 294 of 350
building tree 295 of 350
building tree 296 of 350
building tree 297 of 350
building tree 298 of 350
building tree 299 of 350
building tree 300 of 350
building tree 301 of 350
building tree 302 of 350
building tree 303 of 350
building tree 304 of 350
building tree 305 of 350
building tree 306 of 350
building tree 307 of 350
building tree 308 of 350
building tree 309 of 350
building tree 310 of 350
building tree 311 of 350
building tree 312 of 350
building tree 313 of 350
building tree 314 of 350
building tree 315 of 350
building tree 316 of 350
building tree 317 of 350
building tree 318 of 350
building tree 319 of 350
building tree 320 of 350
building tree 321 of 350


[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:  3.2min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=350, n_jobs=-1, oob_score=False, random_state=42,
           verbose=3, warm_start=False)

In [25]:
# predict and output
y_pred_log = rft.predict(test)
y_pred = np.exp(y_pred_log)

[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    7.9s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:   18.3s
[Parallel(n_jobs=8)]: Done 350 out of 350 | elapsed:   20.2s finished


In [26]:
y_pred

array([ 1789.00838472,  1931.21138983,  7217.71386255, ...,  2457.66501717,
        1024.04463757,  3753.54493233])

In [27]:
len(y_pred)

125546

In [28]:
export = pd.DataFrame({"id":test_id,"loss":y_pred})

In [29]:
export.to_csv("../Sub/submission2.csv",index=False)