# Data load in and transformation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
from sklearn.cross_validation import train_test_split

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200





train = pd.read_csv("../Data/train.csv")
test = pd.read_csv("../Data/test.csv")
loss = train["loss"]
train_id = train["id"]
test_id = test['id']

del train["loss"]
del train["id"]
del test["id"]

# remove NZV
catFeature = []
for i in range(1,116):
    catFeature.append("cat%i" % i)

ratio = []

for col in catFeature:
    max_ratio = train[col].value_counts().max()/len(train[col])
    ratio.append(max_ratio)

rmFeature = np.array(catFeature)[np.array(ratio)>=1]

for i in rmFeature:
    del train[i]
    del test[i]    

# reduce the number of categories in some cat variable
train["cat109"] = train["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)
test["cat109"] = test["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)

cat_110_domi = train.cat110.value_counts().index[(train.cat110.value_counts()/len(train.cat109))>0.01].values
train["cat110"] = train["cat110"].apply(lambda x: "other" if x not in cat_110_domi else x)
test["cat109"] = test["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)
cat_113_domi = train.cat113.value_counts().index[(train.cat113.value_counts()/len(train.cat113))>0.005].values
train["cat113"] = train["cat113"].apply(lambda x: "other" if x not in cat_113_domi else x)
test["cat109"] = test["cat109"].apply(lambda x: "other" if x not in ("BI","AB") else x)

# assign train to X
X = train

# Categorize the continuous variables
breakpoint_normal = [-0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint2 = [-0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.0]
breakpoint4 = [-0.01,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint5 = [-0.01,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint8 = [-0.01,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
breakpoint14 = [-0.01,0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.0]
breakpoint_special = [breakpoint2,breakpoint4,breakpoint5,breakpoint8,breakpoint14]

columns_normal_cut = ['cont1', 'cont3', 'cont6', 'cont7', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13',]
columns_special_cut = ['cont2','cont4','cont5','cont8','cont14']

# normal cut
for i in columns_normal_cut:
    X[i] = pd.cut(X[i],breakpoint_normal)
    test[i] = pd.cut(test[i],breakpoint_normal)

# special cut
k = 0
for i in columns_special_cut:
    X[i] = pd.cut(X[i],breakpoint_special[k])
    test[i] = pd.cut(test[i],breakpoint_special[k])

test.index = range(188318,188318+125546)
ALL = pd.concat([X,test],axis=0)

# transform the cats to dummy
obj = ALL.dtypes.index[ALL.dtypes.apply(lambda x: x in ["object","category"])]
for i in obj:
    new_cat = pd.get_dummies(ALL[i],prefix=i,drop_first=True)
    del ALL[i]
    ALL = pd.concat([ALL,new_cat],axis = 1)

X = ALL.ix[:188317,:]
test = ALL.ix[188318:,:]
del ALL

X = X.as_matrix()
test = test.as_matrix()
loss = loss.as_matrix()

# xgboost

In [15]:
import xgboost as xgb

In [16]:
gbm = xgb.XGBRegressor(max_depth=6, learning_rate=0.02, n_estimators=2000,silent=False,nthread=-1,
                      seed=42)

In [17]:
loss_log = np.log(loss)

In [18]:
gbm.fit(X,loss_log)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.02, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=2000, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=False, subsample=1)

In [19]:
y_pred_log = gbm.predict(test)
y_pred = np.exp(y_pred_log)

In [20]:
export = pd.DataFrame({"id":test_id,"loss":y_pred})
export.to_csv("../Sub/submission_gbm10.csv",index=False)

In [21]:
y_pred

array([ 1407.27856445,  1813.08435059,  8962.53320312, ...,  2737.31152344,
        1011.90887451,  3356.33422852], dtype=float32)