In [1]:
import warnings
# Install a global filter that suppresses every DeprecationWarning raised
# after this point.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Import some libraries
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import time

The minimum supported version is 2.4.6



In [3]:
# Read the dataframes
store = pd.HDFStore('../../data/store_2016.h5')
t1 = time.time()
train = store["train"]
prop = store["prop"]
t2 = time.time()
print 'it took ', t2-t1, ' seconds to read the dataframes'

it took  402.871702909  seconds to read the dataframes


In [4]:
# Determine the upper and lower bounds for outlier removal
y = train.logerror
mid = np.percentile(y, 50)
y = y - mid
q1 = np.percentile(y, 25)
q3 = np.percentile(y, 75)
print q1, q3
interval = q3 - q1
fac = 8.0
interval = interval * fac / 2.
hi = interval + mid
lo = -interval + mid
print hi, lo

-0.0313 0.0332
0.264 -0.252


In [5]:
# use the first 9 months for train and last 3 months for test
x1 = train[train.month < 10]    # use for train
x0 = train[train.month > 9]     # use for test
print "Size of the x1 data frame: ", x1.shape
print "Size of the x0 data frame: ", x0.shape

y1 = x1['logerror'].values
y0 = x0['logerror'].values

index_hi = y1 > hi  
index_lo = y1 < lo   
print sum(index_hi), sum(index_lo)

y1 = y1[(~index_lo) & (~index_hi)]
x1 = x1[(~index_lo) & (~index_hi)]

print "Size of the x1 data frame: ", x1.shape
print "Size of the x0 data frame: ", x0.shape

Size of the x1 data frame:  (81635, 262)
Size of the x0 data frame:  (8515, 262)
2084 1431
Size of the x1 data frame:  (78120, 262)
Size of the x0 data frame:  (8515, 262)


In [6]:
# make a xgb regressor object
import xgb
import multiprocessing

ncpu = multiprocessing.cpu_count()
print "number of cores " + str(ncpu)

model = xgb.XGBoostReg(
        eval_metric = 'mae',
        nthread = ncpu,
        eta = 0.01,
        max_depth = 9,
        subsample = 1.,
        colsample_bytree = .5,
        min_child_weight = 70,
        silent = 1
        )
nround = 470
from sklearn.metrics import mean_absolute_error

number of cores 6




In [7]:
# Fit on the outlier-filtered training rows. "month" (the split key) and
# "logerror" (the target) are removed so they are not passed as features.
model.fit(
    x1.drop(["month", "logerror"], axis=1),
    y1,
    num_boost_round=nround,
)

In [8]:
from sklearn.metrics import mean_absolute_error

print "Error on training data ", mean_absolute_error(y1, model.predict(x1.drop(["month", "logerror"], axis=1)))
print "Error on 3 months test ", mean_absolute_error(y0, model.predict(x0.drop(["month", "logerror"], axis=1)))

Error on training data  0.04346076361
Error on 3 months test  0.0644482796357


# Cross-Validation

In [9]:
y = train.logerror
mid = np.percentile(y, 50)
y = y - mid
q1 = np.percentile(y, 25)
q3 = np.percentile(y, 75)
print q1, q3

fac = 8.0
interval = q3 - q1
interval = interval * fac / 2.
hi_train = interval + mid
lo_train = -interval + mid

fac = 65.0
interval = q3 - q1
interval = interval * fac / 2.
hi_test = interval + mid
lo_test = -interval + mid

print lo_train, hi_train
print lo_test, hi_test

-0.0313 0.0332
-0.252 0.264
-2.09025 2.10225


In [10]:
# Generate a list of outliers for training and testing. This let us make predictions and compute 
# the scores for all datapoints
y = train['logerror'].values
x = train.drop(['month', 'logerror'], axis=1)
print "Size of the train data frame: ", x.shape
print "Size of the prop  data frame: ", prop.shape

print("Generate a list of outliers that should be droped for training")
index_hi = y > hi_train   
index_lo = y < lo_train   
print sum(index_hi), sum(index_lo)

outliers_train = []
for ii in range(y.shape[0]):
    if index_hi[ii] or index_lo[ii]:
        outliers_train.append(ii)
        
print("Generate a list of outliers that should be droped for testing")
index_hi = y > hi_test   
index_lo = y < lo_test   
print sum(index_hi), sum(index_lo)

outliers_test = []
for ii in range(y.shape[0]):
    if index_hi[ii] or index_lo[ii]:
        outliers_test.append(ii)

Size of the train data frame:  (90150, 260)
Size of the prop  data frame:  (2883630, 260)
Generate a list of outliers should be droped for training
2310 1568
Generate a list of outliers should be droped for testing
51 46


In [11]:
def splitDataFrameIntoSmaller(df, chunkSize = 100000):
    """Return the row offsets that cut ``df`` into consecutive chunks.

    Despite its name, the function does not slice or copy ``df``; it only
    looks at ``len(df)``.

    Parameters
    ----------
    df : any object supporting ``len()``
        The container to be processed chunk by chunk.
    chunkSize : int, default 100000
        Maximum number of rows per chunk. Must be positive.

    Returns
    -------
    list of int
        Strictly increasing offsets ``[0, chunkSize, 2*chunkSize, ..., len(df)]``.
        Each consecutive pair ``(b[i], b[i+1])`` is a non-empty half-open row
        range. For an empty ``df`` the result is ``[0]``, i.e. no chunks.

    Raises
    ------
    ValueError
        If ``chunkSize`` is not positive.
    """
    if chunkSize <= 0:
        raise ValueError("chunkSize must be a positive integer")
    numberOfRows = len(df)
    # range() stops before numberOfRows, so appending it never duplicates the
    # last boundary -- even when numberOfRows is an exact multiple of chunkSize.
    boundaries = list(range(0, numberOfRows, chunkSize))
    boundaries.append(numberOfRows)
    return boundaries

# Row offsets over `prop` (default chunk size); consecutive pairs are used
# below to run predictions on `prop` one slice at a time.
split_index = splitDataFrameIntoSmaller(prop)

In [12]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

skf = KFold(n_splits = 10, shuffle = True, random_state = 44)

train_pred = np.zeros(train.shape[0], dtype=np.float16)
prop_pred = np.zeros(prop.shape[0], dtype=np.float16)
scores1 = []; scores2 = []

for train_index, test_index in skf.split(x, y):
    
    train_index_wo = [ix for ix in train_index if ix not in outliers_train]
    test_index_wo = [ix for ix in test_index if ix not in outliers_test]
     
    x1, x0 = x.iloc[train_index_wo], x.iloc[test_index_wo]
    y1, y0 = y[train_index_wo], y[test_index_wo]
    
    model.fit(x1.drop(["id_parcel"], axis=1), y1, num_boost_round= nround) # Train the model without outliers
    
    #calculate score without second outliers
    scores1.append(mean_absolute_error(y0, model.predict(x0.drop(["id_parcel"], axis=1))))
    print "Score without outliers for the ", len(scores1), " fold is ", scores1[len(scores1)-1]
    
    #calculate score with outliers
    x0 = x.iloc[test_index]
    y0 = y[test_index]
    
    pred = model.predict(x0.drop(["id_parcel"], axis=1))
    scores2.append(mean_absolute_error(y0, pred))
    
    for ii, idx in enumerate(test_index):
        train_pred[idx] = pred[ii]
    
    for ii in range(0, len(split_index)-1):
        n1 = split_index[ii]; n2 = split_index[ii+1]
        pred = model.predict(prop.iloc[n1:n2].drop(['id_parcel'], axis=1))
        prop_pred[n1:n2] += pred
    
print "Average score without outliers over all folds : " , np.mean(scores1), " ", np.std(scores1)
print "Average score with    outliers over all folds : " , np.mean(scores2), " ", np.std(scores2)



KeyboardInterrupt: 

In [None]:
# Prepare the predictions for submission
from datetime import datetime

months = ["201610", "201611", "201612", "201710", "201711", "201712"]

# prop_pred holds the sum of the per-fold predictions, so divide by the number
# of folds actually configured on `skf` instead of a hard-coded 10. The cast to
# float64 happens before the division so no precision is lost in the quotient.
n_folds = skf.get_n_splits()
prop_mean = prop_pred.astype(np.float64) / n_folds

# Averaged predictions for every property; the same value is used for each
# submission month
out = pd.DataFrame()
out["ParcelId"] = prop["id_parcel"]
for col in months:
    out[col] = prop_mean

# Out-of-fold predictions for the training rows
# (adding a constant offset of 0.02 was tried here and is currently disabled)
out_train = pd.DataFrame()
out_train["ParcelId"] = train["id_parcel"]
for col in months:
    out_train[col] = train_pred


print("Read the missing")
miss = store["miss"]

# Rows of the "miss" table get the median training logerror for every month
med = train.logerror.median()
for col in months:
    miss[col] = med

miss = miss[["id_parcel"]+months]
miss.columns = ["ParcelId"] + months

out = pd.concat([out, out_train, miss], axis=0)

# The file name now contains a '{}' placeholder: the previous literal
# 'xgboost.csv' had none, so the timestamp passed to format() was discarded
# and every run overwrote the same file.
stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
out.to_csv('xgboost_{}.csv'.format(stamp), index=False, float_format='%.4f')