In [None]:
%load_ext Cython
import os,sys,random,datetime,time,itertools
import cPickle as pickle, numpy as np, pandas as pd
import gc
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb

# Load Data

## Compute pairwise distances

In [None]:
# Load only the parcel coordinates; parcelid (the first selected column) becomes the index.
prop = pd.read_csv(
    'raw/properties_2016.csv.zip',
    usecols=['parcelid', 'latitude', 'longitude'],
    index_col=0,
)
# Rescale both axes to a common planar unit
# (presumably miles per degree near the data's latitude - TODO confirm).
prop['latitude'] = prop['latitude'] / 1e6 * 69
prop['longitude'] = prop['longitude'] / 1e6 * 57.393401296
# Centre each axis on its mean, then drop parcels with a missing coordinate.
prop = prop.sub(prop.mean()).dropna(axis=0)

In [None]:
# C-contiguous copy of the coordinates, as required by the typed buffer in pairwise_v2.
X = np.array(prop.values, order='C', copy=True)
# Ring radii; pairwise_v2 compares squared distances against the squares of these values.
dist_rings = [1, 2, 5, 10, 20, 30, 50]

In [None]:
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_v2(np.ndarray[double, ndim=2, mode='c'] X not None, np.int_t n): #mode='c' indicates C-ordered (contiguous in memory)
    """Count the rows of ``X`` falling in each distance ring around the first ``n`` rows.

    For every row ``i < n`` the squared Euclidean distance to every row ``j`` of
    ``X`` (including ``j == i``) is compared against the squared ring radii
    1, 4, 25, 100, 400, 900, 2500. Each row ``j`` is assigned to the first ring
    whose bound it does not exceed; rows farther than the last bound are ignored.

    Parameters
    ----------
    X : C-contiguous 2-D array of doubles, one point per row.
    n : number of leading rows of ``X`` to compute counts for (0 <= n <= X.shape[0]).

    Returns
    -------
    Integer array of shape ``(n, 7)``. Every hit adds 2 to its cell: the
    original implementation incremented both endpoints of each ordered pair,
    which for ``n == X.shape[0]`` yields exactly twice the neighbour count, and
    that scale is kept so previously generated files remain comparable.

    Raises
    ------
    ValueError if ``n`` is negative or larger than ``X.shape[0]``.
    """
    cdef np.intp_t i, j, k, n_samples, N  # np.intp: signed integer type used for indexing
    cdef double tmp, d
    n_samples = X.shape[0]
    N = X.shape[1]

    # Bounds checking is disabled above, so an invalid n would read or write
    # outside the arrays instead of raising.
    if n < 0 or n > n_samples:
        raise ValueError("n must be between 0 and X.shape[0]")

    cdef np.ndarray[np.int_t, ndim=2, mode='c'] D = np.zeros((n,7),dtype=int)
    for i in range(n):
        for j in range(n_samples):
            d = 0
            for k in range(N):
                tmp = X[i,k] - X[j,k]
                d += tmp * tmp
            # Only row i is updated: D has n rows, so writing D[j, ...] for
            # j >= n (as the previous version did) was an out-of-bounds write.
            if d<=1:
                D[i,0]+=2
            elif d<=4:
                D[i,1]+=2
            elif d<=25:
                D[i,2]+=2
            elif d<=100:
                D[i,3]+=2
            elif d<=400:
                D[i,4]+=2
            elif d<=900:
                D[i,5]+=2
            elif d<=2500:
                D[i,6]+=2
    return D

In [None]:
# The all-pairs scan is expensive, so it only runs when the cached result is
# missing. Delete the file to force a recomputation.
neighbors_path = 'raw/neighbors.csv.bz2'
if not os.path.exists(neighbors_path):
    Y = pairwise_v2(X, X.shape[0])
    neighbors = pd.DataFrame(Y,
                             columns=["N{}".format(d) for d in dist_rings],
                             index=prop.index[:Y.shape[0]])
    neighbors.to_csv(neighbors_path, index=True, compression='bz2')

## Feature engineering

In [None]:
# Name of the column the models predict.
target = 'logerror'
# Target bounds; not referenced elsewhere in the visible notebook.
outlier = (-0.5, 0.5)

# Columns excluded when building `cols`: identifiers, the transaction date,
# the target itself and the submission columns.
dropfeat = [
    'index', 'parcelid', 'ParcelId', 'transactiondate', target,
    '201610', '201611', '201612',
    '201710', '201711', '201712',
]

# Columns that get integer-encoded with pd.factorize.
catfeat = [
    'propertyzoningdesc',
    'propertycountylandusecode',
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingqualitytypeid',
    'heatingorsystemtypeid',
    'propertylandusetypeid',
    'regionidcounty',
    'typeconstructiontypeid',
    'N-PropType',
]

# Flag columns that get converted to booleans.
boolfeat = ['hashottuborspa', 'fireplaceflag', 'taxdelinquencyflag']

# Remaining raw numeric columns.
numfeat = [
    'basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingclasstypeid',
    'calculatedbathnbr', 'decktypeid', 'finishedfloor1squarefeet',
    'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet13',
    'finishedsquarefeet15', 'finishedsquarefeet50', 'finishedsquarefeet6',
    'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft',
    'latitude', 'longitude', 'lotsizesquarefeet', 'poolcnt', 'poolsizesum',
    'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'rawcensustractandblock',
    'regionidcity', 'regionidneighborhood', 'regionidzip', 'roomcnt',
    'storytypeid', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17',
    'yardbuildingsqft26', 'yearbuilt', 'numberofstories',
    'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear',
    'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear',
    'censustractandblock',
]

In [None]:
# Neighbour counts do not depend on the year, so read and accumulate them once
# instead of once per loop iteration. The file's first column is parcelid and
# the next seven are per-ring counts; the running sum turns them into
# "everything up to this ring" counts.
dist = pd.read_csv('raw/neighbors.csv.bz2')
for i in range(1,7): dist.iloc[:,i+1]+=dist.iloc[:,i]

yearly_train = []  # one transactions+features frame per year, in year order
for year in [2016,2017]:
    prop=pd.read_csv('raw/properties_{}.csv.zip'.format(year))

    # Age, area and room ratios/differences.
    prop['N-life'] = 2018 - prop['yearbuilt']
    prop['N-LivingAreaError'] = prop['calculatedfinishedsquarefeet']/prop['finishedsquarefeet12']
    prop['N-LivingAreaProp'] = prop['calculatedfinishedsquarefeet']/prop['lotsizesquarefeet']
    prop['N-LivingAreaProp2'] = prop['finishedsquarefeet12']/prop['finishedsquarefeet15']
    prop['N-ExtraSpace'] = prop['lotsizesquarefeet'] - prop['calculatedfinishedsquarefeet']
    prop['N-ExtraSpace-2'] = prop['finishedsquarefeet15'] - prop['finishedsquarefeet12']
    prop['N-TotalRooms'] = prop['bathroomcnt']+prop['bedroomcnt']
    prop['N-AvRoomSize'] = prop['calculatedfinishedsquarefeet']/prop['roomcnt']
    prop['N-ExtraRooms'] = prop['roomcnt'] - prop['N-TotalRooms']
    prop['N-ValueProp'] = prop['structuretaxvaluedollarcnt']/prop['landtaxvaluedollarcnt']
    prop['N-GarPoolAC'] = ((prop['garagecarcnt']>0) & (prop['pooltypeid10']>0) & (prop['airconditioningtypeid']!=5))*1

    # Location combinations. These use the raw latitude/longitude, so they must
    # stay ahead of the rescaling of those two columns further down.
    prop["N-location"] = prop["latitude"] + prop["longitude"]
    prop["N-location-2"] = prop["latitude"]*prop["longitude"]
    prop["N-location-2round"] = prop["N-location-2"].round(-4)
    prop["N-latitude-round"] = prop["latitude"].round(-4)
    prop["N-longitude-round"] = prop["longitude"].round(-4)

    # Tax-derived features.
    prop['N-ValueRatio'] = prop['taxvaluedollarcnt']/prop['taxamount']
    prop['N-TaxScore'] = prop['taxvaluedollarcnt']*prop['taxamount']
    prop["N-taxdelinquencyyear-2"] = prop["taxdelinquencyyear"] ** 2
    prop["N-taxdelinquencyyear-3"] = prop["taxdelinquencyyear"] ** 3
    prop['N-life_tax'] = year - prop['taxdelinquencyyear']

    # Number of parcels sharing the same zip / city / county.
    zip_count = prop['regionidzip'].value_counts().to_dict()
    prop['N-zip_count'] = prop['regionidzip'].map(zip_count)
    city_count = prop['regionidcity'].value_counts().to_dict()
    prop['N-city_count'] = prop['regionidcity'].map(city_count)
    region_count = prop['regionidcounty'].value_counts().to_dict()
    prop['N-county_count'] = prop['regionidcounty'].map(region_count)

    # Indicators and a coarse property-type grouping.
    prop['N-ACInd'] = (prop['airconditioningtypeid']!=5)*1
    prop['N-HeatInd'] = (prop['heatingorsystemtypeid']!=13)*1
    prop['N-PropType'] = prop.propertylandusetypeid.replace({31 : "Mixed", 46 : "Other", 47 : "Mixed", 246 : "Mixed", 247 : "Mixed", 248 : "Mixed", 260 : "Home", 261 : "Home", 262 : "Home", 263 : "Home", 264 : "Home", 265 : "Home", 266 : "Home", 267 : "Home", 268 : "Home", 269 : "Not Built", 270 : "Home", 271 : "Home", 273 : "Home", 274 : "Other", 275 : "Home", 276 : "Home", 279 : "Home", 290 : "Not Built", 291 : "Not Built" })
    prop["N-structuretaxvaluedollarcnt-2"] = prop["structuretaxvaluedollarcnt"] ** 2
    prop["N-structuretaxvaluedollarcnt-3"] = prop["structuretaxvaluedollarcnt"] ** 3

    # Structure value relative to the mean of the parcel's city.
    group = prop.groupby('regionidcity')['structuretaxvaluedollarcnt'].aggregate('mean').to_dict()
    prop['N-Avg-structuretaxvaluedollarcnt'] = prop['regionidcity'].map(group)
    prop['N-Dev-structuretaxvaluedollarcnt'] = abs((prop['structuretaxvaluedollarcnt'] - prop['N-Avg-structuretaxvaluedollarcnt']))/prop['N-Avg-structuretaxvaluedollarcnt']

    # Same coordinate rescaling as in the neighbour-count section, followed by
    # the planar distance from the mean coordinate.
    prop['latitude'] /= 1e6;     prop['latitude'] *= 69
    prop['longitude'] /= 1e6;    prop['longitude'] *= 57.393401296
    prop['dist_from_cent'] = np.sqrt((prop['latitude']-prop['latitude'].mean())**2+(prop['longitude']-prop['longitude'].mean())**2)

    prop = prop.merge(dist,how='left',on='parcelid')
    train=pd.read_csv('raw/train_{}.csv.zip'.format(year)).sort_values('transactiondate')

    # Halve the memory of the float columns before the merge with the transactions.
    for c in prop.columns[(prop.dtypes == np.float64).values]:
        prop[c] = prop[c].astype(np.float32)

    train = train.merge(prop, how='left', on='parcelid')
    train.transactiondate=pd.to_datetime(train.transactiondate)
    yearly_train.append(train)

# reset_index() keeps the old per-year positions as an 'index' column, which
# dropfeat excludes from the model features.
traincomb = pd.concat(yearly_train).reset_index()

train=traincomb
# The submission template lists the parcels to predict; attach the features of
# the last properties frame loaded above.
test=pd.read_csv('raw/sample_submission.csv.zip')
test['parcelid'] = test['ParcelId']
test = test.merge(prop, how='left', on='parcelid')

# Flags: True where the value equals the first non-missing value seen in train.
for c in boolfeat:
    true_value = train[c].dropna().iloc[0]
    test[c] = (test[c] == true_value)
    train[c] = (train[c] == true_value)

# Categoricals: integer codes shared by train and test.
for c in catfeat:
    # factorize marks missing values with -1; shifting by one reserves 0 for
    # "missing" and gives real categories 1..k. (Passing na_sentinel=0 made
    # missing values share code 0 with the first category.)
    codes = pd.factorize(pd.concat((train[c],test[c])),sort=False)[0].astype(int) + 1
    train[c]=codes[:train.shape[0]]
    test[c]=codes[train.shape[0]:]
    # Categories never seen in train become NaN in test. Assigning the column
    # as a whole replaces the previous chained `test[c].iloc[...] = nan`, which
    # is not a reliable way to write into `test`.
    test[c] = test[c].where(test[c].isin(train[c].unique()))

train['month']=train.transactiondate.dt.month
train['year']=train.transactiondate.dt.year
trainN=train.shape[0]
cols=[c for c in train if c not in dropfeat]
extra_y=train[['month',target]]

# Level-1

In [None]:
# Row positions where the transaction month changes (the first row always
# counts), with the total row count appended as the closing boundary.
month_of_row = extra_y['month']
month_starts = np.where(month_of_row != month_of_row.shift())[0]
ms = np.append(month_starts, train.shape[0])

# Empty containers for the level-1 predictions: train rows from boundary 13
# onwards, and every test row.
train2 = pd.DataFrame(columns=[], index=train.index[ms[13]:])
test2 = pd.DataFrame(columns=[], index=test.index)

In [None]:
def _positions(*spans):
    """Concatenate (stop,) / (start, stop[, step]) spans into one integer position array."""
    # np.arange keeps an integer dtype even for an empty span; np.r_ over
    # Python lists upcasts the whole array to float64 when a piece is empty.
    return np.concatenate([np.arange(*span) for span in spans]).astype(np.intp)


def _build_folds(ms):
    """Return the list of (fit rows, validation rows, held-out rows, months to predict).

    ``ms`` holds the row positions of the month boundaries.
    """
    folds = []
    # Rolling folds: hold out the block ms[k+12]:ms[k+13]. Blocks k and k+11
    # are split by parity - even offsets are fitted on, odd offsets are used
    # for early stopping. No test-set months are predicted by these folds.
    for k in range(1, 9):
        fit = _positions((ms[k],),
                         (ms[k], ms[k+1], 2),
                         (ms[k+1], ms[k+11]),
                         (ms[k+11], ms[k+12], 2),
                         (ms[k+13], ms[-1]))
        val = _positions((ms[k+11]+1, ms[k+12], 2),
                         (ms[k]+1, ms[k+1], 2))
        folds.append((fit, val, range(ms[k+12], ms[k+13]), []))
    # Forecast folds: block k is left out of fitting and used whole for
    # validation together with the odd offsets of block 20; the fitted model
    # predicts test-set month k+1.
    # NOTE(review): the held-out span here is ms[20]:ms[21], whose even offsets
    # are also in the fit set. The level-1 loops store predictions for any
    # non-empty held-out span, so these folds overwrite the predictions the
    # last rolling fold produced for that span - confirm this is intended.
    for k in (9, 10, 11):
        fit = _positions((ms[k],),
                         (ms[k+1], ms[20]),
                         (ms[20], ms[21], 2))
        val = _positions((ms[20]+1, ms[21], 2),
                         (ms[k], ms[k+1]))
        folds.append((fit, val, range(ms[20], ms[21]), [k+1]))
    return folds


trainvaltestpred = _build_folds(ms)
patience=100  #for early stopping
# The next two constants are not referenced elsewhere in the visible notebook.
maxncols=500
nmonths=8

## Level-1 xgboost

In [None]:
# Settings shared by all level-1 XGBoost models; the training loop overrides
# colsample_bytree, max_depth and subsample (and adds alpha) from plist.
xgbparams = {
    'eta': 0.01,
    'max_depth': 5,
    'subsample': .7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'silent': 1,
}
# One row per model: [alpha, colsample_bytree, max_depth, subsample].
plist = [
    [1.142214, 0.207717, 7, 0.993270],
    [2.883730, 0.121081, 9, 0.969598],
    [2.566404, 0.160620, 10, 0.909566],
]
# Feature columns for each level-1 XGBoost model; clist[i] is used together with plist[i].
clist = [['airconditioningtypeid','architecturalstyletypeid','basementsqft','bathroomcnt','bedroomcnt','buildingclasstypeid','buildingqualitytypeid','calculatedbathnbr','decktypeid','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fips','fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','hashottuborspa','heatingorsystemtypeid','latitude','longitude','lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip','roomcnt','storytypeid','threequarterbathnbr','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories','fireplaceflag','structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount','taxdelinquencyflag','taxdelinquencyyear','censustractandblock','N-life','N-LivingAreaError','N-LivingAreaProp','N-ExtraSpace','N-TotalRooms','N-AvRoomSize','N-ExtraRooms','N-ValueProp','N-GarPoolAC','N-location','N-location-2','N-location-2round','N-latitude-round','N-longitude-round','N-ValueRatio','N-TaxScore','N-taxdelinquencyyear-2','N-taxdelinquencyyear-3','N-life_tax','N-zip_count','N-city_count','N-county_count','N-ACInd','N-HeatInd','N-PropType','N-structuretaxvaluedollarcnt-2','N-structuretaxvaluedollarcnt-3','N-Avg-structuretaxvaluedollarcnt','N-Dev-structuretaxvaluedollarcnt','dist_from_cent','N1','N2','N5','N10','N20','N30','N50','month','year'],
         ['airconditioningtypeid','architecturalstyletypeid','basementsqft','bathroomcnt','bedroomcnt','buildingclasstypeid','buildingqualitytypeid','calculatedbathnbr','decktypeid','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fips','fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','hashottuborspa','heatingorsystemtypeid','latitude','longitude','lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid2','pooltypeid7','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip','roomcnt','storytypeid','threequarterbathnbr','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories','fireplaceflag','structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount','taxdelinquencyflag','taxdelinquencyyear','censustractandblock','N-life','N-LivingAreaError','N-LivingAreaProp','N-LivingAreaProp2','N-ExtraSpace','N-TotalRooms','N-AvRoomSize','N-ExtraRooms','N-ValueProp','N-GarPoolAC','N-location','N-location-2','N-location-2round','N-latitude-round','N-longitude-round','N-ValueRatio','N-TaxScore','N-taxdelinquencyyear-2','N-taxdelinquencyyear-3','N-life_tax','N-zip_count','N-city_count','N-county_count','N-ACInd','N-HeatInd','N-PropType','N-structuretaxvaluedollarcnt-2','N-structuretaxvaluedollarcnt-3','N-Avg-structuretaxvaluedollarcnt','N-Dev-structuretaxvaluedollarcnt','dist_from_cent','N1','N2','N5','N10','N20','N30','N50','month','year'],
         ['airconditioningtypeid','architecturalstyletypeid','basementsqft','bathroomcnt','bedroomcnt','buildingclasstypeid','buildingqualitytypeid','calculatedbathnbr','decktypeid','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fips','fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','hashottuborspa','heatingorsystemtypeid','latitude','longitude','lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip','roomcnt','storytypeid','threequarterbathnbr','typeconstructiontypeid','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories','fireplaceflag','structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount','taxdelinquencyflag','taxdelinquencyyear','censustractandblock','N-life','N-LivingAreaError','N-LivingAreaProp','N-ExtraSpace','N-TotalRooms','N-AvRoomSize','N-ExtraRooms','N-ValueProp','N-GarPoolAC','N-location','N-location-2','N-location-2round','N-latitude-round','N-longitude-round','N-ValueRatio','N-TaxScore','N-taxdelinquencyyear-2','N-taxdelinquencyyear-3','N-life_tax','N-zip_count','N-city_count','N-county_count','N-ACInd','N-HeatInd','N-PropType','N-structuretaxvaluedollarcnt-2','N-structuretaxvaluedollarcnt-3','N-Avg-structuretaxvaluedollarcnt','N-Dev-structuretaxvaluedollarcnt','dist_from_cent','N1','N2','N5','N10','N20','N30','N50','month','year']]
def fair_obj(preds, dtrain):
    """Custom XGBoost objective: gradient and hessian of a fair-type loss.

    The scale constant is recomputed on every call as 0.9 times the mean
    absolute residual of the current predictions.

    Parameters: ``preds`` - current predictions; ``dtrain`` - object whose
    ``get_label()`` returns the matching targets.
    Returns: ``(grad, hess)`` arrays, one entry per prediction.
    """
    residual = preds - dtrain.get_label()
    magnitude = np.abs(residual)
    con = np.average(magnitude)*.9
    shifted = magnitude + con
    return con*residual / shifted, con**2 / shifted**2

In [None]:
# Train one level-1 XGBoost model per (plist[i], clist[i]) pair on every fold.
# Predictions for the held-out rows of each fold go into train2; predictions
# for the requested 2017 months go into test2.
for i in range(len(plist)):
    xgbp = xgbparams.copy()
    xgbp['alpha']=plist[i][0]
    xgbp['colsample_bytree'] = plist[i][1]
    xgbp['max_depth'] = plist[i][2]
    xgbp['subsample'] = plist[i][3]
    chosen = clist[i]
    # reindex(columns=...) replaces the deprecated .ix[:, chosen]. Like .ix it
    # yields an all-NaN column for a name the frame lacks, which is how test
    # gets the 'month'/'year' placeholders that are filled in below.
    train_x = train.reindex(columns=chosen)
    test_x = test.reindex(columns=chosen)
    train_y = extra_y[target]
    train2col = pd.Series(index=train.index, dtype=float)
    for (trai,vali,tesi,predm) in trainvaltestpred:
        X_tra, y_tra = train_x.iloc[trai], train_y.iloc[trai]
        X_val, y_val = train_x.iloc[vali], train_y.iloc[vali]
        X_tes, y_tes = train_x.iloc[tesi], train_y.iloc[tesi]
        d_tra = xgb.DMatrix(X_tra, label=y_tra)
        d_val = xgb.DMatrix(X_val, label=y_val)
        d_tes = xgb.DMatrix(X_tes, label=y_tes)
        model = xgb.train(xgbp,d_tra,
                          num_boost_round=100000,
                          evals=[(d_val, 'eval')],
                          early_stopping_rounds=patience,
                          obj=fair_obj,
                          verbose_eval=1000)
        if X_tes.shape[0]:
            train2col.iloc[tesi]=model.predict(d_tes,ntree_limit=model.best_ntree_limit)
        for month in predm:
            # Score the test parcels as if the sale happened in this month of 2017.
            if 'month' in test_x: test_x['month']=month
            if 'year' in test_x: test_x['year']=2017
            d_test = xgb.DMatrix(test_x)
            test2['2017%d_xgb%d'%(month,i)]=model.predict(d_test,ntree_limit=model.best_ntree_limit)
    train2['xgb%d'%i]=train2col.iloc[ms[13]:]

In [None]:
# Display the level-1 prediction columns collected in train2 so far.
train2.columns

## Level-1 lightgbm

In [None]:
# Settings shared by all level-1 LightGBM models; the training loop overrides
# bagging_fraction and max_bin and adds min_data, num_leaves and sub_feature.
lgbparams = {
    'learning_rate': 0.002,
    'max_bin': 10,
    'bagging_freq': 20,
    'bagging_fraction': .85,
    'objective': 'regression_l1',
    'metric': 'mae',
    'silent': 1,
}
# One row per model:
# [bagging_fraction, log2(max_bin), log2(min_data), log2(num_leaves), sub_feature].
# Rebinds the name plist used by the XGBoost section.
plist = [
    [0.838503, 6.947982, 7.765055, 6.666210, 0.480870],
    [0.916033, 7.015071, 6.215446, 9.446752, 0.173594],
    [0.500000, 12.969876, 8.000000, 8.077548, 0.988285],
]
# Feature columns for each level-1 LightGBM model; clist[i] is used together with plist[i].
# Columns that also appear in catfeat are passed to LightGBM as categorical features.
# Rebinds the name clist used by the XGBoost section.
clist = [['airconditioningtypeid','architecturalstyletypeid','basementsqft','bathroomcnt','bedroomcnt','buildingclasstypeid','buildingqualitytypeid','calculatedbathnbr','decktypeid','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fips','fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','hashottuborspa','heatingorsystemtypeid','latitude','longitude','lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip','roomcnt','threequarterbathnbr','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories','fireplaceflag','structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount','taxdelinquencyflag','taxdelinquencyyear','censustractandblock','N-life','N-LivingAreaError','N-LivingAreaProp','N-ExtraSpace','N-TotalRooms','N-AvRoomSize','N-ExtraRooms','N-ValueProp','N-location','N-location-2','N-location-2round','N-latitude-round','N-longitude-round','N-ValueRatio','N-TaxScore','N-taxdelinquencyyear-2','N-taxdelinquencyyear-3','N-life_tax','N-zip_count','N-city_count','N-county_count','N-ACInd','N-PropType','N-structuretaxvaluedollarcnt-2','N-structuretaxvaluedollarcnt-3','N-Avg-structuretaxvaluedollarcnt','N-Dev-structuretaxvaluedollarcnt','dist_from_cent','N1','N2','N5','N10','N20','N30','N50','month','year'],
         ['airconditioningtypeid','bathroomcnt','bedroomcnt','buildingqualitytypeid','calculatedbathnbr','decktypeid','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fips','fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','hashottuborspa','heatingorsystemtypeid','latitude','longitude','lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip','roomcnt','threequarterbathnbr','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories','fireplaceflag','structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount','taxdelinquencyyear','censustractandblock','N-life','N-LivingAreaError','N-LivingAreaProp','N-ExtraSpace','N-TotalRooms','N-AvRoomSize','N-ExtraRooms','N-ValueProp','N-location','N-location-2','N-location-2round','N-latitude-round','N-longitude-round','N-ValueRatio','N-TaxScore','N-taxdelinquencyyear-3','N-life_tax','N-zip_count','N-city_count','N-county_count','N-PropType','N-structuretaxvaluedollarcnt-2','N-structuretaxvaluedollarcnt-3','N-Avg-structuretaxvaluedollarcnt','N-Dev-structuretaxvaluedollarcnt','dist_from_cent','N1','N2','N5','N10','N20','N30','N50','month','year'],
         ['airconditioningtypeid','basementsqft','bathroomcnt','bedroomcnt','buildingclasstypeid','buildingqualitytypeid','calculatedbathnbr','decktypeid','finishedfloor1squarefeet','calculatedfinishedsquarefeet','finishedsquarefeet12','finishedsquarefeet13','finishedsquarefeet15','finishedsquarefeet50','finishedsquarefeet6','fips','fireplacecnt','fullbathcnt','garagecarcnt','garagetotalsqft','hashottuborspa','heatingorsystemtypeid','latitude','longitude','lotsizesquarefeet','poolcnt','poolsizesum','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode','propertylandusetypeid','propertyzoningdesc','rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip','roomcnt','threequarterbathnbr','unitcnt','yardbuildingsqft17','yardbuildingsqft26','yearbuilt','numberofstories','structuretaxvaluedollarcnt','taxvaluedollarcnt','assessmentyear','landtaxvaluedollarcnt','taxamount','taxdelinquencyflag','taxdelinquencyyear','censustractandblock','N-life','N-LivingAreaError','N-LivingAreaProp','N-ExtraSpace','N-TotalRooms','N-AvRoomSize','N-ExtraRooms','N-ValueProp','N-GarPoolAC','N-location','N-location-2','N-location-2round','N-latitude-round','N-longitude-round','N-ValueRatio','N-TaxScore','N-taxdelinquencyyear-2','N-taxdelinquencyyear-3','N-life_tax','N-zip_count','N-city_count','N-county_count','N-PropType','N-structuretaxvaluedollarcnt-2','N-structuretaxvaluedollarcnt-3','N-Avg-structuretaxvaluedollarcnt','N-Dev-structuretaxvaluedollarcnt','dist_from_cent','N1','N2','N5','N10','N20','N30','N50','month','year']]

In [None]:
# Train one level-1 LightGBM model per (plist[i], clist[i]) pair on every fold.
# Predictions for the held-out rows of each fold go into train2; predictions
# for the requested 2017 months go into test2.
for i in range(len(plist)):
    lgbp = lgbparams.copy()
    lgbp['bagging_fraction']=plist[i][0]
    # These three entries of plist are stored as base-2 exponents.
    lgbp['max_bin'] = int(2**plist[i][1])
    lgbp['min_data'] = int(2**plist[i][2])
    lgbp['num_leaves'] = int(2**plist[i][3])
    lgbp['sub_feature'] = plist[i][4]
    chosen = clist[i]
    chosencat = [c for c in chosen if c in catfeat]
    # reindex(columns=...) replaces the deprecated .ix[:, chosen]. Like .ix it
    # yields an all-NaN column for a name the frame lacks, which is how test
    # gets the 'month'/'year' placeholders that are filled in below.
    train_x = train.reindex(columns=chosen)
    test_x = test.reindex(columns=chosen)
    train_y = extra_y[target]
    train2col = pd.Series(index=train.index, dtype=float)
    for (trai,vali,tesi,predm) in trainvaltestpred:
        X_tra, y_tra = train_x.iloc[trai], train_y.iloc[trai]
        X_val, y_val = train_x.iloc[vali], train_y.iloc[vali]
        # Prediction works on the raw frame, so no Dataset is built for the held-out rows.
        X_tes = train_x.iloc[tesi]
        d_tra = lgb.Dataset(X_tra, label=y_tra, silent=True)
        d_val = lgb.Dataset(X_val, label=y_val, silent=True)
        model = lgb.train(lgbp,d_tra,num_boost_round=100000,early_stopping_rounds=patience,
                          categorical_feature=chosencat,
                          valid_sets=[d_val],valid_names=['val'],
                          verbose_eval=1000)
        if X_tes.shape[0]:
            train2col.iloc[tesi]=model.predict(X_tes,num_iteration=model.best_iteration)
        for month in predm:
            # Score the test parcels as if the sale happened in this month of 2017.
            if 'month' in test_x: test_x['month']=month
            if 'year' in test_x: test_x['year']=2017
            test2['2017%d_lgb%d'%(month,i)]=model.predict(test_x,num_iteration=model.best_iteration)
    train2['lgb%d'%i]=train2col.iloc[ms[13]:]

# Level-2

In [None]:
# Empty container for the level-2 predictions, one row per test row.
test3 = pd.DataFrame(columns=[], index=test.index)

## Level-2 xgboost

In [None]:
# Settings of the level-2 XGBoost model trained on the level-1 predictions.
xgbp = {
    'eta': 0.01,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'silent': 1,
    'alpha': 0.217198,
    'colsample_bytree': 0.734576,
    'max_depth': 2,
    'subsample': 0.678575,
}
def fair_obj(preds, dtrain):
    """Custom XGBoost objective: gradient and hessian of a fair-type loss.

    The scale constant is recomputed on every call as the mean absolute
    residual of the current predictions.

    NOTE: this rebinds the name ``fair_obj`` that the level-1 XGBoost section
    also defines (there the constant carries an extra 0.9 factor), so
    re-running the level-1 cells after this one silently uses this version.

    Parameters: ``preds`` - current predictions; ``dtrain`` - object whose
    ``get_label()`` returns the matching targets.
    Returns: ``(grad, hess)`` arrays, one entry per prediction.
    """
    residual = preds - dtrain.get_label()
    magnitude = np.abs(residual)
    con = np.average(magnitude)
    shifted = magnitude + con
    return con*residual / shifted, con**2 / shifted**2

# Level-2 XGBoost: fit on the level-1 predictions stored in train2.
train_x = train2
train_y = extra_y[target].tail(train_x.shape[0])
d_tr = xgb.DMatrix(train_x, label=train_y)
# Cross-validation with early stopping picks the number of boosting rounds.
cv = xgb.cv(xgbp,d_tr,nfold=20,num_boost_round=100000,early_stopping_rounds=patience,obj=fair_obj,
            verbose_eval=1000, show_stdv=False)
# The final model has to use the objective the round count was tuned for;
# without obj= it would fall back to the 'objective' entry of xgbp.
model = xgb.train(xgbp,d_tr,num_boost_round=cv.shape[0],obj=fair_obj,verbose_eval=1000)

level2_cols = list(train_x.columns)
for month in [10,11,12]:
    # Build the month's feature frame from scratch rather than adding columns
    # to test2, so test2 keeps only the level-1 outputs.
    test_x = pd.DataFrame(index=test2.index)
    for c in level2_cols:
        if c == 'month':
            test_x[c] = month
        elif c == 'year':
            test_x[c] = 2017
        else:
            test_x[c] = test2['2017{}'.format(month)+'_'+c]
    d_test = xgb.DMatrix(test_x[level2_cols])
    test3['2017%d_xgb'%month] = model.predict(d_test)

## Level-2 lightgbm

In [None]:
# Settings of the level-2 LightGBM model trained on the level-1 predictions.
lgbp = {
    'learning_rate': 0.005,
    'objective': 'regression_l1',
    'metric': 'mae',
    'bagging_fraction': 0.976674,
    'sub_feature': 0.709602,
    'max_bin': 495,
    'min_data': 3,
    'num_leaves': 3,
}

# Level-2 LightGBM: fit on the level-1 predictions stored in train2.
train_x = train2
train_y = extra_y[target].tail(train_x.shape[0])
d_tr = lgb.Dataset(train_x, label=train_y, silent=True)
# Cross-validation with early stopping picks the number of boosting rounds.
cv = lgb.cv(lgbp,d_tr,nfold=20,stratified=False,
            num_boost_round=100000,early_stopping_rounds=patience,
            verbose_eval=1000)
cv = pd.DataFrame.from_dict(cv)
model = lgb.train(lgbp,d_tr,num_boost_round=cv.shape[0],verbose_eval=1000)

level2_cols = list(train_x.columns)
for month in [10,11,12]:
    # Build the month's feature frame from scratch, with exactly the training
    # columns in training order. Previously the whole of test2 (every
    # month-prefixed level-1 column plus the columns added here) was passed to
    # predict, which does not match the features the model was trained on.
    test_x = pd.DataFrame(index=test2.index)
    for c in level2_cols:
        if c == 'month':
            test_x[c] = month
        elif c == 'year':
            test_x[c] = 2017
        else:
            test_x[c] = test2['2017{}'.format(month)+'_'+c]
    test3['2017%d_lgb'%month] = model.predict(test_x[level2_cols])

In [None]:
# Display the level-2 prediction columns collected in test3 so far.
test3.columns

# Level-3 Blending

In [None]:
# Weight of the XGBoost predictions in the blend; LightGBM gets the remainder.
xgbweight=0.2
# Take the ids from `test`, the frame the predictions were computed on (test3
# shares its index). `prop` is a different frame, so aligning on its index
# only gives the right ids if both happen to be in the same row order.
test3['ParcelId']=test['ParcelId']
xgbc=[c for c in test3 if 'xgb' in c]
lgbc=[c for c in test3 if 'lgb' in c]
for month in [10,11,12]:
    pref16='2016{}'.format(month)
    pref17='2017{}'.format(month)
    blended = xgbweight*test3[[c for c in xgbc if c.startswith(pref17)]].mean(axis=1) +\
              (1-xgbweight)*test3[[c for c in lgbc if c.startswith(pref17)]].mean(axis=1)
    # The 2016 submission columns reuse the 2017 predictions.
    test3[pref16]=blended
    test3[pref17]=blended
test3.to_csv('out.csv',columns=['ParcelId','201610','201611','201612','201710','201711','201712'],
         index=False,float_format='%.4f')