# Import Libraries

In [20]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

# Load Data

In [21]:
# Per-parcel property table; joined onto the training rows below and scored
# in full later in the notebook.
prop = pd.read_csv('final_prop_encoded_onehot.csv')  # replace with cleaned prop

# `test` is only used by the MAE scoring cell; `sample` supplies the layout of
# the submission frame.
test = pd.read_csv('test.csv')
sample = pd.read_csv('new_submission.csv')

# Left join keeps every training row, attaching that parcel's property
# columns (NaN where the parcel has no entry in `prop`).
train = pd.merge(pd.read_csv('train.csv'), prop, how='left', on='parcelid')

# Data Fitting

In [22]:
# Feature matrix: everything in the merged frame except the identifier, the
# target, and the two date columns.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated and later removed from pandas.
x_train = train.drop(['parcelid', 'logerror', 'transactiondate', 'subdate'], axis=1)

# Regression target as float32.
y_train = train.logerror.values.astype(np.float32)

print('x train shape: ', x_train.shape)
print('y train shape: ', y_train.shape)

x train shape:  (113039, 70)
y train shape:  (113039,)


In [23]:
# Hold-out fraction and RNG seed passed to train_test_split.
test_size, random_state = 0.33, 42

In [24]:
# Take the targets straight from `train` rather than from `y_train`: this cell
# rebinds `y_train` to the training portion only, so reading it back on a
# second run would pair the full feature matrix with a shorter target vector
# and fail. Sourcing from `train` keeps the cell re-runnable.
y_all = train.logerror.values.astype(np.float32)

X_train, X_test, y_train, y_test = train_test_split(
    x_train,
    y_all,
    test_size=test_size,
    random_state=random_state,
)

# Mean training target, used as the boosters' `base_score` further down.
y_mean = np.mean(y_train)

print('X_train: ', X_train.shape)
print('X_test: ', X_test.shape)
print('y_train: ', y_train.shape)
print('y_test: ', y_test.shape)

X_train:  (75736, 70)
X_test:  (37303, 70)
y_train:  (75736,)
y_test:  (37303,)


# Training Model

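Note: a small grid search over `eta` and `max_depth`; each combination is trained on the training split and its mean squared error on the held-out split is printed, to find a working configuration.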

In [25]:
# Candidate values for the grid search: learning rates and tree depths.
eta, max_depth = [0.01, 0.1, 0.3], [3, 6, 10]

In [28]:
# The DMatrix objects and the round count do not depend on the hyperparameters
# being searched, so they are built once instead of once per grid point.
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)
num_boost_rounds = 250

# Train one booster per (eta, max_depth) pair and print its error on the
# held-out split.
for e in eta:
    for md in max_depth:
        print('eta:',e,'max_depth:',md)
        xgb_params = {
            'eta': e,
            'max_depth': int(md),
            'objective': 'reg:linear',
            'eval_metric': 'mae',
            'base_score': y_mean
        }

        model = xgb.train(
            dict(xgb_params, silent=1),
            dtrain,
            num_boost_round=num_boost_rounds
        )
        y_pred = model.predict(dtest)

        # NOTE: the printed score is mean *squared* error even though
        # `eval_metric` is 'mae'; no `evals` are passed to xgb.train.
        # Arguments follow scikit-learn's (y_true, y_pred) order; MSE is
        # symmetric, so the printed value is unchanged.
        print(mean_squared_error(y_test, y_pred))

eta: 0.01 max_depth: 3
0.0254392
eta: 0.01 max_depth: 6
0.0254883
eta: 0.01 max_depth: 10
0.0255633
eta: 0.1 max_depth: 3
0.0255063
eta: 0.1 max_depth: 6
0.0258815
eta: 0.1 max_depth: 10
0.0266529
eta: 0.3 max_depth: 3
0.0255839
eta: 0.3 max_depth: 6
0.02748
eta: 0.3 max_depth: 10
0.0290354


# Predicting Results

In [29]:
# Final configuration: the lowest-error grid point above (eta=0.01,
# max_depth=3) plus row subsampling and L2 (`lambda`) / L1 (`alpha`)
# regularisation.
xgb_params = {
    'eta': 0.01,
    'max_depth': int(3),
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,
    'alpha': 0.4,
    'base_score': y_mean,
    'silent': 1
}

# NOTE(review): the final model is fitted on the training split only, not on
# the full labelled data.
dtrain = xgb.DMatrix(X_train, y_train)

# Score every parcel in `prop`, selecting exactly the training feature columns
# in training order. Choosing the columns explicitly (rather than "everything
# except parcelid") keeps this cell re-runnable even if extra columns have
# been added to `prop` since it was loaded.
dtest = xgb.DMatrix(prop[X_train.columns])

num_boost_rounds = 250

# `xgb_params` already carries silent=1, so it is passed as-is.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=num_boost_rounds
)

# One prediction per row of `prop`, in `prop` row order.
y_pred = model.predict(dtest)

In [30]:
# Pair each parcel id with its prediction on a separate frame, so `prop` is
# not mutated and remains a pure feature table. `y_pred` is in `prop` row
# order.
predictions = prop[['parcelid']].assign(logerror=y_pred)

# Lay the predictions out in the submission template's row order; parcels
# with no prediction get NaN here and are zero-filled below.
# `axis` is passed by keyword: the positional form was removed from pandas.
output = sample.drop('Unnamed: 0', axis=1).merge(
    predictions, how='left', on='parcelid'
)

# Every column of the template other than the id receives the same per-parcel
# prediction.
for c in output.columns[~output.columns.isin(['parcelid', 'logerror'])]:
    output[c] = output['logerror']

# Drop the helper column by name (the merge appended it last) and fill missing
# predictions with 0. This yields a new frame instead of filling a slice
# in place.
output = output.drop('logerror', axis=1).fillna(0)

# Calculating MAE Score

In [31]:
# Mean absolute error of the submission frame against the labelled rows in
# `test`.
#
# For each row of `test`, the prediction is read from the `sub` row with the
# same parcel id, in the column named by the string form of the row's fourth
# field. The lookup is vectorised instead of scanning all of `sub` once per
# test row.
sub = output

# Use a dedicated name for the row count: `test_size` holds the split
# fraction used by the train/validation split cell and must not be
# overwritten.
n_rows = len(test)
print("Processing ", n_rows, "rows")

# Positional columns of `test`: 0 = parcel id, 1 = actual logerror,
# 3 = key naming the submission column to compare against.
parcel_ids = test.iloc[:, 0].values
actual = test.iloc[:, 1].values
column_keys = test.iloc[:, 3].astype(str)

preds_by_parcel = sub.set_index('parcelid')

# Resolve each row's key to a column position; -1 marks a key with no column.
col_pos = preds_by_parcel.columns.get_indexer(column_keys)
if (col_pos < 0).any():
    unknown = sorted(set(column_keys[col_pos < 0]))
    raise KeyError("No submission column for: {}".format(unknown))

# Align the submission rows to the test rows (raises if `sub` holds duplicate
# parcel ids), then pick one prediction per row from its own column.
row_preds = preds_by_parcel.reindex(parcel_ids).values
pred_error = row_preds[np.arange(n_rows), col_pos]
if pd.isnull(pred_error).any():
    raise ValueError("Some parcels in `test` have no row in the submission frame")

# Summed in float64 in one pass, so the last digits may differ slightly from
# a row-by-row accumulation.
total_ae = float(np.abs(pred_error - actual).sum())

print("===Process Completed===")
print("\n Final score:", total_ae/n_rows)

Processing  54849 rows
Completed:  10000 rows
Completed:  20000 rows
Completed:  30000 rows
Completed:  40000 rows
Completed:  50000 rows
===Process Completed===

 Final score: 0.0693158769551591
