# Import Libaries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100



# Load Data

In [2]:
train_path = 'train.csv'
test_path = 'test.csv'
submission_path = 'new_submission.csv'
prop_path = 'final_prop_encoded_onehot.csv'

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submission = pd.read_csv(submission_path)
prop = pd.read_csv(prop_path)

# Constants

In [4]:
parcelid = 'parcelid'
logerror = 'logerror'
transactiondate = 'transactiondate'
subdate = 'subdate'

# Data Fitting

In [5]:
train = train.merge(prop,how='left',on=parcelid)
train = train[train.latitude.notnull()]

In [7]:
x_train = train.drop([parcelid,logerror,transactiondate,subdate],1)
y_train = train.logerror.values.astype(np.float32)
print('x train shape: ',x_train.shape)
print('y train shape: ',y_train.shape)

x train shape:  (113009, 70)
y train shape:  (113009,)


In [8]:
X_train,X_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.33,random_state=42)
y_mean = np.mean(y_train)
print('X_train: ',X_train.shape)
print('X_test: ',X_test.shape)
print('y_train: ',y_train.shape)
print('y_test: ',y_test.shape)
print('y_mean: ',y_mean)

X_train:  (75716, 70)
X_test:  (37293, 70)
y_train:  (75716,)
y_test:  (37293,)
y_mean:  0.0133953


# Training Model

In [9]:
n_estimators = [80, 100, 500]
max_depth = [5, 9, 15]

In [None]:
for est in n_estimators:
    print('n_estimator:', est)
    for dep in max_depth:
        print('max_depth:', dep)
        regressor = RandomForestRegressor(max_depth=dep, n_estimators=est)
        regressor.fit(X_train,y_train)
        y_pred = regressor.predict(X_test)
        accuracies = cross_val_score(estimator=regressor,X=X_train,y=y_train,cv=10,n_jobs=-1)
        print('Accuracies:', accuracies)
        print('MSE:', mean_squared_error(y_test, y_pred))

# Predicting Results

Select the max_depth and n_estimators with the result that provides the minimum 

In [11]:
regressor = RandomForestRegressor(max_depth=15, n_estimators=80)
regressor.fit(X_train,y_train)
result = regressor.predict(prop.iloc[:,1:])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [13]:
prop['results'] = result
output = prop[['parcelid','results']]
submission = submission.iloc[:,1:]
submission = submission.merge(output, how='left', on='parcelid')

for c in submission.columns.values:
    if c != 'parcelid':
        submission[c] = submission['results']
        
submission.drop('results',1,inplace=True)
submission.fillna(0,inplace=True)

# Calculating MAE Score

In [19]:
from math import fabs as absolute

sub = submission
test_size = len(test)

total_ae = 0.0
print("Processing ", test_size, "rows")
for index,row in test.iterrows():
    r = sub.loc[sub['parcelid'] == row[0]]
    
    pred_error = r[str(row[3])]
    total_ae += absolute(pred_error - row[1])
    if index % 10000 == 0 and index !=0:
        print("Completed: ", index, "rows")
        
print("===Process Completed===")
print("\n Final score:", total_ae/test_size)

Processing  54849 rows
Completed:  10000 rows
Completed:  20000 rows
Completed:  30000 rows
Completed:  40000 rows
Completed:  50000 rows
===Process Completed===

 Final score: 0.07027695395379888
