# Zillow Challenge

##  Data input

In [39]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numpy import seterr,isneginf,array
from datetime import datetime
from pandas import compat
from operator import itemgetter
from sklearn import tree
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.neighbors import KDTree
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from IPython.display import Image
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', 200)
pd.options.display.float_format = '{:20,.2f}'.format
compat.PY3 = False

def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    
    # Etrainport our decision tree to graphviz format
    dot_file = tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot', feature_names=feature_names)
    
    # Call graphviz to make an image file from our decision tree
    os.system("dot -T png images/" + name + ".dot -o images/" + name + ".png")
    
    # Return the .png image so we can see it
    return Image(filename='images/' + name + '.png')

In [40]:
train_f = "train_2016_v2.csv"
train = pd.read_csv(train_f, index_col = 'parcelid', parse_dates=['transactiondate'])
train['transactiondate'] = pd.to_datetime(train['transactiondate']).astype(int)

properties_f = "properties_2016.csv"
properties = pd.read_csv(properties_f, index_col = 'parcelid')


train = train.join(properties) 
train_bak = train  # backup 

In [42]:
#### TEST MODE  #####
train = train_bak
train = train.sample(frac=.01)

In [44]:
_byzipcache = train.groupby(['regionidzip'])['logerror'].mean()

In [45]:
_byzipcache.describe()

count                 317.00
mean                    0.02
std                     0.10
min                    -0.38
25%                    -0.02
50%                     0.01
75%                     0.04
max                     0.69
Name: logerror, dtype: float64

##  Feature Analysis

In [9]:
# Clean features
for col in train:
    dt = train[col].dtype
    if dt == 'float64':
        train[col].fillna(-1, inplace='True')  
    elif dt == 'object':
        train = train.drop(col, axis = 1)
        #train = pd.get_dummies(train,columns=[col], dummy_na=True)

##  Feature selection

In [34]:
Y = train[['logerror']]
X = train.drop('logerror', axis=1)

In [36]:
X.describe()

count              90,268.00
mean                    0.02
std                     0.44
min                     0.00
25%                     0.01
50%                     0.01
75%                     0.01
max                   109.54
Name: taxrate, dtype: float64

In [30]:
Y.describe()

Unnamed: 0,logerror
count,90275.0
mean,0.01
std,0.16
min,-4.61
25%,-0.03
50%,0.01
75%,0.04
max,4.74


## Modeling

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.2)

In [27]:
# Fit regression model
max_depth = None
min_samples_split = int(round(len(X_train)/50, 0))
min_samples_leaf = int(round(len(X_train)/20, 0))
print("min_samples_split =", min_samples_split)
print("min_samples_split =", min_samples_leaf)

model = DecisionTreeRegressor(#max_depth=max_depth, 
                              min_samples_split = min_samples_split,
                              #min_samples_leaf = min_samples_leaf,
                              criterion = "mae")
model.fit(X_train, Y_train)

('min_samples_split =', 361)
('min_samples_split =', 902)


DecisionTreeRegressor(criterion='mae', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=361, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [28]:
Y_predict=model.predict(X_test)

In [29]:
# (<= value) left = false, right = true
zip(X.columns[model.tree_.feature], model.tree_.threshold, model.tree_.children_left, model.tree_.children_right)

[('taxrate', 0.0271768718957901, 1, 618),
 ('calculatedfinishedsquarefeet', 2696.5, 2, 543),
 ('taxvaluedollarcnt', 9635.5, 3, 4),
 ('taxrate', -2.0, -1, -1),
 ('transactiondate', 1.4726447799960863e+18, 5, 436),
 ('taxvaluedollarcnt', 25694.0, 6, 7),
 ('taxrate', -2.0, -1, -1),
 ('finishedsquarefeet12', 1513.5, 8, 271),
 ('taxamount', 1592.2950439453125, 9, 34),
 ('finishedsquarefeet15', 1728.5, 10, 33),
 ('taxrate', 0.020684625953435898, 11, 32),
 ('taxrate', 0.020643617957830429, 12, 31),
 ('regionidzip', 96016.5, 13, 14),
 ('taxrate', -2.0, -1, -1),
 ('taxvaluedollarcnt', 26412.0, 15, 16),
 ('taxrate', -2.0, -1, -1),
 ('taxamount', 1354.375, 17, 30),
 ('calculatedfinishedsquarefeet', 869.5, 18, 19),
 ('taxrate', -2.0, -1, -1),
 ('yearbuilt', 1898.5, 20, 21),
 ('taxrate', -2.0, -1, -1),
 ('longitude', -119283328.0, 22, 23),
 ('taxrate', -2.0, -1, -1),
 ('latitude', 34644976.0, 24, 29),
 ('regionidcity', 396302.5, 25, 28),
 ('regionidcity', 41188.5, 26, 27),
 ('taxrate', -2.0, -1, -1

In [30]:
#Decision_Tree_Image(model, X_train.columns)

## Model evaluation

In [31]:
# Zillow MAE (test)
print("MAE: ",metrics.mean_absolute_error(Y_test['logerror'], Y_predict))

('MAE: ', 0.072661837441152038)


## Model Execution

In [469]:
sample_submission_f = '/Users/Jose/Desktop/Zillow/sample_submission.csv'
submission = pd.read_csv(sample_submission_f, index_col='ParcelId')

In [470]:
#  ATTEMPT 2 - 
logerroravg = train['logerror'].mean()
print("Using logerroravg = ", logerroravg)

('Using logerroravg = ', 0.011457219606756525)


In [546]:
X_all = properties

In [5]:
# Add taxrate
X_all['taxrate'] = np.divide(X_all['taxamount'],X_all['taxvaluedollarcnt'])

In [8]:
# Add roomcnt_offby
X_all['roomcnt_offby'] = np.where(
                                X_all['roomcnt']==0,
                                0,
                                X_all['roomcnt'].add(X_all['bathroomcnt'])
                            )

In [9]:
# Clean features
for col in train:
    dt = X_all[col].dtype
    if dt == 'float64':
        X_all[col].fillna(-1, inplace='True')  
    elif dt == 'object':
        X_all = pd.get_dummies(X_all,columns=[col], dummy_na=True)

In [534]:
Y_all = model.predict(X_all)

In [None]:
results = pd.DataFrame(index=X_all.index)
results.index.names = ['ParcelId']
results['201610'] = Y_all
results['201611'] = Y_all
results['201612'] = Y_all
results['201710'] = Y_all
results['201711'] = Y_all
results['201712'] = Y_all

In [None]:
submission = submission.drop(submission.columns[0:], axis=1)
submission = submission.join(results)

# Use average for properties with missing data
submission.fillna(logerroravg, inplace=True)       
submission.columns

In [None]:
# Round as per rules
submission = submission.round(4)

## Sanity Checks

In [None]:
# Average log error
submission.describe().round(4)

In [None]:
# Check for NaN
submission.isnull().sum()

In [None]:
# Check if any duplicates
submission[submission.index.duplicated(keep=False)]

In [None]:
# Check additional values in submission file
submission[~submission.index.isin(properties.index)]

In [None]:
# Check additional values in properties file
properties[~properties.index.isin(submission.index)]

## Dump File

In [None]:
# Write file
submission_f = '/Users/Jose/Desktop/Zillow/submission.csv'
submission_fh = open(submission_f, 'wb')
submission.to_csv(submission_fh, sep=',', header='true')
submission_fh.close()

## Historical Records

## Resources