# Zillow Challenge

##  Data input

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numpy import seterr,isneginf,array
from datetime import datetime
from pandas import compat
from operator import itemgetter
from sklearn import tree
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.neighbors import KDTree
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from IPython.display import Image

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', 200)
pd.options.display.float_format = '{:20,.2f}'.format
compat.PY3 = False

def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    
    # Etrainport our decision tree to graphviz format
    dot_file = tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot', feature_names=feature_names)
    
    # Call graphviz to make an image file from our decision tree
    os.system("dot -T png images/" + name + ".dot -o images/" + name + ".png")
    
    # Return the .png image so we can see it
    return Image(filename='images/' + name + '.png')

In [2]:
train_f = "/Users/Jose/Desktop/Zillow/train_2016_v2.csv"
train = pd.read_csv(train_f, index_col = 'parcelid', parse_dates=['transactiondate'])
train['transactiondate'] = pd.to_datetime(train['transactiondate']).astype(int)

properties_f = "/Users/Jose/Desktop/Zillow/properties_2016.csv"
properties = pd.read_csv(properties_f, index_col = 'parcelid')


train = train.join(properties) 
train_bak = train  # backup 

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#### TEST MODE  #####
train = train_bak
train = train.sample(frac=.2)

##  Feature Analysis

In [167]:
features_complete = train.columns[list(~train.isnull().any().values)]
features_complete

Index([u'logerror', u'transactiondate', u'bathroomcnt', u'bedroomcnt', u'fips',
       u'latitude', u'longitude', u'propertycountylandusecode',
       u'propertylandusetypeid', u'rawcensustractandblock', u'regionidcounty',
       u'roomcnt', u'assessmentyear'],
      dtype='object')

In [168]:
train.dtypes

logerror                        float64
transactiondate                   int64
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
fireplacecnt                    float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64
hashottuborspa                   object
heatingorsystemtypeid           float64


##  Feature creation

In [45]:
features_created = set()

In [46]:
train.isnull().sum()

logerror                            0
transactiondate                     0
airconditioningtypeid           12416
architecturalstyletypeid        18004
basementsqft                    18049
bathroomcnt                         0
bedroomcnt                          0
buildingclasstypeid             18054
buildingqualitytypeid            6705
calculatedbathnbr                 231
decktypeid                      17901
finishedfloor1squarefeet        16686
calculatedfinishedsquarefeet      128
finishedsquarefeet12              939
finishedsquarefeet13            18050
finishedsquarefeet15            17340
finishedsquarefeet50            16686
finishedsquarefeet6             17964
fips                                0
fireplacecnt                    16097
fullbathcnt                       231
garagecarcnt                    11954
garagetotalsqft                 11954
hashottuborspa                  17562
heatingorsystemtypeid            6928
latitude                            0
longitude   

In [171]:
#################
#  regionidzip_avg
#################

def add_logerror_byregionidzip(df, cache):
    df = df.join(cache,on='regionidzip', rsuffix='_byregionidzip')

# create during execution
features_created.add('regionidzip_avg')
features_created.add('regionidzip')

In [172]:
#################
#  finishedsquarefeet[n]_t
#################

def add_finishedsquarefeet(df, feature):
    n = 100
    df[feature+'_t'] = np.floor(np.divide(df[feature], n))
    df[feature] = df[feature].fillna(-1)
    df[feature+'_t'] = df[feature+'_t'].fillna(-1)
    
def add_finishedsquarefeet12_t(df):
    add_finishedsquarefeet(df, 'finishedsquarefeet12')
def add_finishedsquarefeet13_t(df):
    add_finishedsquarefeet(df, 'finishedsquarefeet13')
def add_finishedsquarefeet15_t(df):
    add_finishedsquarefeet(df, 'finishedsquarefeet15')    
def add_finishedsquarefeet50_t(df):
    add_finishedsquarefeet(df, 'finishedsquarefeet50')
def add_finishedsquarefeet6_t(df):
    add_finishedsquarefeet(df, 'finishedsquarefeet6')

features_created.add('finishedsquarefeet12_t')
features_created.add('finishedsquarefeet13_t')
features_created.add('finishedsquarefeet15_t')
features_created.add('finishedsquarefeet50_t')
features_created.add('finishedsquarefeet6_t')

#Needed for future calculations, so keeping for now
features_created.add('finishedsquarefeet12')
features_created.add('finishedsquarefeet13')
features_created.add('finishedsquarefeet15')
features_created.add('finishedsquarefeet50')
features_created.add('finishedsquarefeet6')

In [4]:
train['taxrate'] = np.divide(train['taxamount'],train['taxvaluedollarcnt'])

In [6]:
train['taxrate'].describe()

count              18,054.00
mean                    0.02
std                     0.82
min                     0.00
25%                     0.01
50%                     0.01
75%                     0.01
max                   109.54
Name: taxrate, dtype: float64

In [44]:
train.query('logerror<-2 & logerror > -2.5')
#train.query('taxrate==.01')

Unnamed: 0_level_0,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,taxrate
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
14621371,-2.27,1460505600000000000,,,,2.0,2.0,,,2.0,,,1335.0,1335.0,,,,,6059.0,,2.0,,,,24.0,33615100.0,-117719000.0,,,,,,,34,266.0,,60590626.22,46098.0,1286.0,,96963.0,5.0,,,,,,,1969.0,1.0,,123518.0,234595.0,2015.0,111077.0,2468.84,,,60590626222004.0,0.01
14426239,-2.32,1465948800000000000,,,,2.5,4.0,,,2.5,,,2739.0,2739.0,,,,,6059.0,,2.0,2.0,632.0,,,33479443.0,-117683221.0,6002.0,,,,,,122,261.0,,60590423.11,17686.0,1286.0,,96961.0,8.0,,1.0,,,,,1978.0,2.0,,237532.0,830000.0,2015.0,592468.0,9399.26,,,60590423106004.0,0.01
14470669,-2.31,1470873600000000000,,,,2.5,4.0,,,2.5,,,2373.0,2373.0,,,,,6059.0,,2.0,2.0,438.0,True,,33576718.0,-117654172.0,5454.0,1.0,,,1.0,,122,261.0,,60590320.4,12773.0,1286.0,,96996.0,0.0,,1.0,,,,,1986.0,,,229283.0,653189.0,2015.0,423906.0,7755.82,,,60590320402000.0,0.01
13989753,-2.35,1464825600000000000,,,,2.0,4.0,,,2.0,,,1053.0,1053.0,,,,,6059.0,,2.0,1.0,396.0,,,33823340.0,-117960906.0,7260.0,,,,,,122,261.0,,60590877.01,16764.0,1286.0,275496.0,97023.0,7.0,,,,,,,1955.0,1.0,,27036.0,52055.0,2015.0,25019.0,983.62,,,60590877012002.0,0.02
14334592,-2.29,1465776000000000000,,,,2.0,3.0,,,2.0,,,2029.0,2029.0,,,,,6059.0,,2.0,2.0,400.0,,,33761934.0,-117793829.0,12257.0,1.0,,,,1.0,122,261.0,,60590756.06,40081.0,1286.0,,97005.0,7.0,,,,,,,1960.0,1.0,,125328.0,623987.0,2015.0,498659.0,7179.28,,,60590756063001.0,0.01
11839942,-2.32,1469145600000000000,1.0,,,3.0,2.0,,4.0,3.0,,,1409.0,1409.0,,,,,6037.0,,3.0,,,,2.0,34063100.0,-118155000.0,985.0,,,,,,010D,269.0,ALRPD*,60374819.02,50677.0,3101.0,,96533.0,0.0,,,,1.0,,,1980.0,,,173471.0,375015.0,2015.0,201544.0,4619.23,,,60374819021029.0,0.01
14648465,-2.19,1460419200000000000,,,,2.0,2.0,,,2.0,,,1188.0,1188.0,,,,,6059.0,,2.0,1.0,0.0,,24.0,33608700.0,-117746000.0,,,,,,,34,266.0,,60590626.49,46098.0,1286.0,,96963.0,5.0,,,,,,,1973.0,1.0,,89040.0,174691.0,2015.0,85651.0,3309.96,,,60590626491017.0,0.02
14621147,-2.3,1456704000000000000,,,,2.0,2.0,,,2.0,,,1057.0,1057.0,,,,,6059.0,,2.0,,,,24.0,33619000.0,-117719000.0,,,,,,,34,266.0,,60590626.22,46098.0,1286.0,,96963.0,4.0,,,,,,,1969.0,1.0,,74810.0,112706.0,2015.0,37896.0,1193.78,,,60590626221003.0,0.01
11320030,-2.33,1453161600000000000,1.0,,,4.0,4.0,,4.0,4.0,,,2704.0,2704.0,,,,,6037.0,,4.0,,,,2.0,34649638.0,-118257354.0,12044.0,,,,,,0100,261.0,LCC1-RA100,60379012.05,5534.0,3101.0,,97319.0,0.0,,,,1.0,,,1990.0,,,221300.0,281000.0,2015.0,59700.0,4701.65,,,60379012054119.0,0.02
14012870,-2.27,1463443200000000000,,,,1.5,4.0,,,1.5,,,1747.0,1747.0,,,,,6059.0,,1.0,2.0,427.0,,,33811338.0,-118025508.0,6360.0,,,,,,122,261.0,,60591102.03,10608.0,1286.0,,96180.0,6.0,,1.0,,,,,1959.0,1.0,,90112.0,453051.0,2015.0,362939.0,5321.66,,,60591102032008.0,0.01


In [47]:
# Univariate - float

feature = 'roomcnt'
plt.xlabel(feature)
plt.ylabel('logerror')
plt.scatter(train[feature], 
            train['logerror'],
            c=train['logerror'],
            s=20, lw=0,vmin=-4, vmax=4,
           cmap='seismic')
plt.axis([ #0,.06,
        train[feature].min(), 
          train[feature].max(),
          train['logerror'].min(), 
            train['logerror'].max()
         ])

plt.show()

In [35]:
# Bivariate
feature1 = 'propertylandusetypeid'
feature2 = 'taxamount'


plt.xlabel(feature1)
plt.ylabel(feature2)
plt.scatter(train[feature1], 
            train[feature2],
            c=train['logerror'],
            s=20, lw=0,
           cmap='seismic',vmin=-4, vmax=4)
plt.axis([
        #train[feature1].min(), 
        240,
        train[feature1].max(),
        #      95000,99000, 0, .1,
             train[feature2].min(),  train[feature2].max(),
             #0,.1
         ])
plt.show()

Conclusions:
   * Average logerror = 0.011457219606756575
   * In general, data usually underestimates, with the etrainception of fewer high overestimates.
   * totalroomcnt:  ==0 is a mitrained bag.  != zero is overestimating

##  Feature selection

In [173]:
features_complete

Index([u'logerror', u'transactiondate', u'bathroomcnt', u'bedroomcnt', u'fips',
       u'latitude', u'longitude', u'propertycountylandusecode',
       u'propertylandusetypeid', u'rawcensustractandblock', u'regionidcounty',
       u'roomcnt', u'assessmentyear'],
      dtype='object')

In [174]:
features_to_keep = features_complete 
features_to_keep = features_to_keep.union(features_created)

In [175]:
features_to_keep

Index([u'assessmentyear', u'bathroomcnt', u'bedroomcnt',
       u'finishedsquarefeet12', u'finishedsquarefeet12_t',
       u'finishedsquarefeet13', u'finishedsquarefeet13_t',
       u'finishedsquarefeet15', u'finishedsquarefeet15_t',
       u'finishedsquarefeet50', u'finishedsquarefeet50_t',
       u'finishedsquarefeet6', u'finishedsquarefeet6_t', u'fips', u'latitude',
       u'logerror', u'longitude', u'propertycountylandusecode',
       u'propertylandusetypeid', u'rawcensustractandblock', u'regionidcounty',
       u'regionidzip', u'regionidzip_avg', u'roomcnt', u'transactiondate'],
      dtype='object')

In [157]:
Y = train[['logerror']]
X = train
for column in X.columns:
    if column not in features_to_keep:
        X = X.drop(column, axis=1)
        
#X = pd.DataFrame(scale(train, axis=0, with_mean=True, with_std=True, copy=True), columns = X.columns.values)

In [158]:
# TODO: qualifiers
X = X.drop('propertylandusetypeid', axis =1)
#X = X.drop('rawcensustractandblock', axis =1)
#X = X.drop('regionidcounty', axis =1)

In [159]:
X.describe()

Unnamed: 0,logerror,transactiondate,bathroomcnt,bedroomcnt,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,latitude,longitude,regionidzip,roomcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt
count,45138.0,45138.0,45138.0,45138.0,42769.0,18.0,1808.0,3431.0,199.0,45138.0,45138.0,45138.0,45124.0,45138.0,45138.0,45138.0,45138.0
mean,0.01,1.4656325302140052e+18,2.28,3.04,1748.58,1407.0,2389.88,1351.42,2307.68,6048.82,34006793.52,-118199189.42,96596.67,1.48,459704.2,2015.0,279624.96
std,0.16,7434938733608051.0,1.0,1.16,906.44,90.09,1099.09,658.94,1289.41,20.66,265460.8,359915.3,4056.91,2.82,569459.15,0.0,413006.66
min,-2.4,1.4516064e+18,0.0,0.0,2.0,1248.0,609.0,49.0,438.0,6037.0,33339295.0,-119447865.0,95982.0,0.0,7704.0,2015.0,1319.0
25%,-0.03,1.459728e+18,2.0,2.0,1178.0,1356.0,1673.5,938.0,1268.0,6037.0,33813046.75,-118411000.0,96193.0,0.0,198855.25,2015.0,81738.0
50%,0.01,1.4658624e+18,2.0,3.0,1524.0,1440.0,2108.5,1250.0,2052.0,6037.0,34023673.0,-118173156.0,96393.0,0.0,342872.0,2015.0,192913.0
75%,0.04,1.4714784e+18,3.0,4.0,2058.0,1440.0,2856.0,1624.0,3270.0,6059.0,34173100.0,-117922166.25,96987.0,0.0,541080.75,2015.0,345994.25
max,4.74,1.483056e+18,12.0,16.0,15973.0,1566.0,22741.0,8352.0,7224.0,6111.0,34816009.0,-117554924.0,399675.0,15.0,27750000.0,2015.0,24500000.0


In [160]:
Y.describe()

Unnamed: 0,logerror
count,45138.0
mean,0.01
std,0.16
min,-2.4
25%,-0.03
50%,0.01
75%,0.04
max,4.74


## Data cleaning

## Modeling

In [161]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.1)

In [162]:
# Cache: regionidzip -> mean(logerror)
_byzipcache = X_train.groupby(['regionidzip'])['logerror'].mean()

In [163]:
# Create new vars needed after train set is defined
add_logerror_byregionidzip(X_train, _byzipcache )
X_train = X_train.drop('logerror', axis = 1)
X_train = X_train.drop('regionidzip', axis = 1)
add_distanceavg(train)
add_finishedsquarefeet12_t(X_train)
X_train = X_train.drop('finishedsquarefeet12', axis = 1)
add_finishedsquarefeet13_t(X_train)
X_train = X_train.drop('finishedsquarefeet13', axis = 1)
add_finishedsquarefeet15_t(X_train)
X_train = X_train.drop('finishedsquarefeet15', axis = 1)
add_finishedsquarefeet50_t(X_train)
X_train = X_train.drop('finishedsquarefeet50', axis = 1)
add_finishedsquarefeet6_t(X_train)
X_train = X_train.drop('finishedsquarefeet6', axis = 1)

KeyboardInterrupt: 

In [142]:
add_logerror_byregionidzip(X_test, _byzipcache )
X_test = X_test.drop('logerror', axis = 1)
X_test = X_test.drop('regionidzip', axis = 1)
add_distanceavg(X_test)
add_finishedsquarefeet12_t(X_test)
X_test = X_test.drop('finishedsquarefeet12', axis = 1)
add_finishedsquarefeet13_t(X_test)
X_test = X_test.drop('finishedsquarefeet13', axis = 1)
add_finishedsquarefeet15_t(X_test)
X_test = X_test.drop('finishedsquarefeet15', axis = 1)
add_finishedsquarefeet50_t(X_test)
X_test = X_test.drop('finishedsquarefeet50', axis = 1)
add_finishedsquarefeet6_t(X_test)
X_test = X_test.drop('finishedsquarefeet6', axis = 1)

In [143]:
# Fit regression model
max_depth = None
min_samples_split = int(round(len(X_train)/50, 0))
min_samples_leaf = int(round(len(X_train)/10, 0))
print("min_samples_split =", min_samples_split)
print("min_samples_split =", min_samples_leaf)

model = DecisionTreeRegressor(max_depth=max_depth, 
                              min_samples_split = min_samples_split,
                              min_samples_leaf = min_samples_leaf,
                              criterion = "mae")
model.fit(X_train, Y_train)

('min_samples_split =', 90)
('min_samples_split =', 451)


DecisionTreeRegressor(criterion='mae', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=451,
           min_samples_split=90, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [144]:
Y_predict=model.predict(X_test)

In [145]:
# (<= value) left = false, right = true
zip(X.columns[model.tree_.feature], model.tree_.threshold, model.tree_.children_left, model.tree_.children_right)

[('finishedsquarefeet6', 26.5, 1, 14),
 ('logerror', 1.4726447799960863e+18, 2, 13),
 ('finishedsquarefeet12', 34234048.0, 3, 12),
 ('logerror', 1.456401557279998e+18, 4, 5),
 ('roomcnt', -2.0, -1, -1),
 ('logerror', 1.4623631093257994e+18, 6, 7),
 ('roomcnt', -2.0, -1, -1),
 ('logerror', 1.4697072972435292e+18, 8, 11),
 ('finishedsquarefeet12', 33979432.0, 9, 10),
 ('roomcnt', -2.0, -1, -1),
 ('roomcnt', -2.0, -1, -1),
 ('roomcnt', -2.0, -1, -1),
 ('roomcnt', -2.0, -1, -1),
 ('roomcnt', -2.0, -1, -1),
 ('roomcnt', -2.0, -1, -1)]

## Model evaluation

In [146]:
# Zillow MAE (test)
print("MAE: ",metrics.mean_absolute_error(Y_test['logerror'], Y_predict))

('MAE: ', 0.068615931076923084)


## Model Execution

In [469]:
sample_submission_f = '/Users/Jose/Desktop/Zillow/sample_submission.csv'
submission = pd.read_csv(sample_submission_f, index_col='ParcelId')

In [470]:
#  ATTEMPT 2 - 
logerroravg = train['logerror'].mean()
print("Using logerroravg = ", logerroravg)

('Using logerroravg = ', 0.011457219606756525)


In [546]:
X_all = properties

In [548]:
# Add features

def add_transactiondate(df):
    df['transactiondate'] = pd.datetime(2016,10,1)
    df['transactiondate'] = df['transactiondate'].astype(int)

print("Calculating finishedsquarefeet12_t")
add_finishedsquarefeet12_t(X_all)
print("Calculating finishedsquarefeet13_t")
add_finishedsquarefeet13_t(X_all)
print("Calculating finishedsquarefeet15_t")
add_finishedsquarefeet15_t(X_all)
print("Calculating finishedsquarefeet50_t")
add_finishedsquarefeet50_t(X_all)
print("Calculating finishedsquarefeet6_t")
add_finishedsquarefeet6_t(X_all)
print("Calculating regionidzip_avg")
add_regionidzip_avg(X_all)
print("Calculating transactiondate")
add_transactiondate(X_all)

KeyboardInterrupt: 

In [None]:
# Expensive calc
print("Calculating distanceavg")
zillow.add_distanceavg(X_all)

In [None]:
for column in X_all.columns:
    if column not in features_to_keep:
        X_all = X_all.drop(column, axis=1)

In [547]:
#  Update data
print("Missing: ",X_test.columns.difference( X_all.columns))

('Missing: ', Index([u'distanceavg', u'finishedsquarefeet12_t', u'finishedsquarefeet13_t',
       u'finishedsquarefeet15_t', u'finishedsquarefeet50_t',
       u'finishedsquarefeet6_t', u'regionidzip_avg', u'transactiondate'],
      dtype='object'))


  from ipykernel import kernelapp as app


In [534]:
Y_all = model.predict(X_all)

In [None]:
results = pd.DataFrame(index=X_all.index)
results.index.names = ['ParcelId']
results['201610'] = Y_all
results['201611'] = Y_all
results['201612'] = Y_all
results['201710'] = Y_all
results['201711'] = Y_all
results['201712'] = Y_all

In [None]:
submission = submission.drop(submission.columns[0:], axis=1)
submission = submission.join(results)

# Use average for properties with missing data
submission.fillna(logerroravg, inplace=True)       
submission.columns

In [None]:
# Round as per rules
submission = submission.round(4)

## Sanity Checks

In [None]:
# Average log error
submission.describe().round(4)

In [None]:
# Check for NaN
submission.isnull().sum()

In [None]:
# Check if any duplicates
submission[submission.index.duplicated(keep=False)]

In [None]:
# Check additional values in submission file
submission[~submission.index.isin(properties.index)]

In [None]:
# Check additional values in properties file
properties[~properties.index.isin(submission.index)]

## Dump File

In [None]:
# Write file
submission_f = '/Users/Jose/Desktop/Zillow/submission.csv'
submission_fh = open(submission_f, 'wb')
submission.to_csv(submission_fh, sep=',', header='true')
submission_fh.close()

## Historical Records

## Resources

In [None]:
# bedroomcnt
#plt.trainlabel('bedroomcnt')
#plt.ylabel('logerror')
#plt.scatter(train['bathroomcnt'], train['logerror'], 
#           alpha=1, s=2, color='r')
#plt.atrainis([0, 10,-5, 5])
#plt.show()

#date
#plt.trainlabel('transactiondate')
#plt.ylabel('logerror')
#plt.scatter(train['bedroomcnt'], train['logerror'], alpha=1, s=2, color='r')
#plt.atrainis(['2016-01-01', '2017-01-01',-5, 5])
#plt.gcf().autofmt_traindate()
#plt.show()

#import matplotlib as mpl
#plt.trainlabel('longitude')
#plt.ylabel('latitude')
#plt.scatter(train['longitude'], train['latitude'], c=train['logerror'], s=20, lw=0, cmap='seismic')
#plt.atrainis([min(train['longitude']), matrain(train['longitude']),min(train['latitude']), matrain(train['latitude'])])
#plt.colorbar()
#plt.show()

#train2 = train.query('logerror < -2.')

#train2 = train.query('roomcnt == 0 ')

#train2 = train.query('bedroomcnt > 2')

#train2 = train.sample(frac=.10)

#plt.trainlabel('longitude')
#plt.ylabel('latitude')
#plt.scatter(train2['longitude'], train2['latitude'], c=train2['logerror'], s=20, lw=0, cmap='seismic')
#plt.colorbar()
#plt.show()