# Zillow Challenge

##  Data input

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numpy import seterr,isneginf,array
from datetime import datetime
from pandas import compat
from operator import itemgetter
from sklearn import tree
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.neighbors import KDTree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from IPython.display import Image

#import seaborn as sns

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', 200)
pd.options.display.float_format = '{:20,.2f}'.format
compat.PY3 = False

def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    
    # Etrainport our decision tree to graphviz format
    dot_file = tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot', feature_names=feature_names)
    
    # Call graphviz to make an image file from our decision tree
    os.system("dot -T png images/" + name + ".dot -o images/" + name + ".png")
    
    # Return the .png image so we can see it
    return Image(filename='images/' + name + '.png')

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    #if ylim is not None:
    #    plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, scoring='neg_mean_absolute_error', train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [2]:
train_f = "train_2016_v2.csv"
train = pd.read_csv(train_f, index_col = 'parcelid', parse_dates=['transactiondate'])
train['transactiondate'] = pd.to_datetime(train['transactiondate']).astype(int)

properties_f = "properties_2016.csv"
properties = pd.read_csv(properties_f, index_col = 'parcelid')

train_bak = train.join(properties) 

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#### TEST MODE  #####
def getdata(pcnt): 
    return train_bak.sample(frac=pcnt)

In [4]:
train = getdata(1)

##  Feature creation

In [5]:
features_base = set()
features_created_custom = set()
features_created_null = set()

In [7]:
#################
#  taxrate
#################
def add_taxrate(dfs):
    features_created_custom.add('taxrate')
    dfs[0]['taxrate'] = np.divide(dfs[0]['taxamount'],dfs[0]['taxvaluedollarcnt'])

In [8]:
#################
# add_transactioncnt
#################

def add_transactioncnt(dfs, features, monthlag):
    name = 'transactioncnt'+str(monthlag)
    features_created_custom.add(name)
    n1 = 200  # calculatedfinishedsquarefeet
    n2 = 50000 # taxvaluedollarcnt
    
    dfs[0].drop([name], errors='ignore', axis=1, inplace = True)

    (dfs[0])['transactionmonth']    = pd.to_datetime((dfs[0])['transactiondate']).dt.month
    (dfs[0])['transactionmonthlag'] = np.subtract((dfs[0])['transactionmonth'],monthlag)    
    
    (dfs[0])['calculatedfinishedsquarefeet_t'] = np.ceil(np.divide((dfs[0])['calculatedfinishedsquarefeet'], n1))
    (dfs[0])['taxvaluedollarcnt_t'] = np.ceil(np.divide((dfs[0])['taxvaluedollarcnt'], n2))
    
    sumdf = pd.DataFrame({name : dfs[0].groupby(['transactionmonth']+features).size()}).reset_index()
    sumdf.dropna(axis=1, how='any', inplace=True)
    sumdf.rename(columns={'transactionmonth': 'transactionmonthlag'}, inplace=True)
    sumdf.set_index(['transactionmonthlag']+features, inplace=True)
    
    dfs[:] = [(dfs[0]).join(sumdf, on=['transactionmonthlag']+features)]
    (dfs[0]).drop(['transactionmonth',
                   'transactionmonthlag',
                   'calculatedfinishedsquarefeet_t',
                   'taxvaluedollarcnt_t'
                  ], axis=1, inplace = True)

In [9]:
#Binary
features_binary = {'hashottuborspa',
                'pooltypeid10',
                'pooltypeid2',
                'pooltypeid7',
                'fireplaceflag',
                'taxdelinquencyflag'}

# Types
features_types = {'airconditioningtypeid',
                  'architecturalstyletypeid',
                  'buildingqualitytypeid',
                  'buildingclasstypeid',
                  'decktypeid',
                  'heatingorsystemtypeid',
                  'propertycountylandusecode',
                  'propertylandusetypeid',
                  'propertyzoningdesc',
                  'typeconstructiontypeid',
                  'storytypeid'}

In [10]:
def createfeatures(dfs, features):
    functions = {
                'taxrate': add_taxrate,
                #'transactionmonth': add_transactionmonth,
                #'distanceavg':add_distanceavg                 
                } 
    for f in features:
        functions[f](dfs)
        
    features = ['bedroomcnt',
            'bathroomcnt',
            'taxvaluedollarcnt_t',
            'regionidzip'
           ]
    
    add_transactioncnt(dfs, features, 3)
    add_transactioncnt(dfs, features, 4)
    add_transactioncnt(dfs, features, 5)
    
    #dfs[0]['transactionmonth']    = pd.to_datetime((dfs[0])['transactiondate']).dt.month
    #dfs[:] = [dfs[0][dfs[0]['transactionmonth'] > 6]]
    #(dfs[0]).drop('transactionmonth', axis = 1, inplace=True)
    

In [11]:
dfs = [train]
# Add custom features
createfeatures(dfs,[
               'taxrate'
                #,'distanceavg'
              ])
#createfeaturesnull(train)
train = dfs[0]

for f in features_binary:
    train[f] = pd.Categorical(train[f]).codes

for f in features_types:
    train[f] = pd.Categorical(train[f]).codes




In [12]:
train = train.fillna(-1)
train_full = train

##  Feature selection

In [121]:
features_to_keep = set([
'logerror',
'transactiondate',
'bedroomcnt',
 'buildingqualitytypeid',
 'calculatedbathnbr',
 'calculatedfinishedsquarefeet',
 'censustractandblock',
 'finishedsquarefeet12',
 'fullbathcnt',
 'garagetotalsqft',
 'landtaxvaluedollarcnt',
 'latitude',
 'longitude',
 'lotsizesquarefeet',
 'poolcnt',
 'propertycountylandusecode',
 'propertylandusetypeid',
 'propertyzoningdesc',
 'rawcensustractandblock',
 'regionidcity',
 'regionidneighborhood',
 'regionidzip',
 'structuretaxvaluedollarcnt',
 'taxamount',
 'taxrate',
 'taxvaluedollarcnt',
 'threequarterbathnbr',
 'transactioncnt3',
 'transactioncnt4',
 'transactioncnt5',
 'yearbuilt'
        ])

In [122]:
train_filtered = train_full
for column in train_full.columns:
    if column not in features_to_keep:
        train_filtered = train_filtered.drop(column, axis=1)

In [123]:
train_filtered.columns

Index([u'logerror', u'transactiondate', u'bedroomcnt',
       u'buildingqualitytypeid', u'calculatedbathnbr',
       u'calculatedfinishedsquarefeet', u'finishedsquarefeet12',
       u'fullbathcnt', u'garagetotalsqft', u'latitude', u'longitude',
       u'lotsizesquarefeet', u'poolcnt', u'propertycountylandusecode',
       u'propertylandusetypeid', u'propertyzoningdesc',
       u'rawcensustractandblock', u'regionidcity', u'regionidneighborhood',
       u'regionidzip', u'threequarterbathnbr', u'yearbuilt',
       u'structuretaxvaluedollarcnt', u'taxvaluedollarcnt',
       u'landtaxvaluedollarcnt', u'taxamount', u'censustractandblock',
       u'taxrate', u'transactioncnt3', u'transactioncnt4', u'transactioncnt5'],
      dtype='object')

## Sample

In [124]:
def getsample(pcnt): 
    return train_filtered.sample(frac=pcnt)

In [137]:
train_sample = getsample(.2)

In [138]:
for column in train_sample.columns:
    if column not in features_to_keep:
        train_sample.drop(column, axis=1, inplace = True)

In [139]:
train_sample['transactionmonth'] = pd.to_datetime(train_sample['transactiondate']).dt.month
train_sample = train_sample[train_sample['transactionmonth'] > 6]
train_sample.drop(['transactionmonth', 'transactiondate'], axis=1, inplace=True)

In [128]:
Y = train_sample[['logerror']]
X = train_sample.drop('logerror',axis=1)

## Modeling

In [130]:
import math 

max_depth = None
min_samples_split = int(round(len(X)/6, 0))
min_samples_leaf = int(round(len(X)/60, 0))

print("min_samples_split =", min_samples_split)
print("min_samples_leaf =", min_samples_leaf)

#model = DecisionTreeRegressor(criterion = "mae",
#                             #min_samples_split = min_samples_split,
#                              min_samples_leaf = min_samples_leaf,
#                              )

n_jobs = int(len(X.columns)/2)
model = RandomForestRegressor(criterion = "mae",
                             #min_samples_split = min_samples_split,
                              min_samples_leaf = min_samples_leaf,
                              n_jobs = n_jobs,
                              n_estimators = n_jobs, warm_start = False
                              )

('min_samples_split =', 638)
('min_samples_leaf =', 63)


In [131]:
len(train_sample)

3829

In [132]:
# Fit regression model
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.9)

In [133]:
model.fit(X_train, Y_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=63, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=29, n_jobs=29,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [134]:
# Feature importance
sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), X_train.columns), 
             reverse=True)

[(0.1215, 'taxrate'),
 (0.1148, 'calculatedfinishedsquarefeet'),
 (0.1038, 'lotsizesquarefeet'),
 (0.0763, 'finishedsquarefeet12'),
 (0.075, 'yearbuilt'),
 (0.0689, 'taxamount'),
 (0.0665, 'structuretaxvaluedollarcnt'),
 (0.0557, 'regionidzip'),
 (0.0422, 'longitude'),
 (0.0354, 'latitude'),
 (0.0303, 'landtaxvaluedollarcnt'),
 (0.0284, 'censustractandblock'),
 (0.0254, 'rawcensustractandblock'),
 (0.0247, 'regionidcity'),
 (0.0235, 'propertyzoningdesc'),
 (0.0233, 'regionidneighborhood'),
 (0.0221, 'poolcnt'),
 (0.0185, 'taxvaluedollarcnt'),
 (0.0108, 'bedroomcnt'),
 (0.0086, 'buildingqualitytypeid'),
 (0.0052, 'transactioncnt3'),
 (0.0041, 'propertycountylandusecode'),
 (0.0039, 'propertylandusetypeid'),
 (0.0031, 'transactioncnt4'),
 (0.0025, 'garagetotalsqft'),
 (0.0022, 'fullbathcnt'),
 (0.002, 'transactioncnt5'),
 (0.0012, 'calculatedbathnbr'),
 (0.0, 'threequarterbathnbr')]

In [135]:
# Zillow MAE (test)
Y_predict=model.predict(X_test)
print("MAE: ",metrics.mean_absolute_error(Y_test['logerror'], Y_predict))

('MAE: ', 0.069170887728459529)


In [136]:
# Model statistics
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, Y.values.ravel(), cv=5, n_jobs = 5, scoring='neg_mean_absolute_error')
print(scores, scores.mean())

(array([-0.06700112, -0.06646446, -0.06498618, -0.06353472, -0.05960986]), -0.064319265201065567)


##  Learning Curve

In [None]:
title = "Learning Curves (DecisionTreeRegressor)"
# Cross validation with 5 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
plot_learning_curve(model, title, X, Y, ylim=(0.0, 1.01), cv=cv, n_jobs=25)
plt.show()

##  Feature / parameter search

In [28]:
len(X.columns)

59

In [34]:
# Feature search
from sklearn.feature_selection import RFE
rfe = RFE(model, n_features_to_select=15, step= 5)
fit = rfe.fit(X, Y.values.ravel())
print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_

Num Features: 15
Selected Features: [False False False False  True  True  True  True  True  True  True  True
  True False False False False False False False False False  True False
  True False False False False False False False False False False False
 False False False False False False False False False  True False False
 False False False False  True False False False  True False  True]
Feature Ranking: [10  9  8  7  1  1  1  1  1  1  1  1  1  2  2  3  2  3  4  3  5  3  1  6  1
  7  7  8  9  9 10 10 10 10  9  9  8  8  8  7  7  3  4  4  4  1  4  5  5  5
  5  6  1  6  6  6  1  2  1]


In [35]:
X.columns[fit.support_]

Index([u'bedroomcnt', u'buildingclasstypeid', u'buildingqualitytypeid',
       u'calculatedbathnbr', u'decktypeid', u'finishedfloor1squarefeet',
       u'calculatedfinishedsquarefeet', u'finishedsquarefeet12',
       u'finishedsquarefeet13', u'latitude', u'lotsizesquarefeet',
       u'yearbuilt', u'taxamount', u'taxrate', u'distanceavg'],
      dtype='object')

In [414]:
# Parameter search
parameters = {#'max_depth':[], 
             'min_samples_leaf':[
                                len(X_train)/50,
                               # len(X_train)/40,
                               # len(X_train)/30
                                ],
              'min_samples_split':[
                                len(X_train)/6,
                               # len(X_train)/5,
                               # len(X_train)/3,
                                ]}
clf = GridSearchCV(model, parameters,n_jobs = 9)
clf.fit(X_train,Y_train)
sorted(clf.cv_results_.items())

[('mean_fit_time',
  array([ 8.54101006,  8.22041893,  7.45506032,  8.2014637 ,  7.9700431 ,
          7.32949098,  7.22526201,  6.77333744,  6.08810465])),
 ('mean_score_time',
  array([ 0.00123072,  0.00136709,  0.00117199,  0.00130169,  0.00114028,
          0.00107137,  0.00096138,  0.0010856 ,  0.00102401])),
 ('mean_test_score',
  array([ 0.00122942,  0.00107268,  0.00105931,  0.00082099,  0.00096201,
          0.00092878,  0.00081357,  0.00089703,  0.00074073])),
 ('mean_train_score',
  array([ 0.00253102,  0.00228528,  0.00176478,  0.00231324,  0.00217302,
          0.00172889,  0.0018033 ,  0.00149361,  0.00107885])),
 ('param_min_samples_leaf',
  masked_array(data = [324 324 324 406 406 406 541 541 541],
               mask = [False False False False False False False False False],
         fill_value = ?)),
 ('param_min_samples_split',
  masked_array(data = [2708 3249 5416 2708 3249 5416 2708 3249 5416],
               mask = [False False False False False False False False 

In [624]:
#Decision_Tree_Image(model, X_train.columns)

## Model evaluation

In [189]:
print ("Features: ", X.columns)
print ("Tree params: ")
print ("   max_depth ", max_depth)
print ("   min_samples_split ", min_samples_split)
print ("   min_samples_leaf ", min_samples_leaf)
print ("Result: ",metrics.mean_absolute_error(Y_test['logerror'], Y_predict))

('Features: ', Index([u'airconditioningtypeid', u'architecturalstyletypeid', u'basementsqft',
       u'bathroomcnt', u'bedroomcnt', u'buildingclasstypeid',
       u'buildingqualitytypeid', u'calculatedbathnbr', u'decktypeid',
       u'finishedfloor1squarefeet', u'calculatedfinishedsquarefeet',
       u'finishedsquarefeet12', u'finishedsquarefeet13',
       u'finishedsquarefeet15', u'finishedsquarefeet50',
       u'finishedsquarefeet6', u'fireplacecnt', u'fullbathcnt',
       u'garagecarcnt', u'garagetotalsqft', u'hashottuborspa',
       u'heatingorsystemtypeid', u'latitude', u'longitude',
       u'lotsizesquarefeet', u'poolcnt', u'poolsizesum', u'pooltypeid10',
       u'pooltypeid2', u'pooltypeid7', u'propertycountylandusecode',
       u'propertylandusetypeid', u'propertyzoningdesc',
       u'rawcensustractandblock', u'regionidcity', u'regionidcounty',
       u'regionidneighborhood', u'regionidzip', u'roomcnt', u'storytypeid',
       u'threequarterbathnbr', u'typeconstructiontypeid', u

## Model Execution

In [70]:
sample_submission_f = 'sample_submission.csv'
submission = pd.read_csv(sample_submission_f, index_col='ParcelId')

In [69]:
#  ATTEMPT 2 - 
logerroravg = train_bak['logerror'].mean()
print("Using logerroravg = ", logerroravg)

('Using logerroravg = ', 0.011457219606756575)


In [55]:
X_all = properties

In [56]:
# Prediction transaction date
def add_transactiondate(df):
    df['transactiondate'] = pd.datetime(2016,10,1)
    df['transactiondate'] = df['transactiondate'].astype(int)

add_transactiondate(X_all)

In [None]:
# Add features

dfs = [X_all]
# Add custom features
createfeatures(dfs,[
               'taxrate',
               #'distanceavg'
              ])
#createfeaturesnull(train)
X_all = dfs[0]

for f in features_binary:
    X_all[f] = pd.Categorical(X_all[f]).codes

for f in features_types:
    X_all[f] = pd.Categorical(X_all[f]).codes

X_all.fillna(-1, inplace=True)

In [61]:
features_to_keep

{u'bathroomcnt',
 u'buildingqualitytypeid',
 u'calculatedbathnbr',
 u'calculatedfinishedsquarefeet',
 u'latitude',
 u'logerror',
 u'longitude',
 u'pooltypeid2',
 u'pooltypeid7',
 u'propertycountylandusecode',
 u'propertylandusetypeid',
 u'propertyzoningdesc',
 u'taxdelinquencyflag',
 u'taxrate',
 u'transactioncnt3',
 u'transactioncnt4',
 u'transactioncnt5',
 u'transactioncnt6'}

In [62]:
for column in X_all.columns:
    if column not in features_to_keep:
        X_all.drop(column, axis=1, inplace=True)

In [63]:
X_all.columns

Index([u'bathroomcnt', u'buildingqualitytypeid', u'calculatedbathnbr',
       u'calculatedfinishedsquarefeet', u'latitude', u'longitude',
       u'pooltypeid2', u'pooltypeid7', u'propertycountylandusecode',
       u'propertylandusetypeid', u'propertyzoningdesc', u'taxdelinquencyflag',
       u'taxrate', u'transactioncnt3', u'transactioncnt4', u'transactioncnt5',
       u'transactioncnt6'],
      dtype='object')

In [64]:
#  Check for missing data
print("Missing: ",X_test.columns.difference( X_all.columns))

('Missing: ', Index([], dtype='object'))


In [65]:
Y_all = model.predict(X_all)

In [66]:
results = pd.DataFrame(index=X_all.index)
results.index.names = ['ParcelId']
results['201610'] = Y_all
results['201611'] = Y_all
results['201612'] = Y_all
results['201710'] = Y_all
results['201711'] = Y_all
results['201712'] = Y_all

In [71]:
submission = submission.drop(submission.columns[0:], axis=1)
submission = submission.join(results)

# Use average for properties with missing data
submission.fillna(logerroravg, inplace=True)       
submission.columns

Index([u'201610', u'201611', u'201612', u'201710', u'201711', u'201712'], dtype='object')

In [72]:
# Round as per rules
submission = submission.round(4)

## Sanity Checks

In [73]:
# Average log error
submission.describe().round(4)

Unnamed: 0,201610,201611,201612,201710,201711,201712
count,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0
mean,0.0,0.0,0.0,0.0,0.0,0.0
std,0.01,0.01,0.01,0.01,0.01,0.01
min,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.01,0.01,0.01,0.01,0.01,0.01
max,0.03,0.03,0.03,0.03,0.03,0.03


In [74]:
# Check for NaN
submission.isnull().sum()

201610    0
201611    0
201612    0
201710    0
201711    0
201712    0
dtype: int64

In [75]:
# Check if any duplicates
submission[submission.index.duplicated(keep=False)]

Unnamed: 0_level_0,201610,201611,201612,201710,201711,201712
ParcelId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [76]:
# Check additional values in submission file
submission[~submission.index.isin(properties.index)]

Unnamed: 0_level_0,201610,201611,201612,201710,201711,201712
ParcelId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [77]:
# Check additional values in properties file
properties[~properties.index.isin(submission.index)]

Unnamed: 0_level_0,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate,taxrate,transactionmonth,transactionmonthlag,calculatedfinishedsquarefeet_t
parcelid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1


## Dump File

In [78]:
# Write file
submission_f = 'submission.csv'
submission_fh = open(submission_f, 'wb')
submission.to_csv(submission_fh, sep=',', header='true')
submission_fh.close()

## Historical Records

## Resources

Conclusions:
   * Average logerror = 0.011457219606756575
   * In general, data usually underestimates, with the etrainception of fewer high overestimates.
   * totalroomcnt:  ==0 is a mitrained bag.  != zero is overestimating

In [None]:
# bedroomcnt
#plt.trainlabel('bedroomcnt')
#plt.ylabel('logerror')
#plt.scatter(train['bathroomcnt'], train['logerror'], 
#           alpha=1, s=2, color='r')
#plt.atrainis([0, 10,-5, 5])
#plt.show()

#date
#plt.trainlabel('transactiondate')
#plt.ylabel('logerror')
#plt.scatter(train['bedroomcnt'], train['logerror'], alpha=1, s=2, color='r')
#plt.atrainis(['2016-01-01', '2017-01-01',-5, 5])
#plt.gcf().autofmt_traindate()
#plt.show()

#import matplotlib as mpl
#plt.trainlabel('longitude')
#plt.ylabel('latitude')
#plt.scatter(train['longitude'], train['latitude'], c=train['logerror'], s=20, lw=0, cmap='seismic')
#plt.atrainis([min(train['longitude']), matrain(train['longitude']),min(train['latitude']), matrain(train['latitude'])])
#plt.colorbar()
#plt.show()

#train2 = train.query('logerror < -2.')

#train2 = train.query('roomcnt == 0 ')

#train2 = train.query('bedroomcnt > 2')

#train2 = train.sample(frac=.10)

#plt.trainlabel('longitude')
#plt.ylabel('latitude')
#plt.scatter(train2['longitude'], train2['latitude'], c=train2['logerror'], s=20, lw=0, cmap='seismic')
#plt.colorbar()
#plt.show()