# Fire up GraphLab Create

We always start with this line before using any part of GraphLab Create

In [1]:
import graphlab

# Load two tabular datasets
* The first dataset is almost 3 million properties in the Los Angeles area, over 50 features for each
* The other dataset covers the subset of those 3 million properties that were sold; for each one it gives the log-error between the predicted price and the actual sale price, along with the transaction date.

In [4]:
# Load the property-feature table and the log-error table from CSV.
# SFrame infers each column's type from the first lines of the file
# (see the "Inferred types" messages printed below).
properties = graphlab.SFrame('properties_2016.csv')
p_with_error = graphlab.SFrame('train_2016.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,long,str,str,float,float,long,long,float,str,str,float,long,str,long,str,str,long,str,long,str,str,str,long,long,long,float,str,str,str,str,str,str,long,str,float,long,long,long,long,float,str,str,str,long,str,str,float,long,str,float,float,long,float,float,str,long,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,float,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [5]:
# Open the full property table in GraphLab Canvas (served in the web browser).
properties.show()

Canvas is accessible via web browser at the URL: http://localhost:52192/index.html
Opening Canvas in default web browser.


### Check out the first few lines of the log-error dataset

In [15]:
p_with_error  # evaluating the SFrame as the cell's last expression displays its first few rows

parcelid,logerror,transactiondate
11016594,0.0276,2016-01-01
14366692,-0.1684,2016-01-01
12098116,-0.004,2016-01-01
12643413,0.0218,2016-01-02
14432541,-0.005,2016-01-02
11509835,-0.2705,2016-01-02
12286022,0.044,2016-01-02
17177301,0.1638,2016-01-02
14739064,-0.003,2016-01-02
14677559,0.0843,2016-01-03


### Join features to log-errors and transaction dates
* Let's do an inner join of these two datasets, using `parcelid` as the join column. The result will contain only the properties that were sold, and each row will carry the features from the first dataset plus the log-error and transaction date from the second.

In [3]:
# Inner join on parcelid: only parcels present in both tables are kept.
features_plus_error = properties.join(p_with_error, on='parcelid',how='inner')

### Save some memory
* Now that we have our main dataset, let's free up some memory.  We can always load the csv files again later if needed.

In [4]:
# Drop the references to the two source SFrames; only the joined table is used from here on.
del properties
del p_with_error

### Let's do some data cleaning, as annotated in the activity log



In [5]:
# Replace missing airconditioningtypeid values with 0.
features_plus_error = features_plus_error.fillna('airconditioningtypeid',0)

In [None]:
# Save the SFrame in binary form under the name 'features_with_error'
# so it can be reloaded later without repeating the steps above.
features_plus_error.save('features_with_error')

In [9]:
# The in-memory object can be deleted and reloaded from the saved copy.
# This lets us skip the earlier processing steps after a shutdown.
del features_plus_error


In [2]:
# Reload the SFrame that was saved under the name 'features_with_error'.
features_plus_error = graphlab.SFrame('features_with_error')

This non-commercial license of GraphLab Create for academic use is assigned to harleyyesm@gmail.com and will expire on May 30, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\harleyjj\AppData\Local\Temp\graphlab_server_1497090905.log.0


In [3]:
# Using .apply() on a single column is much quicker than applying it to the
# whole SFrame, as shown in the course video.
# Cast airconditioningtypeid to str.
features_plus_error['airconditioningtypeid'] = features_plus_error['airconditioningtypeid'].apply(lambda x: str(x))

In [4]:
# Opens GraphLab Canvas in another browser tab, which offers data exploration tools.
features_plus_error.show()

Canvas is accessible via web browser at the URL: http://localhost:50872/index.html
Opening Canvas in default web browser.


In [5]:
def clean(a):
    """Map the empty string to '0'; return any other value unchanged."""
    return '0' if a == '' else a

# Replace empty-string entries of architecturalstyletypeid with '0'.
features_plus_error['architecturalstyletypeid'] = features_plus_error['architecturalstyletypeid'].apply(lambda x: clean(x))

In [6]:
# Columns whose empty-string entries are replaced by '0' with clean()
# in the loop that follows.
none_as_string = ['basementsqft', 'decktypeid', 'finishedfloor1squarefeet',
                 'finishedsquarefeet13', 'finishedsquarefeet50', 
                 'finishedsquarefeet6', 'fireplacecnt', 'poolcnt',
                 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
                 'storytypeid', 'threequarterbathnbr', 
                 'typeconstructiontypeid', 'yardbuildingsqft17',
                 'yardbuildingsqft26', 'fireplaceflag', 'taxdelinquencyflag',
                 'censustractandblock']

In [7]:
# Apply clean() to every column listed in none_as_string.
# Note: the loop variable `e` stays bound at module level after the loop ends.
for e in none_as_string:
    features_plus_error[e] = features_plus_error[e].apply(lambda x: clean(x))

In [8]:
# Replace empty-string entries of hashottuborspa with '0'.
features_plus_error['hashottuborspa'] = features_plus_error['hashottuborspa'].apply(lambda x: clean(x))

In [9]:
# Count columns whose missing values are filled with -1, which keeps
# "unknown" distinct from a real count of 0.
none_to_neg = ['bathroomcnt', 'bedroomcnt', 'roomcnt']
for e in none_to_neg:
    features_plus_error = features_plus_error.fillna(e,-1)

In [10]:
# Columns whose missing values are filled with 0.
# NOTE(review): for latitude, longitude and yearbuilt, 0 is not a plausible real
# value, so it acts as a sentinel that downstream models will see — confirm this
# is intended.
none_to_zero = ['buildingqualitytypeid', 'calculatedbathnbr', 
               'finishedsquarefeet12', 'finishedsquarefeet15',
               'fips', 'fullbathcnt', 'heatingorsystemtypeid', 'latitude',
               'longitude', 'lotsizesquarefeet', 'propertylandusetypeid',
               'rawcensustractandblock', 'regionidcity', 'regionidcounty',
               'regionidneighborhood', 'regionidzip', 'unitcnt', 
               'yearbuilt', 'numberofstories',
                'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
               'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount',
               'taxdelinquencyyear']
for e in none_to_zero:
    features_plus_error = features_plus_error.fillna(e, 0)

In [11]:
# Replace missing calculatedfinishedsquarefeet values with 0.
features_plus_error = features_plus_error.fillna('calculatedfinishedsquarefeet',0)
def none_to_x(val, x):
    """Return the replacement `x` when `val` is missing, otherwise `val` itself.

    A value is treated as missing when it is the empty string or None.

    Parameters
    ----------
    val : the cell value to inspect.
    x : the replacement returned for a missing value.

    NOTE(review): SArray.apply is documented to skip missing values by default
    (skip_undefined=True), so a caller must pass skip_undefined=False for the
    None case to be reached — confirm against the GraphLab version in use.
    """
    # Check None first so the equality test never runs on a missing value.
    if val is None or val == '':
        return x
    return val

# Replace empty-string garage values with '-1' and empty zoning descriptions with 'NA'.
# NOTE(review): tables displayed later in this notebook still show propertyzoningdesc
# as blank. Possibly its missing entries are None rather than '', in which case
# none_to_x never replaces them — confirm.
features_plus_error['garagecarcnt'] = features_plus_error['garagecarcnt'].apply(lambda x: none_to_x(x,'-1'))
features_plus_error['garagetotalsqft'] = features_plus_error['garagetotalsqft'].apply(lambda x: none_to_x(x,'-1'))
features_plus_error['propertyzoningdesc'] = features_plus_error['propertyzoningdesc'].apply(lambda x: none_to_x(x,'NA'))

In [22]:
# Columns that are cast to int in the loop that follows.
convert_to_int = ['basementsqft','bedroomcnt', 'finishedfloor1squarefeet',
                 'calculatedfinishedsquarefeet', 'finishedsquarefeet13',
                 'finishedsquarefeet50', 'finishedsquarefeet6',
                 'fireplacecnt', 'garagecarcnt', 'garagetotalsqft',
                 'lotsizesquarefeet', 'poolcnt', 'poolsizesum',
                 'threequarterbathnbr', 'yardbuildingsqft17',
                 'yardbuildingsqft26']

In [23]:
# Cast every column listed in convert_to_int to int, one column at a time.
for e in convert_to_int:
    features_plus_error[e] = features_plus_error[e].apply(lambda x: int(x))

In [16]:
# Drop the buildingclasstypeid column; the returned SFrame is displayed below.
features_plus_error.remove_column('buildingclasstypeid')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.5,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.5,2,0
17109604,0,0,0,2.5,4,0
17125829,0,0,0,2.5,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.5,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

calculatedbathnbr,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet12,finishedsquarefeet13
2.5,0,548,1264,1264,0
1.0,0,777,777,777,0
2.0,0,1101,1101,1101,0
1.5,0,1554,1554,1554,0
2.5,0,1305,2415,2415,0
2.5,66,1303,2882,2882,0
2.0,0,1772,1772,1772,0
2.5,0,1240,2632,2632,0
2.0,0,1292,1292,1292,0
1.0,0,804,1385,1385,0

finishedsquarefeet15,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt
0,548,0,6111,0,2,2
0,777,0,6111,0,1,1
0,1101,0,6111,0,2,2
0,1554,0,6111,1,1,2
0,1305,0,6111,1,2,2
0,1303,0,6111,1,2,2
0,1772,0,6111,1,2,2
0,1240,0,6111,1,2,2
0,1292,0,6111,0,2,2
0,804,0,6111,1,1,1

garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum
0,0,0,34303597,-119287236,1735,0,0
0,0,0,34272866,-119198911,0,0,0
441,0,0,34340801,-119079610,6569,0,0
460,0,0,34354313,-119076405,7400,0,0
665,0,0,34266578,-119165392,6326,0,0
473,0,0,34240014,-119024793,10000,0,0
467,0,0,34226842,-119059815,8059,0,0
440,0,0,34229816,-119050224,7602,1,800
494,0,0,34226351,-118983853,7405,0,0
253,0,0,34179289,-119169287,6000,0,0

pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc
0,0,0,1128,265,
0,0,0,1129,266,
0,0,0,1111,261,
0,0,0,1110,261,
0,0,0,1111,261,
0,0,0,1111,261,
0,0,0,1111,261,
0,0,1,1111,261,
0,0,0,1111,261,
0,0,0,1111,261,

rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,...
61110022.003,34543,2061,0,97081,5.0,...
61110015.031,34543,2061,0,97083,4.0,...
61110007.011,26965,2061,0,97113,5.0,...
61110008.002,26965,2061,0,97113,5.0,...
61110014.021,34543,2061,0,97084,8.0,...
61110052.022,51239,2061,0,97089,8.0,...
61110055.032,51239,2061,0,97089,6.0,...
61110055.041,51239,2061,0,97089,8.0,...
61110053.041,51239,2061,0,97091,6.0,...
61110039.002,13150,2061,0,97104,6.0,...


In [17]:
# ID/code columns that are cast to str, presumably so they are treated as
# categorical rather than numeric — confirm.
convert_to_string = ['buildingqualitytypeid', 'fips', 'heatingorsystemtypeid',
                    'propertylandusetypeid', 'regionidcounty']

for e in convert_to_string:
    features_plus_error[e] = features_plus_error[e].apply(lambda x: str(x))

In [30]:
# Columns that are collapsed to a zero / non-zero flag by to_binary() below.
# NOTE(review): assessmentyear is in this list, so every non-zero year becomes the
# same flag value and the year itself is lost — confirm this is intended.
convert_to_binary = ['decktypeid', 'hashottuborspa', 'pooltypeid10',
                    'pooltypeid2', 'pooltypeid7', 'fireplaceflag', 
                    'assessmentyear', 'taxdelinquencyflag', 'storytypeid']

def to_binary(val):
    """Return False for the string '0' or a numeric zero, and True for anything else."""
    is_zero = (val == '0') or (val == 0)
    return not is_zero

# Apply to_binary() to every column listed in convert_to_binary.
for e in convert_to_binary:
    features_plus_error[e] = features_plus_error[e].apply(lambda x: to_binary(x))

In [35]:
# Checkpoint the cleaned SFrame under the same name used for the earlier save.
features_plus_error.save('features_with_error')

In [10]:
# Inspect the cleaned table in GraphLab Canvas.
features_plus_error.show()

Canvas is accessible via web browser at the URL: http://localhost:52192/index.html
Opening Canvas in default web browser.


In [9]:
# The four bathroom-related columns that merge_baths() combines below.
b = features_plus_error['bathroomcnt']
c = features_plus_error['calculatedbathnbr']
f = features_plus_error['fullbathcnt']
t = features_plus_error['threequarterbathnbr']

def merge_baths(b, c, f, t):
    """Combine the bathroom columns into a single float SArray.

    For each row the result is the largest of `b`, `c` and `f + 0.75 * t`.
    When `b` is -1 (the fill value used for a missing bathroomcnt) and none
    of the alternatives is positive, the row stays at -1.

    Parameters
    ----------
    b, c, f, t : equally long iterables of numbers, in the order
        bathroomcnt, calculatedbathnbr, fullbathcnt, threequarterbathnbr.

    Returns
    -------
    graphlab.SArray of dtype float, one value per input row.
    """
    data = []
    # Iterate the columns in lockstep rather than indexing each one per row.
    for bath, calc, full, three_quarter in zip(b, c, f, t):
        best = max(bath, calc, full + 0.75 * three_quarter)
        # Keep the "unknown" marker when nothing positive is available.
        if bath == -1 and not (best > 0):
            best = -1
        data.append(best)
    return graphlab.SArray(data=data,dtype=float)

# Overwrite bathroomcnt with the merged bathroom count.
features_plus_error['bathroomcnt'] = merge_baths(b, c, f, t)

In [11]:
# Drop calculatedbathnbr, which was an input to the merged bathroomcnt.
features_plus_error.remove_column('calculatedbathnbr')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15
0,548,1264,1264,0,0
0,777,777,777,0,0
0,1101,1101,1101,0,0
0,1554,1554,1554,0,0
0,1305,2415,2415,0,0
1,1303,2882,2882,0,0
0,1772,1772,1772,0,0
0,1240,2632,2632,0,0
0,1292,1292,1292,0,0
0,804,1385,1385,0,0

finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa
548,0,6111,0,2,2,0,0
777,0,6111,0,1,1,0,0
1101,0,6111,0,2,2,441,0
1554,0,6111,1,1,2,460,0
1305,0,6111,1,2,2,665,0
1303,0,6111,1,2,2,473,0
1772,0,6111,1,2,2,467,0
1240,0,6111,1,2,2,440,0
1292,0,6111,0,2,2,494,0
804,0,6111,1,1,1,253,0

heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2
0,34303597,-119287236,1735,0,0,0,0
0,34272866,-119198911,0,0,0,0,0
0,34340801,-119079610,6569,0,0,0,0
0,34354313,-119076405,7400,0,0,0,0
0,34266578,-119165392,6326,0,0,0,0
0,34240014,-119024793,10000,0,0,0,0
0,34226842,-119059815,8059,0,0,0,0
0,34229816,-119050224,7602,1,800,0,0
0,34226351,-118983853,7405,0,0,0,0
0,34179289,-119169287,6000,0,0,0,0

pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity
0,1128,265,,61110022.003,34543
0,1129,266,,61110015.031,34543
0,1111,261,,61110007.011,26965
0,1110,261,,61110008.002,26965
0,1111,261,,61110014.021,34543
0,1111,261,,61110052.022,51239
0,1111,261,,61110055.032,51239
1,1111,261,,61110055.041,51239
0,1111,261,,61110053.041,51239
0,1111,261,,61110039.002,13150

regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,...
2061,0,97081,5.0,0,...
2061,0,97083,4.0,0,...
2061,0,97113,5.0,0,...
2061,0,97113,5.0,0,...
2061,0,97084,8.0,0,...
2061,0,97089,8.0,0,...
2061,0,97089,6.0,0,...
2061,0,97089,8.0,0,...
2061,0,97091,6.0,0,...
2061,0,97104,6.0,0,...


In [12]:
# Drop fullbathcnt, which was an input to the merged bathroomcnt.
features_plus_error.remove_column('fullbathcnt')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15
0,548,1264,1264,0,0
0,777,777,777,0,0
0,1101,1101,1101,0,0
0,1554,1554,1554,0,0
0,1305,2415,2415,0,0
1,1303,2882,2882,0,0
0,1772,1772,1772,0,0
0,1240,2632,2632,0,0
0,1292,1292,1292,0,0
0,804,1385,1385,0,0

finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid
548,0,6111,0,2,0,0,0
777,0,6111,0,1,0,0,0
1101,0,6111,0,2,441,0,0
1554,0,6111,1,2,460,0,0
1305,0,6111,1,2,665,0,0
1303,0,6111,1,2,473,0,0
1772,0,6111,1,2,467,0,0
1240,0,6111,1,2,440,0,0
1292,0,6111,0,2,494,0,0
804,0,6111,1,1,253,0,0

latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode
34303597,-119287236,1735,0,0,0,0,0,1128
34272866,-119198911,0,0,0,0,0,0,1129
34340801,-119079610,6569,0,0,0,0,0,1111
34354313,-119076405,7400,0,0,0,0,0,1110
34266578,-119165392,6326,0,0,0,0,0,1111
34240014,-119024793,10000,0,0,0,0,0,1111
34226842,-119059815,8059,0,0,0,0,0,1111
34229816,-119050224,7602,1,800,0,0,1,1111
34226351,-118983853,7405,0,0,0,0,0,1111
34179289,-119169287,6000,0,0,0,0,0,1111

propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood
265,,61110022.003,34543,2061,0
266,,61110015.031,34543,2061,0
261,,61110007.011,26965,2061,0
261,,61110008.002,26965,2061,0
261,,61110014.021,34543,2061,0
261,,61110052.022,51239,2061,0
261,,61110055.032,51239,2061,0
261,,61110055.041,51239,2061,0
261,,61110053.041,51239,2061,0
261,,61110039.002,13150,2061,0

regionidzip,roomcnt,storytypeid,threequarterbathnbr,...
97081,5.0,0,1,...
97083,4.0,0,0,...
97113,5.0,0,0,...
97113,5.0,0,1,...
97084,8.0,0,1,...
97089,8.0,0,1,...
97089,6.0,0,0,...
97089,8.0,0,1,...
97091,6.0,0,0,...
97104,6.0,0,0,...


In [13]:
# Drop threequarterbathnbr, which was an input to the merged bathroomcnt.
features_plus_error.remove_column('threequarterbathnbr')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15
0,548,1264,1264,0,0
0,777,777,777,0,0
0,1101,1101,1101,0,0
0,1554,1554,1554,0,0
0,1305,2415,2415,0,0
1,1303,2882,2882,0,0
0,1772,1772,1772,0,0
0,1240,2632,2632,0,0
0,1292,1292,1292,0,0
0,804,1385,1385,0,0

finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid
548,0,6111,0,2,0,0,0
777,0,6111,0,1,0,0,0
1101,0,6111,0,2,441,0,0
1554,0,6111,1,2,460,0,0
1305,0,6111,1,2,665,0,0
1303,0,6111,1,2,473,0,0
1772,0,6111,1,2,467,0,0
1240,0,6111,1,2,440,0,0
1292,0,6111,0,2,494,0,0
804,0,6111,1,1,253,0,0

latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode
34303597,-119287236,1735,0,0,0,0,0,1128
34272866,-119198911,0,0,0,0,0,0,1129
34340801,-119079610,6569,0,0,0,0,0,1111
34354313,-119076405,7400,0,0,0,0,0,1110
34266578,-119165392,6326,0,0,0,0,0,1111
34240014,-119024793,10000,0,0,0,0,0,1111
34226842,-119059815,8059,0,0,0,0,0,1111
34229816,-119050224,7602,1,800,0,0,1,1111
34226351,-118983853,7405,0,0,0,0,0,1111
34179289,-119169287,6000,0,0,0,0,0,1111

propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood
265,,61110022.003,34543,2061,0
266,,61110015.031,34543,2061,0
261,,61110007.011,26965,2061,0
261,,61110008.002,26965,2061,0
261,,61110014.021,34543,2061,0
261,,61110052.022,51239,2061,0
261,,61110055.032,51239,2061,0
261,,61110055.041,51239,2061,0
261,,61110053.041,51239,2061,0
261,,61110039.002,13150,2061,0

regionidzip,roomcnt,storytypeid,typeconstructiontypeid,...
97081,5.0,0,0,...
97083,4.0,0,0,...
97113,5.0,0,0,...
97113,5.0,0,0,...
97084,8.0,0,0,...
97089,8.0,0,0,...
97089,6.0,0,0,...
97089,8.0,0,0,...
97091,6.0,0,0,...
97104,6.0,0,0,...


In [14]:
# The two first-floor area columns that first_floor() combines in the next cell.
b = features_plus_error['finishedfloor1squarefeet']
c = features_plus_error['finishedsquarefeet50']
# Checkpoint the table (bathroom columns already merged) before further merging.
features_plus_error.save('features_with_error')

In [15]:
def first_floor(b,c):
    """Return the row-wise maximum of two equally long numeric columns.

    Parameters
    ----------
    b, c : iterables of numbers; in this notebook they are the columns
        finishedfloor1squarefeet and finishedsquarefeet50.

    Returns
    -------
    graphlab.SArray of dtype int, one value per input row.
    """
    # Iterate both columns in lockstep rather than indexing each one per row.
    data = [max(left, right) for left, right in zip(b, c)]
    return graphlab.SArray(data=data,dtype=int)

# Overwrite finishedfloor1squarefeet with the merged first-floor area.
features_plus_error['finishedfloor1squarefeet'] = first_floor(b,c)

In [17]:
# Drop finishedsquarefeet50, which was an input to the merged finishedfloor1squarefeet.
features_plus_error.remove_column('finishedsquarefeet50')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15
0,548,1264,1264,0,0
0,777,777,777,0,0
0,1101,1101,1101,0,0
0,1554,1554,1554,0,0
0,1305,2415,2415,0,0
1,1303,2882,2882,0,0
0,1772,1772,1772,0,0
0,1240,2632,2632,0,0
0,1292,1292,1292,0,0
0,804,1385,1385,0,0

finishedsquarefeet6,fips,fireplacecnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid
0,6111,0,2,0,0,0
0,6111,0,1,0,0,0
0,6111,0,2,441,0,0
0,6111,1,2,460,0,0
0,6111,1,2,665,0,0
0,6111,1,2,473,0,0
0,6111,1,2,467,0,0
0,6111,1,2,440,0,0
0,6111,0,2,494,0,0
0,6111,1,1,253,0,0

latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode
34303597,-119287236,1735,0,0,0,0,0,1128
34272866,-119198911,0,0,0,0,0,0,1129
34340801,-119079610,6569,0,0,0,0,0,1111
34354313,-119076405,7400,0,0,0,0,0,1110
34266578,-119165392,6326,0,0,0,0,0,1111
34240014,-119024793,10000,0,0,0,0,0,1111
34226842,-119059815,8059,0,0,0,0,0,1111
34229816,-119050224,7602,1,800,0,0,1,1111
34226351,-118983853,7405,0,0,0,0,0,1111
34179289,-119169287,6000,0,0,0,0,0,1111

propertylandusetypeid,propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood
265,,61110022.003,34543,2061,0
266,,61110015.031,34543,2061,0
261,,61110007.011,26965,2061,0
261,,61110008.002,26965,2061,0
261,,61110014.021,34543,2061,0
261,,61110052.022,51239,2061,0
261,,61110055.032,51239,2061,0
261,,61110055.041,51239,2061,0
261,,61110053.041,51239,2061,0
261,,61110039.002,13150,2061,0

regionidzip,roomcnt,storytypeid,typeconstructiontypeid,unitcnt,...
97081,5.0,0,0,0,...
97083,4.0,0,0,0,...
97113,5.0,0,0,0,...
97113,5.0,0,0,0,...
97084,8.0,0,0,0,...
97089,8.0,0,0,0,...
97089,6.0,0,0,0,...
97089,8.0,0,0,0,...
97091,6.0,0,0,0,...
97104,6.0,0,0,0,...


In [19]:
# Input columns for the merged finished-area feature.
# `b` is bound here explicitly rather than left over from an earlier cell, so the
# cell no longer depends on execution order. It is the current first-floor area
# column; the total finished area cannot be smaller than that.
b = features_plus_error['finishedfloor1squarefeet']
c = features_plus_error['finishedsquarefeet6']
f = features_plus_error['finishedsquarefeet12']
t = features_plus_error['finishedsquarefeet13']
d = features_plus_error['finishedsquarefeet15']
e = features_plus_error['calculatedfinishedsquarefeet']

def structure(b,c,d,e,f,t):
    """Return the row-wise maximum of six equally long numeric columns.

    Parameters
    ----------
    b, c, d, e, f, t : iterables of numbers, each one a finished-area column.

    Returns
    -------
    graphlab.SArray of dtype int, one value per input row.
    """
    # Iterate the columns in lockstep rather than indexing each one per row.
    data = [max(row) for row in zip(b, c, d, e, f, t)]
    return graphlab.SArray(data=data,dtype=int)

# Overwrite calculatedfinishedsquarefeet with the largest reported finished area.
features_plus_error['calculatedfinishedsquarefeet'] = structure(b,c,d,e,f,t)

In [21]:
# Drop finishedsquarefeet6, which was an input to the merged calculatedfinishedsquarefeet.
features_plus_error.remove_column('finishedsquarefeet6')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet12,finishedsquarefeet13,finishedsquarefeet15
0,548,1264,1264,0,0
0,777,777,777,0,0
0,1101,1101,1101,0,0
0,1554,1554,1554,0,0
0,1305,2415,2415,0,0
1,1303,2882,2882,0,0
0,1772,1772,1772,0,0
0,1240,2632,2632,0,0
0,1292,1292,1292,0,0
0,804,1385,1385,0,0

fips,fireplacecnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude
6111,0,2,0,0,0,34303597,-119287236
6111,0,1,0,0,0,34272866,-119198911
6111,0,2,441,0,0,34340801,-119079610
6111,1,2,460,0,0,34354313,-119076405
6111,1,2,665,0,0,34266578,-119165392
6111,1,2,473,0,0,34240014,-119024793
6111,1,2,467,0,0,34226842,-119059815
6111,1,2,440,0,0,34229816,-119050224
6111,0,2,494,0,0,34226351,-118983853
6111,1,1,253,0,0,34179289,-119169287

lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid
1735,0,0,0,0,0,1128,265
0,0,0,0,0,0,1129,266
6569,0,0,0,0,0,1111,261
7400,0,0,0,0,0,1110,261
6326,0,0,0,0,0,1111,261
10000,0,0,0,0,0,1111,261
8059,0,0,0,0,0,1111,261
7602,1,800,0,0,1,1111,261
7405,0,0,0,0,0,1111,261
6000,0,0,0,0,0,1111,261

propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt
,61110022.003,34543,2061,0,97081,5.0
,61110015.031,34543,2061,0,97083,4.0
,61110007.011,26965,2061,0,97113,5.0
,61110008.002,26965,2061,0,97113,5.0
,61110014.021,34543,2061,0,97084,8.0
,61110052.022,51239,2061,0,97089,8.0
,61110055.032,51239,2061,0,97089,6.0
,61110055.041,51239,2061,0,97089,8.0
,61110053.041,51239,2061,0,97091,6.0
,61110039.002,13150,2061,0,97104,6.0

storytypeid,typeconstructiontypeid,unitcnt,yardbuildingsqft17,...
0,0,0,128,...
0,0,0,198,...
0,0,0,0,...
0,0,0,0,...
0,0,0,0,...
0,0,0,0,...
0,0,0,1045,...
0,0,0,180,...
0,0,0,304,...
0,0,0,0,...


In [22]:
# Drop finishedsquarefeet12, which was an input to the merged calculatedfinishedsquarefeet.
features_plus_error.remove_column('finishedsquarefeet12')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet13,finishedsquarefeet15,fips
0,548,1264,0,0,6111
0,777,777,0,0,6111
0,1101,1101,0,0,6111
0,1554,1554,0,0,6111
0,1305,2415,0,0,6111
1,1303,2882,0,0,6111
0,1772,1772,0,0,6111
0,1240,2632,0,0,6111
0,1292,1292,0,0,6111
0,804,1385,0,0,6111

fireplacecnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet
0,2,0,0,0,34303597,-119287236,1735
0,1,0,0,0,34272866,-119198911,0
0,2,441,0,0,34340801,-119079610,6569
1,2,460,0,0,34354313,-119076405,7400
1,2,665,0,0,34266578,-119165392,6326
1,2,473,0,0,34240014,-119024793,10000
1,2,467,0,0,34226842,-119059815,8059
1,2,440,0,0,34229816,-119050224,7602
0,2,494,0,0,34226351,-118983853,7405
1,1,253,0,0,34179289,-119169287,6000

poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid
0,0,0,0,0,1128,265
0,0,0,0,0,1129,266
0,0,0,0,0,1111,261
0,0,0,0,0,1110,261
0,0,0,0,0,1111,261
0,0,0,0,0,1111,261
0,0,0,0,0,1111,261
1,800,0,0,1,1111,261
0,0,0,0,0,1111,261
0,0,0,0,0,1111,261

propertyzoningdesc,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt
,61110022.003,34543,2061,0,97081,5.0
,61110015.031,34543,2061,0,97083,4.0
,61110007.011,26965,2061,0,97113,5.0
,61110008.002,26965,2061,0,97113,5.0
,61110014.021,34543,2061,0,97084,8.0
,61110052.022,51239,2061,0,97089,8.0
,61110055.032,51239,2061,0,97089,6.0
,61110055.041,51239,2061,0,97089,8.0
,61110053.041,51239,2061,0,97091,6.0
,61110039.002,13150,2061,0,97104,6.0

storytypeid,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,...
0,0,0,128,0,...
0,0,0,198,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,1045,0,...
0,0,0,180,0,...
0,0,0,304,0,...
0,0,0,0,0,...


In [23]:
# Drop finishedsquarefeet13, which was an input to the merged calculatedfinishedsquarefeet.
features_plus_error.remove_column('finishedsquarefeet13')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,finishedsquarefeet15,fips,fireplacecnt,garagecarcnt
0,548,1264,0,6111,0,2
0,777,777,0,6111,0,1
0,1101,1101,0,6111,0,2
0,1554,1554,0,6111,1,2
0,1305,2415,0,6111,1,2
1,1303,2882,0,6111,1,2
0,1772,1772,0,6111,1,2
0,1240,2632,0,6111,1,2
0,1292,1292,0,6111,0,2
0,804,1385,0,6111,1,1

garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum
0,0,0,34303597,-119287236,1735,0,0
0,0,0,34272866,-119198911,0,0,0
441,0,0,34340801,-119079610,6569,0,0
460,0,0,34354313,-119076405,7400,0,0
665,0,0,34266578,-119165392,6326,0,0
473,0,0,34240014,-119024793,10000,0,0
467,0,0,34226842,-119059815,8059,0,0
440,0,0,34229816,-119050224,7602,1,800
494,0,0,34226351,-118983853,7405,0,0
253,0,0,34179289,-119169287,6000,0,0

pooltypeid10,pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc
0,0,0,1128,265,
0,0,0,1129,266,
0,0,0,1111,261,
0,0,0,1110,261,
0,0,0,1111,261,
0,0,0,1111,261,
0,0,0,1111,261,
0,0,1,1111,261,
0,0,0,1111,261,
0,0,0,1111,261,

rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid
61110022.003,34543,2061,0,97081,5.0,0
61110015.031,34543,2061,0,97083,4.0,0
61110007.011,26965,2061,0,97113,5.0,0
61110008.002,26965,2061,0,97113,5.0,0
61110014.021,34543,2061,0,97084,8.0,0
61110052.022,51239,2061,0,97089,8.0,0
61110055.032,51239,2061,0,97089,6.0,0
61110055.041,51239,2061,0,97089,8.0,0
61110053.041,51239,2061,0,97091,6.0,0
61110039.002,13150,2061,0,97104,6.0,0

typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,...
0,0,128,0,1986.0,...
0,0,198,0,1990.0,...
0,0,0,0,1956.0,...
0,0,0,0,1965.0,...
0,0,0,0,1984.0,...
0,0,0,0,1980.0,...
0,0,1045,0,1978.0,...
0,0,180,0,1971.0,...
0,0,304,0,1979.0,...
0,0,0,0,1950.0,...


In [24]:
# Drop finishedsquarefeet15, which was an input to the merged calculatedfinishedsquarefeet.
features_plus_error.remove_column('finishedsquarefeet15')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,fips,fireplacecnt,garagecarcnt,garagetotalsqft
0,548,1264,6111,0,2,0
0,777,777,6111,0,1,0
0,1101,1101,6111,0,2,441
0,1554,1554,6111,1,2,460
0,1305,2415,6111,1,2,665
1,1303,2882,6111,1,2,473
0,1772,1772,6111,1,2,467
0,1240,2632,6111,1,2,440
0,1292,1292,6111,0,2,494
0,804,1385,6111,1,1,253

hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10
0,0,34303597,-119287236,1735,0,0,0
0,0,34272866,-119198911,0,0,0,0
0,0,34340801,-119079610,6569,0,0,0
0,0,34354313,-119076405,7400,0,0,0
0,0,34266578,-119165392,6326,0,0,0
0,0,34240014,-119024793,10000,0,0,0
0,0,34226842,-119059815,8059,0,0,0
0,0,34229816,-119050224,7602,1,800,0
0,0,34226351,-118983853,7405,0,0,0
0,0,34179289,-119169287,6000,0,0,0

pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock
0,0,1128,265,,61110022.003
0,0,1129,266,,61110015.031
0,0,1111,261,,61110007.011
0,0,1110,261,,61110008.002
0,0,1111,261,,61110014.021
0,0,1111,261,,61110052.022
0,0,1111,261,,61110055.032
0,1,1111,261,,61110055.041
0,0,1111,261,,61110053.041
0,0,1111,261,,61110039.002

regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,typeconstructiontypeid
34543,2061,0,97081,5.0,0,0
34543,2061,0,97083,4.0,0,0
26965,2061,0,97113,5.0,0,0
26965,2061,0,97113,5.0,0,0
34543,2061,0,97084,8.0,0,0
51239,2061,0,97089,8.0,0,0
51239,2061,0,97089,6.0,0,0
51239,2061,0,97089,8.0,0,0
51239,2061,0,97091,6.0,0,0
13150,2061,0,97104,6.0,0,0

unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,...
0,128,0,1986.0,2,...
0,198,0,1990.0,1,...
0,0,0,1956.0,1,...
0,0,0,1965.0,1,...
0,0,0,1984.0,2,...
0,0,0,1980.0,2,...
0,1045,0,1978.0,1,...
0,180,0,1971.0,2,...
0,304,0,1979.0,1,...
0,0,0,1950.0,1,...


In [28]:
# Input columns for the merged lot-size feature.
b = features_plus_error['lotsizesquarefeet']
c = features_plus_error['poolsizesum']
d = features_plus_error['yardbuildingsqft17']
# `e` is bound here explicitly. Without this line it would be whatever an earlier
# cell last assigned, and other cells use `e` as a loop variable over column names.
# A lot cannot be smaller than the structure standing on it.
e = features_plus_error['calculatedfinishedsquarefeet']
f = features_plus_error['yardbuildingsqft26']

def lot_size(b,c,d,e,f):
    """Return the row-wise maximum of five equally long numeric columns.

    Parameters
    ----------
    b, c, d, e, f : iterables of numbers; in this notebook they are the lot
        size, pool size, yard-building and finished-area columns.

    Returns
    -------
    graphlab.SArray of dtype int, one value per input row.
    """
    # Iterate the columns in lockstep rather than indexing each one per row.
    data = [max(row) for row in zip(b, c, d, e, f)]
    return graphlab.SArray(data=data,dtype=int)

# Overwrite lotsizesquarefeet with the largest of the area figures above.
features_plus_error['lotsizesquarefeet'] = lot_size(b,c,d,e,f)

In [30]:
# Input columns for the pool indicator.
b = features_plus_error['pooltypeid2']
# `c` is bound here explicitly instead of being left over from an earlier cell:
# a positive pool size also indicates that a pool exists.
c = features_plus_error['poolsizesum']
d = features_plus_error['pooltypeid7']
e = features_plus_error['pooltypeid10']
f = features_plus_error['poolcnt']

def is_pool(b,c,d,e,f):
    """Flag each row as 1 when any of the five inputs is greater than zero, else 0.

    Parameters
    ----------
    b, c, d, e, f : equally long iterables of numbers, each a pool-related column.

    Returns
    -------
    graphlab.SArray of dtype int holding 0 or 1 for every row.
    """
    # any() stops at the first positive value, matching the original elif chain.
    data = [1 if any(value > 0 for value in row) else 0
            for row in zip(b, c, d, e, f)]
    return graphlab.SArray(data=data,dtype=int)

# Overwrite poolcnt with the 0/1 pool indicator.
features_plus_error['poolcnt'] = is_pool(b,c,d,e,f)

### Make a new feature from existing data

In [33]:
# Add a new column, abslogerror, holding the absolute value of logerror.
features_plus_error.add_column(features_plus_error['logerror'].apply(lambda x: abs(x)), name='abslogerror')

parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid
17073783,0,0,0,2.75,3,0
17088994,0,0,0,1.0,2,0
17100444,0,0,0,2.0,3,0
17102429,0,0,0,1.75,2,0
17109604,0,0,0,2.75,4,0
17125829,0,0,0,2.75,4,0
17132911,0,0,0,2.0,3,0
17134926,0,0,0,2.75,5,0
17139988,0,0,0,2.0,3,0
17167359,0,0,0,1.0,3,0

decktypeid,finishedfloor1squarefeet,calculatedfinishedsquaref eet ...,fips,fireplacecnt,garagecarcnt,garagetotalsqft
0,548,1264,6111,0,2,0
0,777,777,6111,0,1,0
0,1101,1101,6111,0,2,441
0,1554,1554,6111,1,2,460
0,1305,2415,6111,1,2,665
1,1303,2882,6111,1,2,473
0,1772,1772,6111,1,2,467
0,1240,2632,6111,1,2,440
0,1292,1292,6111,0,2,494
0,804,1385,6111,1,1,253

hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10
0,0,34303597,-119287236,1735,0,0,0
0,0,34272866,-119198911,777,0,0,0
0,0,34340801,-119079610,6569,0,0,0
0,0,34354313,-119076405,7400,0,0,0
0,0,34266578,-119165392,6326,0,0,0
0,0,34240014,-119024793,10000,0,0,0
0,0,34226842,-119059815,8059,0,0,0
0,0,34229816,-119050224,7602,1,800,0
0,0,34226351,-118983853,7405,0,0,0
0,0,34179289,-119169287,6000,0,0,0

pooltypeid2,pooltypeid7,propertycountylandusecode,propertylandusetypeid,propertyzoningdesc,rawcensustractandblock
0,0,1128,265,,61110022.003
0,0,1129,266,,61110015.031
0,0,1111,261,,61110007.011
0,0,1110,261,,61110008.002
0,0,1111,261,,61110014.021
0,0,1111,261,,61110052.022
0,0,1111,261,,61110055.032
0,1,1111,261,,61110055.041
0,0,1111,261,,61110053.041
0,0,1111,261,,61110039.002

regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,typeconstructiontypeid
34543,2061,0,97081,5.0,0,0
34543,2061,0,97083,4.0,0,0
26965,2061,0,97113,5.0,0,0
26965,2061,0,97113,5.0,0,0
34543,2061,0,97084,8.0,0,0
51239,2061,0,97089,8.0,0,0
51239,2061,0,97089,6.0,0,0
51239,2061,0,97089,8.0,0,0
51239,2061,0,97091,6.0,0,0
13150,2061,0,97104,6.0,0,0

unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,...
0,128,0,1986.0,2,...
0,198,0,1990.0,1,...
0,0,0,1956.0,1,...
0,0,0,1965.0,1,...
0,0,0,1984.0,2,...
0,0,0,1980.0,2,...
0,1045,0,1978.0,1,...
0,180,0,1971.0,2,...
0,304,0,1979.0,1,...
0,0,0,1950.0,1,...


In [34]:
# Inspect the final table, including the new abslogerror column, in GraphLab Canvas.
features_plus_error.show()

Canvas is accessible via web browser at the URL: http://localhost:52192/index.html
Opening Canvas in default web browser.
