In [20]:
import pandas as pd
from pyproj import Proj
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
import seaborn as sns
sns.set(color_codes=True)

%matplotlib inline

In [2]:
data_path = "data/merged/manhattan_brooklyn_2003_2016.csv"
df = pd.read_csv(data_path, low_memory = True, error_bad_lines=False)
df.head()

Unnamed: 0,zipcode,ltdheight,splitzone,easements,comarea,resarea,numbldgs,numfloors,unitsres,unitstotal,...,proxcode_1.0,proxcode_2.0,lottype_0.0,lottype_1.0,lottype_2.0,lottype_3.0,lottype_4.0,lottype_5.0,tax_class_at_time_of_sale_1,tax_class_at_time_of_sale_2
0,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.shape

(1682560, 137)

In [4]:
df.columns.values

array(['zipcode', 'ltdheight', 'splitzone', 'easements', 'comarea',
       'resarea', 'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
       'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth', 'irrlotcode',
       'bsmtcode', 'yearbuilt', 'builtcode', 'histdist', 'landmark',
       'condono', 'xcoord', 'ycoord', 'zonemap', 'latitude', 'longitude',
       'gross_sqft_pluto', 'garage', 'extension', 'countalter',
       'sale_price', 'sale_date', 'year_built', 'residential_units',
       'commercial_units', 'total_units', 'price_per_sqft',
       'schooldist_mv', 'council_mv', 'zipcode_mv', 'ownertype_mv',
       'numbldgs_mv', 'unitsres_mv', 'unitstotal_mv', 'lotfront_mv',
       'lotdepth_mv', 'bldgfront_mv', 'bldgdepth_mv', 'proxcode_mv',
       'yearbuilt_mv', 'xcoord_mv', 'ycoord_mv', 'zonemap_mv',
       'latitude_mv', 'longitude_mv', 'borough_BK', 'schooldist_1.0',
       'schooldist_2.0', 'schooldist_3.0', 'schooldist_4.0',
       'schooldist_5.0', 'schooldist_6.0', 'schooldist_1

In [16]:
df.isnull().sum()

zipcode                            484
ltdheight                            0
splitzone                            0
easements                            0
comarea                              0
resarea                              0
numbldgs                         45424
numfloors                            0
unitsres                        754808
unitstotal                      898264
lotfront                       1003012
lotdepth                        354380
bldgfront                       532432
bldgdepth                       153408
irrlotcode                           0
bsmtcode                             0
yearbuilt                          132
builtcode                            0
histdist                             0
landmark                             0
condono                              0
xcoord                            9832
ycoord                            9832
zonemap                             84
latitude                          9832
longitude                

In [9]:
df['zonemap'].unique()

array(['12b', '12a', '12d', '12c', '8b', '8d', '8c', '5d', '9b', '9a',
       '6b', '6a', '5c', '3b', '3a', '1b', '16c', '16a', '16d', '16b',
       '22a', '22c', '17a', '17b', '17c', '13b', '13a', '13d', '17d',
       '18b', '23a', '22d', nan, '22b', '28c', '28a', '23b', '28b', '28d',
       '29a', '23c', '23d', '29b'], dtype=object)

In [13]:
def drop_cols(data, cols):
    return data.drop(cols, axis = 1)

In [18]:
#df = drop_cols(df, ['zonemap'])
df = drop_cols(df, ['sale_date'])

In [22]:
def split_data(data):
    '''
    Splits data into training and test sets (0.8/0.2)
        Args: 
            data: Pandas dataframe
        Returns:
            data_train: Pandas dataframe used for training
            data_test: Pandas dataframe used for testing
    
    '''
    #Convert 'int64' into float; otherwise, sklearn throws a warning message
    columns = data.columns.values
    non_float = []
    for col in columns:
        if data[col].dtype != np.float64:
            non_float.append(col)
        for col in non_float:
            data[col] = data[col].astype(float)
    #drop NaN for crucial columns
    data= data.dropna(how = 'any', subset = ['latitude','longitude','price_per_sqft'])   
    #Split the data
    split = cross_validation.ShuffleSplit(data.shape[0], n_iter=1, train_size = 0.7, test_size=.3, random_state = 1)

    for train, test in split:
        train_index = train
        test_index = test
    data_train = data.ix[train_index,:]
    data_test = data.ix[test_index,:]
    data_train.reset_index(drop=True, inplace=True)
    data_test.reset_index(drop=True, inplace=True)
    return data_train, data_test

In [21]:
data_train, data_test = split_data(df)

In [23]:
def fill_na(data_train, data_test):
    '''
    Fills NaN values with the mean of the column. Note we have already created dummy variables
    for columns with missing values.
    
    Args:
        data_train: Pandas dataframe used for training.
        data_test: Pandas dataframe used for testing.
    Returns:
        data_train: Pandas dataframe with no NaN values, ready for modeling.
        data_test: Pandas dataframe with no NaN values, ready for testing.
    
    '''
    data_train = data_train.apply(lambda x: x.fillna(x.mean()),axis=0)
    data_test = data_test.apply(lambda x: x.fillna(x.mean()),axis=0)
    return data_train, data_test

In [24]:
data_train, data_test = fill_na(data_train, data_test)

In [26]:
print(data_train.shape, data_test.shape)


(1170909, 135) (501819, 135)


In [28]:
data_test.isnull().sum()

zipcode                        0
ltdheight                      0
splitzone                      0
easements                      0
comarea                        0
resarea                        0
numbldgs                       0
numfloors                      0
unitsres                       0
unitstotal                     0
lotfront                       0
lotdepth                       0
bldgfront                      0
bldgdepth                      0
irrlotcode                     0
bsmtcode                       0
yearbuilt                      0
builtcode                      0
histdist                       0
landmark                       0
condono                        0
xcoord                         0
ycoord                         0
latitude                       0
longitude                      0
gross_sqft_pluto               0
garage                         0
extension                      0
countalter                     0
sale_price                     0
          

In [29]:
cols = list(data_train.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('price_per_sqft')) #Remove b from list
data_train = data_train[cols+['price_per_sqft']]
data_test = data_test[cols+['price_per_sqft']]
data_train.columns.values

array(['zipcode', 'ltdheight', 'splitzone', 'easements', 'comarea',
       'resarea', 'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
       'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth', 'irrlotcode',
       'bsmtcode', 'yearbuilt', 'builtcode', 'histdist', 'landmark',
       'condono', 'xcoord', 'ycoord', 'latitude', 'longitude',
       'gross_sqft_pluto', 'garage', 'extension', 'countalter',
       'sale_price', 'year_built', 'residential_units', 'commercial_units',
       'total_units', 'schooldist_mv', 'council_mv', 'zipcode_mv',
       'ownertype_mv', 'numbldgs_mv', 'unitsres_mv', 'unitstotal_mv',
       'lotfront_mv', 'lotdepth_mv', 'bldgfront_mv', 'bldgdepth_mv',
       'proxcode_mv', 'yearbuilt_mv', 'xcoord_mv', 'ycoord_mv',
       'zonemap_mv', 'latitude_mv', 'longitude_mv', 'borough_BK',
       'schooldist_1.0', 'schooldist_2.0', 'schooldist_3.0',
       'schooldist_4.0', 'schooldist_5.0', 'schooldist_6.0',
       'schooldist_10.0', 'schooldist_13.0', 'schooldist_14.0'

In [35]:
X_train = data_train.ix[:,:-1]
y_train = data_train.ix[:,-1]
X_test = data_test.ix[:,:-1]
y_test = data_test.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
mse = mean_squared_error(y_test, regr.predict(X_test))
print('Mean_squared_error', mse)

Mean_squared_error 4449.71277035


In [None]:
from sklearn.ensemble import RandomForestRegressor

RF_reg_final = RandomForestRegressor(n_estimators=100, n_jobs = -1)
RF_reg_final.fit(X_train, y_train)
print(mean_squared_error(y_test, RF_reg_final.predict(X_test)))

In [36]:
feature_importance =  reg_final.feature_importances_
indices = np.argsort(feature_importance)[::-1][:27]

feature_dct = {}
# Print the feature ranking
print("Feature ranking:")

for f in range(27):
    feature_dct[data_test.ix[:,1:].columns.values[indices][f]] = feature_importance[indices[f]]
feature_dct

0          11.970041
1           1.681872
2           4.141328
3           4.192075
4          19.074175
5          28.484198
6          14.076897
7           1.914818
8           1.960947
9           1.302211
10          4.734991
11          0.000000
12          0.000000
13          3.629585
14          6.021634
15        119.642570
16          0.000000
17          0.878712
18          0.000000
19          3.796204
20         10.974093
21          1.270154
22          1.937691
23         13.008655
24          0.000000
25          0.000000
26          4.136718
27          0.000000
28          3.070437
29          0.000000
             ...    
501789      0.083408
501790      4.929431
501791     68.822373
501792      0.000000
501793     12.813184
501794      0.000573
501795     17.734468
501796      0.000573
501797      7.218324
501798      1.853343
501799      0.000000
501800     16.702609
501801      3.190973
501802      5.057638
501803      0.103581
501804      0.000000
501805      0