In [51]:
import pandas as pd
from pyproj import Proj
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn import cross_validation
from sklearn.metrics import mean_squared_error
import seaborn as sns
sns.set(color_codes=True)
import collections

%matplotlib inline

In [66]:
data_path = "data/merged/manhattan_brooklyn_2003_2016.csv"
df = pd.read_csv(data_path, low_memory = True, error_bad_lines=False)
df.head()

Unnamed: 0,zipcode,ltdheight,splitzone,easements,comarea,resarea,numbldgs,numfloors,unitsres,unitstotal,...,proxcode_1.0,proxcode_2.0,lottype_0.0,lottype_1.0,lottype_2.0,lottype_3.0,lottype_4.0,lottype_5.0,tax_class_at_time_of_sale_1,tax_class_at_time_of_sale_2
0,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10004.0,0.0,0.0,0.0,1888126.0,0.0,1.0,50.0,0.0,52.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.shape

(1682560, 137)

In [67]:
df[['gross_sqft_pluto','sale_price','price_per_sqft']].head()

Unnamed: 0,gross_sqft_pluto,sale_price,price_per_sqft
0,1888126.0,0,0.0
1,1888126.0,0,0.0
2,1888126.0,0,0.0
3,1888126.0,0,0.0
4,1888126.0,1,5.296257e-07


In [4]:
df.columns.values

array(['zipcode', 'ltdheight', 'splitzone', 'easements', 'comarea',
       'resarea', 'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
       'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth', 'irrlotcode',
       'bsmtcode', 'yearbuilt', 'builtcode', 'histdist', 'landmark',
       'condono', 'xcoord', 'ycoord', 'zonemap', 'latitude', 'longitude',
       'gross_sqft_pluto', 'garage', 'extension', 'countalter',
       'sale_price', 'sale_date', 'year_built', 'residential_units',
       'commercial_units', 'total_units', 'price_per_sqft',
       'schooldist_mv', 'council_mv', 'zipcode_mv', 'ownertype_mv',
       'numbldgs_mv', 'unitsres_mv', 'unitstotal_mv', 'lotfront_mv',
       'lotdepth_mv', 'bldgfront_mv', 'bldgdepth_mv', 'proxcode_mv',
       'yearbuilt_mv', 'xcoord_mv', 'ycoord_mv', 'zonemap_mv',
       'latitude_mv', 'longitude_mv', 'borough_BK', 'schooldist_1.0',
       'schooldist_2.0', 'schooldist_3.0', 'schooldist_4.0',
       'schooldist_5.0', 'schooldist_6.0', 'schooldist_1

In [65]:
#output distribution of target variable to visualize in Tableau
import csv
x = df['price_per_sqft']
count = collections.Counter(x.astype(int))
with open("price_per_sqft_counts.csv",'w') as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(['Price Per Sqft', 'Frequency'])
    for key, count in count.items():
        writer.writerow([key, count])

In [68]:
#output distribution of sale price to visualize in Tableau
import csv
x = df['sale_price']
count = collections.Counter(x.astype(int))
with open("sale_price_counts.csv",'w') as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(['Sale Price', 'Frequency'])
    for key, count in count.items():
        writer.writerow([key, count])

In [13]:
def drop_cols(data, cols):
    return data.drop(cols, axis = 1)

In [41]:
df = drop_cols(df, ['zonemap','sale_date','sale_price'])

In [22]:
def split_data(data):
    '''
    Splits data into training and test sets (0.8/0.2)
        Args: 
            data: Pandas dataframe
        Returns:
            data_train: Pandas dataframe used for training
            data_test: Pandas dataframe used for testing
    
    '''
    #Convert 'int64' into float; otherwise, sklearn throws a warning message
    columns = data.columns.values
    non_float = []
    for col in columns:
        if data[col].dtype != np.float64:
            non_float.append(col)
        for col in non_float:
            data[col] = data[col].astype(float)
    #drop NaN for crucial columns
    data= data.dropna(how = 'any', subset = ['latitude','longitude','price_per_sqft'])   
    #Split the data
    split = cross_validation.ShuffleSplit(data.shape[0], n_iter=1, train_size = 0.7, test_size=.3, random_state = 1)

    for train, test in split:
        train_index = train
        test_index = test
    data_train = data.ix[train_index,:]
    data_test = data.ix[test_index,:]
    data_train.reset_index(drop=True, inplace=True)
    data_test.reset_index(drop=True, inplace=True)
    return data_train, data_test

In [21]:
data_train, data_test = split_data(df)

In [23]:
def fill_na(data_train, data_test):
    '''
    Fills NaN values with the mean of the column. Note we have already created dummy variables
    for columns with missing values.
    
    Args:
        data_train: Pandas dataframe used for training.
        data_test: Pandas dataframe used for testing.
    Returns:
        data_train: Pandas dataframe with no NaN values, ready for modeling.
        data_test: Pandas dataframe with no NaN values, ready for testing.
    
    '''
    data_train = data_train.apply(lambda x: x.fillna(x.mean()),axis=0)
    data_test = data_test.apply(lambda x: x.fillna(x.mean()),axis=0)
    return data_train, data_test

In [24]:
data_train, data_test = fill_na(data_train, data_test)

In [26]:
print(data_train.shape, data_test.shape)


(1170909, 135) (501819, 135)


In [43]:
data_train = data_train.drop('sale_price', axis = 1)
data_test = data_test.drop('sale_price', axis = 1)

In [44]:
cols = list(data_train.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('price_per_sqft')) #Remove b from list
data_train = data_train[cols+['price_per_sqft']]
data_test = data_test[cols+['price_per_sqft']]
data_train.columns.values

array(['zipcode', 'ltdheight', 'splitzone', 'easements', 'comarea',
       'resarea', 'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
       'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth', 'irrlotcode',
       'bsmtcode', 'yearbuilt', 'builtcode', 'histdist', 'landmark',
       'condono', 'xcoord', 'ycoord', 'latitude', 'longitude',
       'gross_sqft_pluto', 'garage', 'extension', 'countalter',
       'year_built', 'residential_units', 'commercial_units',
       'total_units', 'schooldist_mv', 'council_mv', 'zipcode_mv',
       'ownertype_mv', 'numbldgs_mv', 'unitsres_mv', 'unitstotal_mv',
       'lotfront_mv', 'lotdepth_mv', 'bldgfront_mv', 'bldgdepth_mv',
       'proxcode_mv', 'yearbuilt_mv', 'xcoord_mv', 'ycoord_mv',
       'zonemap_mv', 'latitude_mv', 'longitude_mv', 'borough_BK',
       'schooldist_1.0', 'schooldist_2.0', 'schooldist_3.0',
       'schooldist_4.0', 'schooldist_5.0', 'schooldist_6.0',
       'schooldist_10.0', 'schooldist_13.0', 'schooldist_14.0',
       'scho

In [45]:
X_train = data_train.ix[:,:-1]
y_train = data_train.ix[:,-1]
X_test = data_test.ix[:,:-1]
y_test = data_test.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
mse = mean_squared_error(y_test, regr.predict(X_test))
print('Mean_squared_error', mse)

Mean_squared_error 4716.83862762


In [46]:
from sklearn.ensemble import RandomForestRegressor

RF_reg_final = RandomForestRegressor(n_estimators=100, n_jobs = -1)
RF_reg_final.fit(X_train, y_train)
print(mean_squared_error(y_test, RF_reg_final.predict(X_test)))

2673.50270801


In [48]:
feature_importance =  RF_reg_final.feature_importances_
indices = np.argsort(feature_importance)[::-1][:27]

feature_dct = {}
# Print the feature ranking
print("Feature ranking:")

for f in range(27):
    feature_dct[X_test.columns.values[indices][f]] = feature_importance[indices[f]]
feature_dct

Feature ranking:


{'bldgfront': 0.061238330581188175,
 'bldgfront_mv': 0.011983122235041999,
 'comarea': 0.05610111044608377,
 'commercial_units': 0.032096391392148767,
 'condono': 0.025413212692996838,
 'council_2.0': 0.018303308935461778,
 'council_4.0': 0.011591498534338216,
 'gross_sqft_pluto': 0.15144953070300904,
 'landuse_5.0': 0.0086171975496009169,
 'latitude': 0.021696832295874061,
 'longitude': 0.030056055912977615,
 'lotfront': 0.021939097752951373,
 'lottype_0.0': 0.012767928347856282,
 'lottype_3.0': 0.0086644810771093379,
 'numfloors': 0.030588722736663469,
 'resarea': 0.057642531418071859,
 'residential_units': 0.012141966648173012,
 'schooldist_2.0': 0.025565389004474616,
 'tax_class_at_time_of_sale_2': 0.027117159125555415,
 'unitsres': 0.0088046829339896264,
 'unitstotal': 0.17928879867873204,
 'unitstotal_mv': 0.0091588576097387346,
 'xcoord': 0.019190364209809813,
 'ycoord': 0.020615368256656358,
 'year_built': 0.028327133646944391,
 'yearbuilt': 0.024066312879649887,
 'zipcode': 0.