In [1]:
import pandas as pd
from pyproj import Proj
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from argparse import ArgumentParser
from final_data_clean import *
from merge_pluto_finance_new import *
from predict_price_increase import *

import seaborn as sns
sns.set(color_codes=True)
import warnings

In [2]:
def create_target_var(data, target_name):
    '''
    Separates X and y variables for data.

    Every column is cast to float, infinite values are replaced by NaN, and
    rows whose target value is missing (NaN, or infinite before the
    replacement) are dropped. NaN values in the feature columns are kept.
    The frame passed in is not modified.

    Args:
        data: Pandas dataframe containing all data. Every column must be
            convertible to float.
        target_name: column name of target variable
    Returns:
        X: Pandas dataframe with every column except the target.
        y: Target variable for X as Pandas series.
    Raises:
        KeyError: if target_name is not a column of data.
    '''
    # Convert int64 to float64 so that NaN can be stored in every column
    numeric = data.astype(float)
    # Treat +/-inf as missing. np.inf is used because the np.Inf alias
    # was removed in NumPy 2.0.
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    # Drop NaN for the crucial column only (the target)
    numeric = numeric.dropna(how='any', subset=[target_name])
    # Split data into X and y
    X = numeric.drop(columns=[target_name])
    y = numeric.loc[:, target_name]
    return X, y

In [3]:
def get_data_for_model(
        data_path='bronx_brooklyn_manhattan_queens_statenisland_2003_2016.csv'):
    '''
    Reads the modelling data from a CSV file.
    Args:
        data_path: path of the CSV file to read.
    Returns:
        Whatever drop_cols returns for the loaded frame and the column list
        ['sale_date', 'sale_price'] (presumably the frame without those
        columns -- drop_cols is defined outside this notebook, confirm there).
    '''
    loaded = pd.read_csv(data_path, low_memory=True)
    # sale_date / sale_price are treated as not needed or redundant
    unwanted = ['sale_date', 'sale_price']
    return drop_cols(loaded, unwanted)

In [4]:
def fit_RF(X_train, X_test, y_train, y_test, random_state=None):
    '''
    Fits a 100-tree random forest regressor and prints its test-set scores.

    Two scores are printed: the mean squared error on the test set, and the
    percentage of test predictions whose absolute error is below 10% of the
    magnitude of the true value.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training target values.
        y_test: test target values. A target of zero yields an infinite or
            NaN relative error and is never counted as accurate.
        random_state: optional seed forwarded to RandomForestRegressor so
            that a run can be reproduced. The default (None) leaves the
            forest unseeded.
    Returns:
        The fitted RandomForestRegressor.
    '''
    RF_reg_final = RandomForestRegressor(n_estimators=100, n_jobs=-1,
                                         random_state=random_state)
    RF_reg_final.fit(X_train, y_train)
    # Predict once and reuse the result for both scores
    predicted = RF_reg_final.predict(X_test)
    # Relative error in percent. Dividing by |y| keeps the ratio non-negative,
    # so a negative target cannot slip under the 10% threshold by sign alone.
    percent_diff = 100 * (np.abs(predicted - y_test).astype(float) / np.abs(y_test))
    within_10 = percent_diff < 10.
    acc = 100 * (np.count_nonzero(within_10) / len(within_10))
    print('Mean squared error for Random Forest model: ', mean_squared_error(y_test, predicted))
    print('\nAccuracy (within 10% of true value): ', acc)
    return RF_reg_final

In [5]:
# Load the merged Queens sales file (2003-2016, going by the file name).
data = get_data_for_model("data/merged/queens_2003_2016.csv")
# Split off 'price_per_sqft' as the regression target.
X, y = create_target_var(data, 'price_per_sqft')
# split_data, fill_na and normalize are not defined in the visible part of
# this notebook; they presumably come from one of the star imports at the
# top -- TODO confirm which module, and what split ratio / imputation /
# scaling they apply.
X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_test = fill_na(X_train, X_test)
X_train, X_test = normalize(X_train, X_test)
# Fit the random forest and print its test-set scores; `random_forest` is
# read again below for the feature importances.
random_forest = fit_RF(X_train, X_test, y_train, y_test)



Mean squared error for Random Forest model:  17127.1274738

Accuracy (within 10% of true value):  27.3122133463


In [6]:
# How many of the most important features to keep in the ranking.
N_TOP_FEATURES = 27

feature_importance = random_forest.feature_importances_
# Positions of the features ordered by decreasing importance, cut to the top N
# (fewer if the model has fewer than N features).
indices = np.argsort(feature_importance)[::-1][:N_TOP_FEATURES]

# Take the names from X, the frame the model inputs were derived from, rather
# than from data.ix[:, 1:]: `.ix` was removed in pandas 1.0, and skipping the
# first column of `data` only lines up with the importances when the target
# happens to be that first column.
# NOTE(review): assumes split_data / fill_na / normalize keep the columns of X
# in the same order -- confirm against those helpers.
feature_names = X.columns.values
assert len(feature_names) == len(feature_importance), \
    'feature names and importances are misaligned'

# Print the feature ranking
print("Feature ranking:")

# Map feature name -> importance, inserted from most to least important.
feature_dct = {feature_names[i]: feature_importance[i] for i in indices}
feature_dct

Feature ranking:


{'bbl': 0.010076258581288021,
 'bldgclass_G': 0.017520486379875754,
 'bldgclass_N': 0.014502305336699321,
 'bldgclass_S': 0.0048488976423289431,
 'bldgclass_V': 0.0049907664721226992,
 'bldgdepth': 0.17060307888329804,
 'bsmtcode': 0.11907628659200706,
 'builtcode': 0.0078152940275057169,
 'comarea': 0.006845862332881111,
 'easements': 0.0069429897495633134,
 'extension': 0.010594219998793085,
 'histdist': 0.0065445577770142215,
 'irrlotcode': 0.14309928652490603,
 'landmark': 0.0088649464793439433,
 'latitude': 0.010757607393473421,
 'longitude': 0.01310177063661945,
 'lotfront': 0.0069933263118431947,
 'numbldgs': 0.095527006659086963,
 'numfloors': 0.019020032852378046,
 'out_of_school_youth_centers_dist_mv': 0.0049046382057501194,
 'resarea': 0.024536905878914488,
 'schooldist_25': 0.0095026853328420025,
 'schooldist_29': 0.0050866929096178093,
 'summer_youth_employment_centers_dist_mv': 0.0081286787768381479,
 'unitsres': 0.0080964346795241726,
 'unitstotal': 0.0047211113621057899

In [7]:
from collections import OrderedDict
from operator import itemgetter

# Rank the (name, importance) pairs from most to least important and keep
# that order in an OrderedDict.
ranked_pairs = sorted(feature_dct.items(), key=itemgetter(1), reverse=True)
feature_dct = OrderedDict(ranked_pairs)

In [None]:
# Start from two independent copies of the importance ranking: one will keep
# only the negatively correlated features, the other only the remaining ones.
neg_features = feature_dct.copy()
pos_features = feature_dct.copy()

# Use correlation matrix to determine which features are negatively correlated
# with our target variable.
# NOTE(review): none of the names below appear among the feature names shown
# in the ranking output above, so with that ranking every value in
# neg_features ends up 0. The list looks like it belongs to a different
# dataset -- confirm and replace it with names taken from this data.
negs = ['date','sentyr','district_74','monsex','state_CA','crimetype_immigration',
       'crime_9.0','crime_14.0','newcit','crimetype_drug - trafficking','district_70',
       'MLB_Allowed_GameNightBefore','MLB_Scored_GameNightBefore','neweduc_5',
       'state_AZ']
# Flip the sign of the importance for features listed in negs and zero out
# every other feature. Only values are reassigned, so iterating over keys()
# while updating is safe.
for key in neg_features.keys():
    if key in negs:
        neg_features[key] = -neg_features[key]
    else:
        neg_features[key] = 0
for key in pos_features.keys():
    if key in negs:
        pos_features[key] = 0