In [93]:

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# import statsmodels.api as sm
%matplotlib inline 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn import metrics

from sklearn.model_selection import train_test_split 

from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier

import requests
import kdtree as KDTree
import pysal

In [94]:
df = pd.read_csv('condo_res_join_v1.csv')


In [95]:
# First, look for null values so we can dropna
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

CM_ID          67187
ST_NAME_SUF      380
UNIT_NUM       67156
LAND_SF           80
YR_BUILT         227
YR_REMOD       14489
GROSS_AREA       115
LIVING_AREA      115
NUM_FLOORS       399
TOTAL_RMS       5299
BDRMS           5299
FULL_BTH        5299
HALF_BTH        5299
BTH_STYLE       5299
BTH_STYLE2     51702
BTH_STYLE3     97579
KITCH_STYLE     5299
HEAT_TYP        5299
AC              5299
FPLACE          5299
INT_CND         5299
INT_FIN         5299
VIEW            5299
dtype: int64

In [96]:
# Let's dropna from select colums
print('cleaned.shape before dropping na:' + str(df.shape))
na_to_drop = ['ZIPCODE', 'LAND_SF', 'YR_BUILT']
df = df.dropna(subset=na_to_drop)
print('cleaned.shape after dropping na:' + str(df.shape))

cleaned.shape before dropping na:(131049, 37)
cleaned.shape after dropping na:(130775, 37)


In [97]:
# We need to replace 'na' values with '0' for CM_ID in order to convert that feature from float to int
df['CM_ID'].fillna('0',inplace=True)

In [98]:
# Convert floats to ints
features_to_convert_to_int = ['ZIPCODE', 'CM_ID', 'YR_BUILT']
df[features_to_convert_to_int] = df[features_to_convert_to_int].astype(int)

In [101]:
df_slice = df.iloc[:3000]
df_slice.shape
df_small_slice = df.iloc[:100]
df_small_slice.shape

(100, 37)

In [102]:
for index, row in df_small_slice.iterrows():
    ST_NUM = row.ST_NUM
    ST_NAME = row.ST_NAME.replace(" ", "+")
    ST_NAME_SUF = row.ST_NAME_SUF
    ZIPCODE = row.ZIPCODE
    print(index)
    print("ST_NUM: {}\nST_NAME: {}\nST_NAME_SUF: {}\nZIPCODE{}\n\n".format(row.ST_NUM, row.ST_NAME.replace(" ", "+"), row.ST_NAME_SUF, row.ZIPCODE))
    
    try:
        request = 'https://maps.googleapis.com/maps/api/geocode/json?address={0}+{1}+{2},+MA+{3}&key=AIzaSyBL2Hk9soguFZ18kAvVcWK20E4Q4sZm3-w'.format(ST_NUM, ST_NAME, ST_NAME_SUF, ZIPCODE)
        response = requests.get(request)
        resp_json_payload = response.json()
        
        lat = df_small_slice.loc[index,'lat'] = resp_json_payload['results'][0]['geometry']['location']['lat']
        lng = df_small_slice.loc[index,'lng'] = resp_json_payload['results'][0]['geometry']['location']['lng']

    except IndexError:
        pass


0
ST_NUM: 2
ST_NAME: BEAVER
ST_NAME_SUF: ST
ZIPCODE2108


1
ST_NUM: 104 A 104
ST_NAME: PUTNAM
ST_NAME_SUF: ST
ZIPCODE2128


2
ST_NUM: 197
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


3
ST_NUM: 199
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


4
ST_NUM: 201
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


5
ST_NUM: 203
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


6
ST_NUM: 205 207
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


7
ST_NUM: 209 211
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


8
ST_NUM: 213
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


9
ST_NUM: 215
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


10
ST_NUM: 217
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


11
ST_NUM: 219
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


12
ST_NUM: 221
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


13
ST_NUM: 223
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


14
ST_NUM: 225
ST_NAME: LEXINGTON
ST_NAME_SUF: ST
ZIPCODE2128


15
ST_NUM: 227
ST_NAME: LEXINGTON
ST_NAME_SU

In [103]:
df_small_slice['Coordinates'] = list(zip(df_small_slice.lat, df_small_slice.lng))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [104]:
#Write a formula to find the PPSF of the k nearest neighbors of a given property
def find_neighbors_point(point,dataframe,num_neighbors):
  if len(dataframe)<1:
      PPSFs = []
      for neighbor in range(num_neighbors):
          PPSFs.append(np.nan)
  else:
      coordinates = list(dataframe['Coordinates'])
      tree = KDTree(coordinates, distance_metric='Arc', radius=pysal.lib.cg.RADIUS_EARTH_KM)
      dists,neibs = tree.query((point), k=num_neighbors)
      PPSFs = []
      for neighbor in neibs:
          if np.isnan(neighbor):
              PPSF_value = np.nan
          else:
              row = dataframe.iloc[[neighbor]]
              PPSF_value = row['PPSF']
              PPSF_value = PPSF_value.values[0]
          PPSFs.append(PPSF_value)
  neighbor_values = pd.DataFrame({
      'PPSFs': PPSFs})
  neighbor_values = neighbor_values.transpose()
  return neighbor_values, dists

#Write a formula to find the k-nearest neighbors' PPSF and dist for each point in an entire dataframe
def find_neighbors_df(df_newer,df_older,num_neighbors=50):
  neighbor_PPSFs = []
  neighbor_dists = []
  for i in list(df_newer['Coordinates']):
      neighbors,dists = find_neighbors_point(i,df_older,num_neighbors)
      neighbor_PPSFs.append(neighbors)
      neighbor_dists.append(dists)
  added_columns = list(range(0, num_neighbors))
  added_columns_PPSF = [str(i)+'_PPSF' for i in added_columns]
  added_columns_dist = [str(i)+'_Dist' for i in added_columns]
  neighbor_PPSF_df = pd.concat(neighbor_PPSFs)
  neighbor_PPSF_df.columns = added_columns_PPSF
  neighbor_PPSF_df = neighbor_PPSF_df.reset_index()
  neighbor_dist_df = pd.DataFrame(neighbor_dists)
  neighbor_dist_df.columns = added_columns_dist
  neighbor_dist_df = neighbor_dist_df.reset_index()
  df_newer = df_newer.reset_index()
  df_newer = pd.concat([df_newer,neighbor_PPSF_df,neighbor_dist_df],axis=1)
  return df_newer

In [106]:
df_small_slice.tail()

Unnamed: 0.1,Unnamed: 0,PID,CM_ID,GIS_ID,ST_NUM,ST_NAME,ST_NAME_SUF,UNIT_NUM,ZIPCODE,PTYPE,...,HEAT_TYP,AC,FPLACE,INT_CND,INT_FIN,VIEW,PU,lat,lng,Coordinates
95,185,100126000,0,100126000,462,SARATOGA,ST,,2128,105,...,W,N,0.0,A,N,A,RES,42.379887,-71.027931,"(42.3798873, -71.0279315)"
96,186,100127000,0,100127000,460,SARATOGA,ST,,2128,105,...,S,N,0.0,A,N,A,RES,42.379866,-71.028024,"(42.3798661, -71.02802439999999)"
97,187,100128000,0,100128000,458,SARATOGA,ST,,2128,105,...,W,N,0.0,A,N,A,RES,42.379845,-71.028117,"(42.37984489999999, -71.0281172)"
98,188,100129000,0,100129000,456,SARATOGA,ST,,2128,105,...,W,N,0.0,A,N,A,RES,42.379869,-71.028256,"(42.3798688, -71.0282565)"
99,189,100130000,0,100130000,454,SARATOGA,ST,,2128,105,...,F,N,0.0,A,N,A,RES,42.379759,-71.028373,"(42.3797588, -71.0283726)"
