# ETL House Rent Index - Join All and Create Ratios

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the four input tables and preview each one.
# Every file carries an 'Unnamed: 0' column (presumably the index written by
# an earlier to_csv call); it is dropped right after reading.

# Zillow rent index, one row per (zip, month).
zillow_melt_rent = pd.read_csv('zillow_rent_value_index_zip_month.csv').drop(columns=['Unnamed: 0'])
display(zillow_melt_rent.head())

# 311 noise-complaint counts per (zip, month), broken down by hour bucket and type.
complaints_counts = pd.read_csv('grouped_311_noise v02.csv').drop(columns=['Unnamed: 0'])
display(complaints_counts.head())

# Population per (zip, year).
zip_pop = pd.read_csv('population_zip_year.csv').drop(columns=['Unnamed: 0'])
display(zip_pop.head())

# IRS income totals per zip.
irs_income = pd.read_csv('irs_income_zip.csv').drop(columns=['Unnamed: 0'])
display(irs_income.head())

Unnamed: 0,zillow_zip,zillow_month_day,zillow_rent_value_index,zillow_month
0,10001,2015-03-31,3997.445534,2015-03
1,10001,2015-04-30,4027.333867,2015-04
2,10001,2015-05-31,4085.82208,2015-05
3,10001,2015-06-30,4131.364124,2015-06
4,10001,2015-07-31,4153.720059,2015-07


Unnamed: 0,incident_zip,month,year,borough,qty_complaints,0-6 hours,7-12 hours,13-18 hours,19-24 hours,21 Collection Truck Noise,...,Noise: Loud Music/Daytime (Mark Date And Time) (NN1),Noise: Loud Music/Nighttime(Mark Date And Time) (NP1),Noise: Manufacturing Noise (NK1),Noise: Other Noise Sources (Use Comments) (NZZ),Noise: Private Carting Noise (NQ1),Noise: Vehicle (NR2),Noise: air condition/ventilation equipment (NV1),Other,Passing By,People Created Noise
0,0,2018-04,2018,BROOKLYN,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,83,2013-08,2013,MANHATTAN,25,0,0,25,0,0,...,0,0,0,0,0,0,0,0,0,0
2,83,2013-09,2013,MANHATTAN,28,0,2,26,0,0,...,0,0,0,0,0,0,0,0,0,0
3,83,2013-10,2013,MANHATTAN,21,0,2,19,0,0,...,0,0,0,0,0,0,0,0,0,0
4,83,2013-11,2013,MANHATTAN,2,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,pop_zip_code,population,pop_year
0,601,18533.0,2011
1,602,41930.0,2011
2,603,54475.0,2011
3,606,6386.0,2011
4,610,29111.0,2011


Unnamed: 0,irs_zip_code,Total_income_A02650,Total_returns_N02650
0,0,139735700.0,2161240.0
1,35004,331183.0,5420.0
2,35005,139266.0,3440.0
3,35006,66755.0,1230.0
4,35007,776780.0,12600.0


In [3]:
# Derive the calendar year of each Zillow observation from its month-end date;
# the year is the key used later to look up that year's population.
zillow_melt_rent['zillow_year'] = pd.to_datetime(zillow_melt_rent['zillow_month_day']).dt.year

In [4]:
# Fill Williamsburg (ZIP 11249) missing population.
# The first available 11249 row is reused as a constant population estimate
# for every year in 2010-2020 that has no row of its own.
WILLIAMSBURG_ZIP = 11249
is_williamsburg = zip_pop['pop_zip_code'] == WILLIAMSBURG_ZIP
template_row = zip_pop[is_williamsburg].head(1)
known_years = set(zip_pop.loc[is_williamsburg, 'pop_year'])

# Only add years that are not present yet. This keeps the cell safe to re-run:
# duplicate (zip, year) keys would multiply rows in the population merge.
missing_years = [year for year in range(2010, 2021) if year not in known_years]

if not template_row.empty and missing_years:
    # Build all filler rows in one go (a fresh frame, so assigning the year
    # does not write into a slice of zip_pop) and append them with one concat.
    filler_rows = pd.concat([template_row] * len(missing_years), ignore_index=True)
    filler_rows['pop_year'] = missing_years
    zip_pop = pd.concat([zip_pop, filler_rows], ignore_index=True)

zip_pop[zip_pop['pop_zip_code'] == WILLIAMSBURG_ZIP]

Unnamed: 0,pop_zip_code,population,pop_year
334107,11249,42421.0,2021
401025,11249,42421.0,2022
434799,11249,42421.0,2023
465690,11249,42421.0,2010
465691,11249,42421.0,2011
465692,11249,42421.0,2012
465693,11249,42421.0,2013
465694,11249,42421.0,2014
465695,11249,42421.0,2015
465696,11249,42421.0,2016


In [5]:
# Build the joined table starting from the Zillow rent index. All joins are
# left joins, so every Zillow row is kept even when a source has no match:
#   1. 311 complaint counts  on (zip, month)
#   2. population            on (zip, year)
#   3. IRS income            on zip
# NOTE(review): none of the merges validates key uniqueness on the right-hand
# side; duplicated keys there would silently duplicate rent rows - confirm.
join_rent = (
    zillow_melt_rent
    .merge(
        complaints_counts,
        how='left',
        left_on=['zillow_zip', 'zillow_month'],
        right_on=['incident_zip', 'month'],
    )
    .merge(
        zip_pop,
        how='left',
        left_on=['zillow_zip', 'zillow_year'],
        right_on=['pop_zip_code', 'pop_year'],
    )
    .merge(
        irs_income,
        how='left',
        left_on='zillow_zip',
        right_on='irs_zip_code',
    )
    # IRS total income divided by that row's population.
    .assign(
        Total_income_per_capita=lambda joined: joined['Total_income_A02650'] / joined['population']
    )
)
print(join_rent.shape)
join_rent.head()

(13490, 58)


Unnamed: 0,zillow_zip,zillow_month_day,zillow_rent_value_index,zillow_month,zillow_year,incident_zip,month,year,borough,qty_complaints,...,Other,Passing By,People Created Noise,pop_zip_code,population,pop_year,irs_zip_code,Total_income_A02650,Total_returns_N02650,Total_income_per_capita
0,10001,2015-03-31,3997.445534,2015-03,2015,10001.0,2015-03,2015.0,MANHATTAN,213.0,...,0.0,0.0,0.0,10001,23537.0,2015,10001.0,2830868.0,15590.0,120.273102
1,10001,2015-04-30,4027.333867,2015-04,2015,10001.0,2015-04,2015.0,MANHATTAN,217.0,...,1.0,0.0,0.0,10001,23537.0,2015,10001.0,2830868.0,15590.0,120.273102
2,10001,2015-05-31,4085.82208,2015-05,2015,10001.0,2015-05,2015.0,MANHATTAN,233.0,...,1.0,0.0,0.0,10001,23537.0,2015,10001.0,2830868.0,15590.0,120.273102
3,10001,2015-06-30,4131.364124,2015-06,2015,10001.0,2015-06,2015.0,MANHATTAN,218.0,...,0.0,0.0,0.0,10001,23537.0,2015,10001.0,2830868.0,15590.0,120.273102
4,10001,2015-07-31,4153.720059,2015-07,2015,10001.0,2015-07,2015.0,MANHATTAN,168.0,...,0.0,0.0,0.0,10001,23537.0,2015,10001.0,2830868.0,15590.0,120.273102


In [6]:
# Complaint-count columns coming from the 311 table, listed in the order in
# which their per-1000-inhabitants ratio columns are appended.
complaint_count_cols = [
    'qty_complaints',
    '0-6 hours',
    '7-12 hours',
    '13-18 hours',
    '19-24 hours',
    '21 Collection Truck Noise',
    'Banging/Pounding',
    'Car/Truck Horn',
    'Car/Truck Music',
    'Engine Idling',
    'Flying Too Low',
    'Horn Honking Sign Requested (NR9)',
    'Hovering',
    'Loud Music/Party',
    'Loud Talking',
    'Loud Television',
    'NYPD',
    'News Gathering',
    'Noise',
    'Noise, Barking Dog (NR5)',
    'Noise, Ice Cream Truck (NR4)',
    'Noise, Other Animals (NR6)',
    'Noise:  lawn care equipment (NCL)',
    'Noise: Air Condition/Ventilation Equip, Commercial (NJ2)',
    'Noise: Air Condition/Ventilation Equip, Residential (NJ1)',
    'Noise: Alarms (NR3)',
    'Noise: Boat(Engine',
    'Noise: Boat(Engine,Music,Etc) (NR10)',
    'Noise: Construction Before/After Hours (NM1)',
    'Noise: Construction Equipment (NC1)',
    'Noise: Jack Hammering (NC2)',
    'Noise: Loud Music From Siebel System - For Dep Internal Use Only (NP21)',
    'Noise: Loud Music/Daytime (Mark Date And Time) (NN1)',
    'Noise: Loud Music/Nighttime(Mark Date And Time) (NP1)',
    'Noise: Manufacturing Noise (NK1)',
    'Noise: Other Noise Sources (Use Comments) (NZZ)',
    'Noise: Private Carting Noise (NQ1)',
    'Noise: Vehicle (NR2)',
    'Noise: air condition/ventilation equipment (NV1)',
    'Other',
    'Passing By',
    'People Created Noise',
]

# Keep only rows that actually have a rent index value. Reassigning (instead
# of inplace=True) plus an explicit copy gives an independent frame for the
# column assignments below.
join_rent = join_rent.dropna(subset=['zillow_rent_value_index']).copy()

# A (zip, month) without a matching 311 row got NaN from the left join, which
# means "no complaints". Zero-fill every count column, not only the total, so
# the per-type and per-hour ratios are 0 (not NaN) whenever the total is 0.
join_rent[complaint_count_cols] = join_rent[complaint_count_cols].fillna(0)

# Split rows into three equally sized income groups by income per capita.
join_rent['income_group'] = pd.qcut(
    join_rent['Total_income_per_capita'], q=3, labels=["low", "medium", "high"]
)

# Complaints per 1000 inhabitants for every count column. The overall count
# is published as 'comp_ratio_1000_total'; all others keep their source name.
for count_col in complaint_count_cols:
    ratio_suffix = 'total' if count_col == 'qty_complaints' else count_col
    join_rent['comp_ratio_1000_' + ratio_suffix] = (
        join_rent[count_col] / join_rent['population'] * 1000
    )

print(join_rent.shape)
join_rent.head()

(9062, 101)


Unnamed: 0,zillow_zip,zillow_month_day,zillow_rent_value_index,zillow_month,zillow_year,incident_zip,month,year,borough,qty_complaints,...,comp_ratio_1000_Noise: Loud Music/Daytime (Mark Date And Time) (NN1),comp_ratio_1000_Noise: Loud Music/Nighttime(Mark Date And Time) (NP1),comp_ratio_1000_Noise: Manufacturing Noise (NK1),comp_ratio_1000_Noise: Other Noise Sources (Use Comments) (NZZ),comp_ratio_1000_Noise: Private Carting Noise (NQ1),comp_ratio_1000_Noise: Vehicle (NR2),comp_ratio_1000_Noise: air condition/ventilation equipment (NV1),comp_ratio_1000_Other,comp_ratio_1000_Passing By,comp_ratio_1000_People Created Noise
0,10001,2015-03-31,3997.445534,2015-03,2015,10001.0,2015-03,2015.0,MANHATTAN,213.0,...,0.0,0.0,0.0,0.0,0.297404,0.0,0.0,0.0,0.0,0.0
1,10001,2015-04-30,4027.333867,2015-04,2015,10001.0,2015-04,2015.0,MANHATTAN,217.0,...,0.0,0.0,0.0,0.0,0.637294,0.0,0.127459,0.042486,0.0,0.0
2,10001,2015-05-31,4085.82208,2015-05,2015,10001.0,2015-05,2015.0,MANHATTAN,233.0,...,0.0,0.0,0.0,0.0,0.33989,0.0,0.212431,0.042486,0.0,0.0
3,10001,2015-06-30,4131.364124,2015-06,2015,10001.0,2015-06,2015.0,MANHATTAN,218.0,...,0.0,0.0,0.0,0.0,0.169945,0.0,0.254918,0.0,0.0,0.0
4,10001,2015-07-31,4153.720059,2015-07,2015,10001.0,2015-07,2015.0,MANHATTAN,168.0,...,0.0,0.0,0.0,0.0,0.042486,0.0,0.084973,0.0,0.0,0.0


In [7]:
# One-hot encode the ZIP code into zip_<code> indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 (as in the recorded
# output) on every pandas version; from pandas 2.0 the default is bool.
zip_dummies = pd.get_dummies(join_rent['zillow_zip'], prefix='zip', dtype='uint8')

# Remove indicator columns left over from a previous run of this cell first;
# otherwise join() raises because the column names overlap.
join_rent = (
    join_rent
    .drop(columns=zip_dummies.columns, errors='ignore')
    .join(zip_dummies)
)
print(join_rent.shape)
join_rent.head()

(9062, 243)


Unnamed: 0,zillow_zip,zillow_month_day,zillow_rent_value_index,zillow_month,zillow_year,incident_zip,month,year,borough,qty_complaints,...,zip_11375,zip_11377,zip_11378,zip_11385,zip_11415,zip_11417,zip_11418,zip_11420,zip_11432,zip_11435
0,10001,2015-03-31,3997.445534,2015-03,2015,10001.0,2015-03,2015.0,MANHATTAN,213.0,...,0,0,0,0,0,0,0,0,0,0
1,10001,2015-04-30,4027.333867,2015-04,2015,10001.0,2015-04,2015.0,MANHATTAN,217.0,...,0,0,0,0,0,0,0,0,0,0
2,10001,2015-05-31,4085.82208,2015-05,2015,10001.0,2015-05,2015.0,MANHATTAN,233.0,...,0,0,0,0,0,0,0,0,0,0
3,10001,2015-06-30,4131.364124,2015-06,2015,10001.0,2015-06,2015.0,MANHATTAN,218.0,...,0,0,0,0,0,0,0,0,0,0
4,10001,2015-07-31,4153.720059,2015-07,2015,10001.0,2015-07,2015.0,MANHATTAN,168.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Sanity check: show the rows whose total complaint ratio is NaN.
# The recorded output for this cell is an empty frame.
join_rent.loc[join_rent['comp_ratio_1000_total'].isna()]

Unnamed: 0,zillow_zip,zillow_month_day,zillow_rent_value_index,zillow_month,zillow_year,incident_zip,month,year,borough,qty_complaints,...,zip_11375,zip_11377,zip_11378,zip_11385,zip_11415,zip_11417,zip_11418,zip_11420,zip_11432,zip_11435


In [9]:
# Persist the joined dataset; the row index is not written to the file.
join_rent.to_csv(
    'join_rent_index_datasets.csv',
    index=False,
)