In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Number of units built in each census tract

In [2]:
total_units = pd.read_csv("./data/bayarea_total_units_JG.csv")
total_units.head(5)

Unnamed: 0,FIPS13,YearBuilt,TotalUnits
0,60013522021051,2001,1
1,60013522021051,2009,1
2,60013530012030,2000,1
3,60013530012041,2004,1
4,60013530012041,2009,1


In [3]:
total_units = total_units.rename(columns={'FIPS13': 'FIPS13', 'YearBuilt': 'year', 'TotalUnits': 'total_units'})

In [4]:
geoid_10 = total_units['FIPS13'].astype(str)
geoid_10 = geoid_10.str.slice(stop=10)
geoid_10.head(5)

0    6001352202
1    6001352202
2    6001353001
3    6001353001
4    6001353001
Name: FIPS13, dtype: object

In [5]:
total_units_geoid = total_units.drop(columns=['FIPS13'])
total_units_geoid['GEO_ID_10'] = geoid_10
total_units_geoid.head(5)

Unnamed: 0,year,total_units,GEO_ID_10
0,2001,1,6001352202
1,2009,1,6001352202
2,2000,1,6001353001
3,2004,1,6001353001
4,2009,1,6001353001


In [6]:
total_units_geoid['GEO_ID_10'] = total_units_geoid['GEO_ID_10'].str.zfill(11)
total_units_geoid.head(5)

Unnamed: 0,year,total_units,GEO_ID_10
0,2001,1,6001352202
1,2009,1,6001352202
2,2000,1,6001353001
3,2004,1,6001353001
4,2009,1,6001353001


Changed county for census_tract 35511 to 013 (Contra Costa County).

In [7]:
total_units_geoid.iloc[7, 2] = '0601335511'

In [8]:
total_units_by_tract_by_year = total_units_geoid.groupby(
        by=['GEO_ID_10','year'], 
        axis=0, 
        as_index=False) \
    .sum()
total_units_by_tract_by_year.head(5)

Unnamed: 0,GEO_ID_10,year,total_units
0,6001352202,2001,1
1,6001352202,2009,1
2,6001353001,2000,1
3,6001353001,2004,1
4,6001353001,2009,1


In [9]:
total_units_2010_2017 = total_units_by_tract_by_year[
                            (total_units_by_tract_by_year['year'] <= 2017) &
                            (total_units_by_tract_by_year['year'] >= 2010)]
total_units_2010_2017.head(5)

Unnamed: 0,GEO_ID_10,year,total_units
17,6001400100,2010,9
18,6001400100,2011,3
19,6001400100,2012,1
20,6001400100,2013,1
21,6001400100,2014,1


# Converting tracts to zip codes

In [10]:
tract_to_zip_list = []

for i in range(0, 8):
    year = f'201{i}'
    df = pd.read_excel(f'./data/TRACT_ZIP_12{year}.xlsx')
    df.columns = df.columns.str.lower()
    df['tract'] = df['tract'].astype(str).str.zfill(11)
    df['year'] = int(year)
    df = df.drop(columns=['bus_ratio', 'oth_ratio', 'tot_ratio'])
    df = df.rename(columns={'res_ratio': 'resident'})
    tract_to_zip_list.append(df)

In [11]:
tract_to_zip = pd.concat(tract_to_zip_list, axis=0)
tract_to_zip.head(5)

Unnamed: 0,tract,zip,resident,year
0,1001020100,36067,1.0,2010
1,1001020200,36008,0.027225,2010
2,1001020200,36067,0.972775,2010
3,1001020300,36067,1.0,2010
4,1001020400,36066,0.962221,2010


In [12]:
total_units_per_tract = pd.merge(total_units_2010_2019, 
                               tract_to_zip, 
                               left_on=['GEO_ID_10', 'year'], 
                               right_on=['tract', 'year'])
total_units_per_tract.head(5)

NameError: name 'total_units_2010_2019' is not defined

In [None]:
total_units_per_tract['units_built'] = total_units_per_tract['total_units'] * total_units_per_tract['resident']
total_units_per_tract.head(5)

In [None]:
total_units_per_tract[total_units_per_tract['year'] == 2019].shape

In [None]:
total_units_per_zip = total_units_per_tract[['year', 'zip', 'units_built']]
total_units_per_zip.head(5)

In [None]:
total_units_by_zip = total_units_per_zip.groupby(by=['zip', 'year'], axis=0, as_index=False).sum()
total_units_by_zip.head(5)

In [None]:
total_units_by_zip['units_built'] = total_units_by_zip['units_built'].apply(np.round).astype(int)
total_units_by_zip.head(5)

# Housing Prices

In [None]:
zillow = pd.read_csv('./data/zillow.csv')

In [None]:
zillow.head(5)

In [None]:
zillow = zillow.drop(columns=['RegionID', 
                              'SizeRank', 
                              'RegionType', 
                              'StateName', 
                              'State', 
                              'City', 
                              'Metro', 
                              'CountyName'])
zillow.head(5)

In [None]:
zillow_q4 = zillow[[zillow.columns[0]] + [col for col in zillow.columns if '-12-' in col]]
zillow_q4.head(5)

In [None]:
zillow_2010_2019 = zillow_q4[[zillow.columns[0]] + 
                             [col for col in zillow_q4.columns if '2010' <= col[:4] < '2020']]
zillow_2010_2019.head(5)

In [None]:
zillow_years = []

for i in range(10):
    year = f'201{i}'
    df = zillow_2010_2019[['RegionName', f'{year}-12-31']]
    df = df.rename(columns={'RegionName': 'zip', f'{year}-12-31': 'price'})
    df['year'] = int(year)
    zillow_years.append(df)

zillow_by_year = pd.concat(zillow_years, axis=0)
zillow_by_year.head(5)

# Housing Prices with Housing Units built

In [None]:
data = pd.merge(total_units_by_zip, zillow_by_year, left_on=['zip', 'year'], right_on=['zip', 'year'])
data

In [None]:
data_2010 = data[data['year'] == 2010]
data_2010.head(5)

In [None]:
data_2010.to_csv('./data-out/data_2010.csv')

# Housing Units Per Year

In [None]:
data[['year', 'units_built']].groupby(by=['year']).sum()

In [None]:
data[data['year'] == 2018].shape

In [None]:
data[data['year'] == 2010].shape