In [1]:
import pandas as pd
import geopandas as gpd
from pathlib import Path  
import folium
import branca
import json
import os
import stat

In [2]:
# 2023 DATA IMPORTS

# Function to get the absolute path and update file permissions
def get_absolute_path_and_update_permissions(relative_path):
    current_working_directory = os.getcwd()
    absolute_path = os.path.join(current_working_directory, relative_path)
    absolute_path = os.path.abspath(absolute_path)

    # Check if the file exists
    if not os.path.isfile(absolute_path):
        raise FileNotFoundError(f"File not found: {absolute_path}")

    # Change file permissions to make it readable
    os.chmod(absolute_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

    return absolute_path

# Get absolute paths and update permissions for both files
sales_file_path = get_absolute_path_and_update_permissions('../data/sales/zip_code_market_tracker.tsv000')
rentals_file_path = get_absolute_path_and_update_permissions('../data/rental/Zip_zori_sm_month.csv')

# Read the files using the absolute paths
sales = pd.read_csv(sales_file_path, sep='\t', header=0)
rentals = pd.read_csv(rentals_file_path, sep=',', header=0, converters={'RegionName': lambda x: x.zfill(5)})

In [3]:
# SALES DATA COLLECTION & CLEANING

# Take the data from just 2022
salesCleanedZip = sales[sales["period_begin"].str.contains("2022")]

# Clean up the zips
salesCleanedZip['region'] = sales['region'].str.extract('(\d+)')

# Simplify the dataframe, isolating the 'region' and 'median_sale_price'
# salesSimplified = salesCleanedZip[['region', 'median_sale_price']]
salesSimplified = salesCleanedZip.filter(items=['region','median_sale_price'])

# Isolate the 'region' and 'median_sale_price', then groups, and takes the mean of the zips
# salesByZip = salesSimplified.groupby(['region']).mean()
salesByZip = salesSimplified.groupby(['region']).median()

# Reset the index
# We might not need this if we use the .filter() dot-extension above.
salesByZip = salesByZip.reset_index()

# Rename the column 'region' to 'RegionName'
salesByZip = salesByZip.rename(columns={'region':'RegionName'})

# Rename the column 'median_sale_price' to 'CurrentSalesPrice'
salesByZip = salesByZip.rename(columns={'median_sale_price':'CurrentSalesPrice'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salesCleanedZip['region'] = sales['region'].str.extract('(\d+)')


In [4]:
# RENTAL DATA COLLECTION & CLEANING
t1 = rentals[['RegionName']].join(rentals.filter(regex='2022'))
t2 = t1.melt(id_vars='RegionName', var_name='Date', value_name='CurrentRentalPrice')
currentRentalPrices = t2.groupby('RegionName').median().reset_index()

In [5]:
# Ensuring that there aren't any duplicate ZIP codes in the rental table
booleanRentals = currentRentalPrices['RegionName'].duplicated().any()

In [14]:
# JOINING THE DATABASE, CLEANING, & CALCULATING RENT:SALES

combined = salesByZip.set_index('RegionName'
                                ).join(currentRentalPrices.set_index('RegionName'))
rentalsAndSales = combined.dropna()
rentalsAndSales['RentToSaleRatio'] = \
    rentalsAndSales['CurrentRentalPrice'] \
    / rentalsAndSales['CurrentSalesPrice']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rentalsAndSales['RentToSaleRatio'] = \


In [7]:
# FILTERING OUT THE OUTLIARS
rentalsAndSalesFiltered = rentalsAndSales[rentalsAndSales.RentToSaleRatio < .015]
rentalsAndSalesSorted = rentalsAndSalesFiltered.sort_values(by='RentToSaleRatio', ascending=False)

In [8]:
# EXPORT FOR MATHEMATICA IN 2 COLUMNS
# filepath = Path('../prototype/out.csv')  
# filepath.parent.mkdir(parents=True, exist_ok=True)  
# rentalsAndSalesSorted.loc[:,'RentToSaleRatio'][0:1800].to_csv(filepath)

In [10]:
# IMPORTING SHAPEFILES

shapefile = '../data/polygon/cb_2020_us_zcta520_500k.shp'
gdf = gpd.read_file(shapefile)

# A BIT OF DATA CLEANING
baseMap = rentalsAndSalesSorted.join(gdf.set_index('NAME20'
        )).dropna().sort_values('RegionName')
gdf1 = gpd.GeoDataFrame(baseMap, geometry='geometry')

# SETTING THE BASE MAP
m = folium.Map(location=[40.70, -98.94], zoom_start=4.0,
               tiles='CartoDB positron')
color_map = branca.colormap.LinearColormap(['red', 'green'],
        vmin=0.000, vmax=0.016)

# PLOTTING EACH POLYGON ON THE MAP
for (_, r) in gdf1.iterrows():
    shape_column = gpd.GeoSeries(r['geometry'
                                 ]).simplify(tolerance=0.001)
    color = color_map(r['RentToSaleRatio'])
    geo_j = shape_column.to_json()
    geo_j_json = json.loads(geo_j)
    geo_j_json['features'][0]['properties']['ratio'] = \
        r['RentToSaleRatio']
    geo_j = folium.GeoJson(data=geo_j_json, style_function=lambda x: {
            'fillColor': color_map(x['properties']['ratio']),
            'color': 'black',
            'weight': 0,
            'fillOpacity': 0.9,
            })
    folium.Popup(str('{:.2f}% <br> {} <br> ${:,.0f} <br> ${:,.0f} '.format(r['RentToSaleRatio'
                 ] * 100, str(r['GEOID20']).zfill(5),
                 r['CurrentSalesPrice'], r['CurrentRentalPrice'
                 ]))).add_to(geo_j)
    geo_j.add_to(m)
m.save('../web_build/index.html')


In [16]:
gdf1

Unnamed: 0_level_0,CurrentSalesPrice,CurrentRentalPrice,RentToSaleRatio,ZCTA5CE20,AFFGEOID20,GEOID20,LSAD20,ALAND20,AWATER20,geometry
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
01085,285000.00,1414.888748,0.004965,01085,860Z200US01085,01085,Z5,154690428,2940567,"POLYGON ((-72.85747 42.23004, -72.85786 42.234..."
01420,330000.00,1346.780971,0.004081,01420,860Z200US01420,01420,Z5,77850259,1793220,"POLYGON ((-71.86273 42.54866, -71.86251 42.552..."
01440,300000.00,1275.000000,0.004250,01440,860Z200US01440,01440,Z5,63474669,3252347,"POLYGON ((-72.05345 42.60543, -72.05437 42.608..."
01453,380000.00,1458.603833,0.003838,01453,860Z200US01453,01453,Z5,68269233,1190779,"POLYGON ((-71.84201 42.51463, -71.84006 42.514..."
01503,562500.00,2410.500000,0.004285,01503,860Z200US01503,01503,Z5,33598944,521374,"POLYGON ((-71.67652 42.39612, -71.66807 42.394..."
...,...,...,...,...,...,...,...,...,...,...
99508,312250.00,1415.893345,0.004534,99508,860Z200US99508,99508,Z5,18504540,211181,"POLYGON ((-149.86814 61.19785, -149.86816 61.2..."
99515,417000.00,1877.500000,0.004502,99515,860Z200US99515,99515,Z5,27386310,218720,"POLYGON ((-149.96401 61.12952, -149.96240 61.1..."
99517,350000.00,1270.805155,0.003631,99517,860Z200US99517,99517,Z5,8390841,319938,"POLYGON ((-149.97118 61.19143, -149.96774 61.1..."
99577,407500.00,2057.605861,0.005049,99577,860Z200US99577,99577,Z5,743777082,3107440,"POLYGON ((-149.61342 61.34739, -149.61329 61.3..."
