In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


## Zillow File Info for reference

### Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
Zillow Home Value Index (ZHVI) A measure of the typical home value and market changes across a given region and housing type

### Zip_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv 
Zillow Home Value Forecast (ZHVF)

### Metro_sales_count_now_uc_sfrcondo_month.csv 
Sales Count Nowcast is the estimated number of unique properties that sold during the month after accounting for the latency between when sales occur and when they are reported.

### Metro_invt_fs_uc_sfrcondo_sm_month.csv
FOR-SALE LISTINGS
- For-Sale Inventory: The count of unique listings that were active at any time in a given month.
- New Listings: Indicates how many new listings have come on the market in a given month.
- Newly Pending Listings: The count of listings that changed from for-sale to pending status on Zillow.com in a given time period.
- Median List Price: The median price at which homes across various geographies were listed.

### Metro_sales_count_now_uc_sfrcondo_month.csv
SALES
The Sales Count Nowcast is the estimated number of unique properties that sold during the month after accounting for the latency between when sales occur and when they are reported. Available only for the raw cut of all homes.
- Sale Price (median/mean): The price at which homes across various geographies were sold.
- Total Transaction Value: The total dollar value of all homes sold in a given period (mean sale price x sales count)
- Sale-to-List Ratio (mean/median): Ratio of sale vs. final list price.
- Percent of Sales Below/Above List: Share of sales where sale price below/above the final list price; excludes homes sold for exactly the list price.

### Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv
DAYS ON MARKET AND PRICE CUTS
- Days to Pending: How long it takes homes in a region to change to pending status on Zillow.com after first being shown as for sale. The reported figure indicates the number of days (mean or median) that it took for homes that went pending during the week being reported, to go pending. This differs from the old “Days on Zillow” metric in that it excludes the in-contract period before a home sells.
- Days to Close (mean/median): Number of days between the listing going pending and the sale date.
- Share of Listings With a Price Cut: The number of unique properties with a list price at the end of the month that’s less than the list price at the beginning of the month, divided by the number of unique properties with an active listing at some point during the month.
- Price Cuts: The mean and median price cut for listings in a given region during a given time period, expressed as both dollars ($) and as a percentage (%) of list price.
Note: “Weekly” means you can see the metric’s value on a weekly cadence. The CSVs are updated monthly.



## Read in Data

In [9]:
# read in all the files in the Data/zillow folder

# print working directory
print(os.getcwd())

for file in os.listdir('../Data/zillow'):
    df = pd.read_csv('../Data/zillow/' + file)
    # print the name of the file
    print(file)
    print(df.head())
    # how many rows and columns are in each file
    print(df.shape)
    # how many NaNs are in each file
    print(df.isna().sum())
    # how many unique values are in each file
    print(df.nunique())



/Users/bermo/github/capstone/real_estate_analysis/Notebooks
Metro_market_temp_index_uc_sfrcondo_month.csv
   RegionID  SizeRank       RegionName RegionType StateName  2018-01-31  \
0    102001         0    United States    country       NaN        49.0   
1    394913         1     New York, NY        msa        NY        52.0   
2    753899         2  Los Angeles, CA        msa        CA        68.0   
3    394463         3      Chicago, IL        msa        IL        48.0   
4    394514         4       Dallas, TX        msa        TX        56.0   

   2018-02-28  2018-03-31  2018-04-30  2018-05-31  ...  2023-11-30  \
0        50.0        52.0        54.0        55.0  ...        54.0   
1        52.0        55.0        57.0        56.0  ...        67.0   
2        66.0        65.0        66.0        66.0  ...        72.0   
3        49.0        51.0        52.0        50.0  ...        59.0   
4        56.0        58.0        60.0        60.0  ...        58.0   

   2023-12-31  2024-01

In [11]:
# the two files starting with "Zip" are the zip code aggregations, and the rest are metro level aggregations
# print the shape of each file
for file in os.listdir('../Data/zillow'):
    if file.startswith('Zip'):
        print(file, df.shape)
    else:
        print(file, df.shape)

# 

Zip_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv (928, 83)
Metro_market_temp_index_uc_sfrcondo_month.csv (928, 83)
Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv (928, 83)
Metro_sales_count_now_uc_sfrcondo_month.csv (928, 83)
Metro_new_con_sales_count_raw_uc_sfrcondo_month.csv (928, 83)
Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv (928, 83)
Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month (1).csv (928, 83)
Metro_invt_fs_uc_sfrcondo_sm_month.csv (928, 83)


In [21]:
# print head of Zip_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
df = pd.read_csv('../Data/zillow/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
print(df.head())

# print shape of df
print(df.shape)

# print info of df
print(df.info())

# how many unique values are in the "RegionID" column
print(df['RegionID'].nunique())

# how many unique values are in the "RegionName" column
print(df['RegionName'].nunique())

# how many unique values are in the "StateName" column
print(df['StateName'].nunique())

# count of rows for each value of "Metro"
print(df['Metro'].value_counts())

# count of unique values in the "Metro" column
print(df['Metro'].nunique())



   RegionID  SizeRank  RegionName RegionType StateName State      City  \
0     91982         1       77494        zip        TX    TX      Katy   
1     61148         2        8701        zip        NJ    NJ  Lakewood   
2     91940         3       77449        zip        TX    TX      Katy   
3     62080         4       11368        zip        NY    NY  New York   
4     91733         5       77084        zip        TX    TX   Houston   

                                   Metro        CountyName     2000-01-31  \
0   Houston-The Woodlands-Sugar Land, TX  Fort Bend County  212757.568407   
1  New York-Newark-Jersey City, NY-NJ-PA      Ocean County  139171.687163   
2   Houston-The Woodlands-Sugar Land, TX     Harris County  103736.197118   
3  New York-Newark-Jersey City, NY-NJ-PA     Queens County  147341.057200   
4   Houston-The Woodlands-Sugar Land, TX     Harris County  102296.421002   

   ...     2023-11-30     2023-12-31     2024-01-31     2024-02-29  \
0  ...  492924.222260 

In [18]:
# get info for Data/zillow/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month (1).csv
df = pd.read_csv('../Data/zillow/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month (1).csv')
print(df.head())

# print shape of df
print(df.shape)

# print info of df
print(df.info())



   RegionID  SizeRank       RegionName RegionType StateName     2000-01-31  \
0    102001         0    United States    country       NaN  122710.838539   
1    394913         1     New York, NY        msa        NY  218233.216724   
2    753899         2  Los Angeles, CA        msa        CA  222104.914025   
3    394463         3      Chicago, IL        msa        IL  153956.315498   
4    394514         4       Dallas, TX        msa        TX  126111.301852   

      2000-02-29     2000-03-31     2000-04-30     2000-05-31  ...  \
0  122926.726242  123194.458782  123768.692526  124429.566719  ...   
1  219160.584763  220096.541902  221993.153236  223957.159952  ...   
2  222931.426109  224032.332323  226222.765868  228618.413461  ...   
3  154098.834197  154370.263570  155045.512395  155857.755512  ...   
4  126167.472266  126232.045908  126399.736872  126620.832817  ...   

      2023-11-30     2023-12-31     2024-01-31     2024-02-29     2024-03-31  \
0  353806.242676  354242.71700

In [24]:
# merge the zhvi file and the Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv file
zip_zhvi = pd.read_csv('../Data/zillow/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
metro_doz = pd.read_csv('../Data/zillow/Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv')

# print head of zip_zhvi
print(zip_zhvi.head())

# print head of metro_doz
print(metro_doz.head())

# merge the two files on "RegionName" and "Metro", repeating the metro values for each zip code
merged_df = pd.merge(zip_zhvi, metro_doz, left_on='Metro', right_on='Metro', how='left')

# print head of merged_df
print(merged_df.head())

# print shape of merged_df
print(merged_df.shape)



   RegionID  SizeRank  RegionName RegionType StateName State      City  \
0     91982         1       77494        zip        TX    TX      Katy   
1     61148         2        8701        zip        NJ    NJ  Lakewood   
2     91940         3       77449        zip        TX    TX      Katy   
3     62080         4       11368        zip        NY    NY  New York   
4     91733         5       77084        zip        TX    TX   Houston   

                                   Metro        CountyName     2000-01-31  \
0   Houston-The Woodlands-Sugar Land, TX  Fort Bend County  212757.568407   
1  New York-Newark-Jersey City, NY-NJ-PA      Ocean County  139171.687163   
2   Houston-The Woodlands-Sugar Land, TX     Harris County  103736.197118   
3  New York-Newark-Jersey City, NY-NJ-PA     Queens County  147341.057200   
4   Houston-The Woodlands-Sugar Land, TX     Harris County  102296.421002   

   ...     2023-11-30     2023-12-31     2024-01-31     2024-02-29  \
0  ...  492924.222260 

KeyError: 'Metro'

In [31]:


# Load the datasets
# mean doz dataset
metro_doz = pd.read_csv('../Data/zillow/Metro_mean_doz_pending_uc_sfrcondo_sm_month.csv')
# zhvi dataset
zip_zhvi = pd.read_csv('../Data/zillow/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')

# print heads for each
print(metro_doz.head())
print(zip_zhvi.head())


# extract the metro area name from the "Metro" column of the zip_zhvi dataframe (convert to string)
zip_zhvi['Metro_Name'] = zip_zhvi['Metro'].astype(str).apply(lambda x: x.split('-')[0])

# get everything before the "," in the RegionName column of the metro_doz dataframe and name it "Metro_Name"
metro_doz['Metro_Name'] = metro_doz['RegionName'].apply(lambda x: x.split(',')[0])

# now merge the two datasets on "Metro_Name"
merged_df = pd.merge(zip_zhvi, metro_doz, on='Metro_Name', how='left')

# Display the merged result
print(merged_df.head())

# Save the merged dataframe to a new CSV
merged_df.to_csv('merged_zip_metro.csv', index=False)


   RegionID  SizeRank       RegionName RegionType StateName  2018-03-31  \
0    102001         0    United States    country       NaN        51.0   
1    394913         1     New York, NY        msa        NY        68.0   
2    753899         2  Los Angeles, CA        msa        CA        35.0   
3    394463         3      Chicago, IL        msa        IL        51.0   
4    394514         4       Dallas, TX        msa        TX        46.0   

   2018-04-30  2018-05-31  2018-06-30  2018-07-31  ...  2023-11-30  \
0        48.0        45.0        43.0        44.0  ...        39.0   
1        65.0        63.0        64.0        66.0  ...        57.0   
2        32.0        31.0        32.0        33.0  ...        33.0   
3        44.0        40.0        39.0        40.0  ...        30.0   
4        42.0        38.0        37.0        38.0  ...        40.0   

   2023-12-31  2024-01-31  2024-02-29  2024-03-31  2024-04-30  2024-05-31  \
0        44.0        49.0        52.0        49.0  

In [35]:
# show number of rows in merged_df
print(merged_df.shape)

# show number of unique values in "Metro_Name" column
print(merged_df['Metro_Name'].nunique())

# get number of rows for each Metro_Name
print(merged_df['Metro_Name'].value_counts())




(28220, 389)
922
Metro_Name
nan                 5690
New York             830
Washington           628
Portland             382
Chicago              376
                    ... 
Craig, CO              1
Deming, NM             1
Connersville, IN       1
Grants, NM             1
Pecos, TX              1
Name: count, Length: 922, dtype: int64
Empty DataFrame
Columns: [RegionID_x, SizeRank_x, RegionName_x, RegionType_x, StateName_x, State, City, Metro, CountyName, 2000-01-31, 2000-02-29, 2000-03-31, 2000-04-30, 2000-05-31, 2000-06-30, 2000-07-31, 2000-08-31, 2000-09-30, 2000-10-31, 2000-11-30, 2000-12-31, 2001-01-31, 2001-02-28, 2001-03-31, 2001-04-30, 2001-05-31, 2001-06-30, 2001-07-31, 2001-08-31, 2001-09-30, 2001-10-31, 2001-11-30, 2001-12-31, 2002-01-31, 2002-02-28, 2002-03-31, 2002-04-30, 2002-05-31, 2002-06-30, 2002-07-31, 2002-08-31, 2002-09-30, 2002-10-31, 2002-11-30, 2002-12-31, 2003-01-31, 2003-02-28, 2003-03-31, 2003-04-30, 2003-05-31, 2003-06-30, 2003-07-31, 2003-08-31, 2003-09

In [41]:
# merge the other metro datasets
metro_invt = pd.read_csv('../Data/zillow/Metro_invt_fs_uc_sfrcondo_sm_month.csv')
metro_sales = pd.read_csv('../Data/zillow/Metro_sales_count_now_uc_sfrcondo_month.csv')
metro_market_temp = pd.read_csv('../Data/zillow/Metro_market_temp_index_uc_sfrcondo_month.csv')
metro_new = pd.read_csv('../Data/zillow/Metro_new_con_sales_count_raw_uc_sfrcondo_month.csv')

# add suffix to the columns of each of the metro datasets
metro_invt.columns = [col + '_invt' for col in metro_invt.columns]
metro_sales.columns = [col + '_sales' for col in metro_sales.columns]
metro_market_temp.columns = [col + '_market_temp' for col in metro_market_temp.columns]
metro_new.columns = [col + '_new' for col in metro_new.columns]

# create metro_name column for each of the metro datasets
metro_invt['Metro_Name'] = metro_invt['RegionName_invt'].apply(lambda x: x.split(',')[0])
metro_sales['Metro_Name'] = metro_sales['RegionName_sales'].apply(lambda x: x.split(',')[0])
metro_market_temp['Metro_Name'] = metro_market_temp['RegionName_market_temp'].apply(lambda x: x.split(',')[0])
metro_new['Metro_Name'] = metro_new['RegionName_new'].apply(lambda x: x.split(',')[0])

# merge to the merged_df on "Metro_Name"
merged_df = pd.merge(merged_df, metro_invt, on='Metro_Name', how='left')
merged_df = pd.merge(merged_df, metro_sales, on='Metro_Name', how='left')
merged_df = pd.merge(merged_df, metro_market_temp, on='Metro_Name', how='left')
merged_df = pd.merge(merged_df, metro_new, on='Metro_Name', how='left')

# print head of merged_df
print(merged_df.head())

# print shape of merged_df
print(merged_df.shape)




   RegionID_x  SizeRank_x  RegionName_x RegionType_x StateName_x State  \
0       91982           1         77494          zip          TX    TX   
1       61148           2          8701          zip          NJ    NJ   
2       91940           3         77449          zip          TX    TX   
3       62080           4         11368          zip          NY    NY   
4       91733           5         77084          zip          TX    TX   

       City                                  Metro        CountyName  \
0      Katy   Houston-The Woodlands-Sugar Land, TX  Fort Bend County   
1  Lakewood  New York-Newark-Jersey City, NY-NJ-PA      Ocean County   
2      Katy   Houston-The Woodlands-Sugar Land, TX     Harris County   
3  New York  New York-Newark-Jersey City, NY-NJ-PA     Queens County   
4   Houston   Houston-The Woodlands-Sugar Land, TX     Harris County   

      2000-01-31  ...  2023-10-31_new  2023-11-30_new  2023-12-31_new  \
0  212757.568407  ...          1577.0          18