In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [80]:
# import the CC data
CC = pd.read_csv("../Data/CC_cleaned.csv")

# now realtor data
realtor = pd.read_csv("../Data/realtor/realtor_zip_monthly_historical.csv")

# print head
print(CC.head())

# print head
print(realtor.head())



  realtor = pd.read_csv("../Data/realtor/realtor_zip_monthly_historical.csv")


    ZIP  SOLDPRICE  SQFT  BEDS  BATHS  AGE  DOM      lat     long  zip_cluster
0  2474   410000.0  1368     2    2.0    3  897  42.4202 -71.1565       2474.0
1  2124   327000.0  1850     2    2.5    5  658  42.2918 -71.0717       2124.0
2  2184   331000.0  1469     2    1.0   38  467  42.2093 -70.9963       2184.0
3  1949   580000.0  2945     2    2.5    4  565  42.5942 -71.0130       1949.0
4  2139  1000000.0  2536     3    2.5    3   56  42.3647 -71.1042       2139.0
  month_date_yyyymm  postal_code            zip_name  median_listing_price  \
0            202408      32343.0          midway, fl              304950.0   
1            202408      62896.0  west frankfort, il              125000.0   
2            202408      56477.0          sebeka, mn              302400.0   
3            202408      25315.0      charleston, wv               65000.0   
4            202408      76454.0          gorman, tx              184500.0   

   median_listing_price_mm  median_listing_price_yy  acti

In [81]:
# filter the realtor data to just the zip codes in the CC data
realtor = realtor[realtor['postal_code'].isin(CC['ZIP'])]

# print head
print(realtor.head())



    month_date_yyyymm  postal_code       zip_name  median_listing_price  \
68             202408       2330.0     carver, ma              684999.0   
308            202408       2332.0    duxbury, ma             1669500.0   
320            202408       1983.0  topsfield, ma              949950.0   
322            202408       2199.0     boston, ma             2992500.0   
399            202408       1876.0  tewksbury, ma              702000.0   

     median_listing_price_mm  median_listing_price_yy  active_listing_count  \
68                   -0.0028                   0.1687                  11.0   
308                   0.0771                   0.0600                  21.0   
320                  -0.0318                  -0.0316                  10.0   
322                   0.0008                  -0.2443                   8.0   
399                   0.0801                   0.2028                  17.0   

     active_listing_count_mm  active_listing_count_yy  median_days_on_mark

In [82]:
# count unique zip codes
print(realtor['postal_code'].nunique())

# count unique zip codes
print(CC['ZIP'].nunique())




359
367


In [83]:
# filter to rows where month_date_yyyymm is 202408 or 201706
realtor = realtor[realtor['month_date_yyyymm'].isin([202408, 201706])]


# print head
print(realtor.head())





    month_date_yyyymm  postal_code       zip_name  median_listing_price  \
68             202408       2330.0     carver, ma              684999.0   
308            202408       2332.0    duxbury, ma             1669500.0   
320            202408       1983.0  topsfield, ma              949950.0   
322            202408       2199.0     boston, ma             2992500.0   
399            202408       1876.0  tewksbury, ma              702000.0   

     median_listing_price_mm  median_listing_price_yy  active_listing_count  \
68                   -0.0028                   0.1687                  11.0   
308                   0.0771                   0.0600                  21.0   
320                  -0.0318                  -0.0316                  10.0   
322                   0.0008                  -0.2443                   8.0   
399                   0.0801                   0.2028                  17.0   

     active_listing_count_mm  active_listing_count_yy  median_days_on_mark

In [84]:
# we want two dataframes, one with the average_listing_price and one with the median_days_on_market. each row should be a postal_code
# listing price df, with one row per postal_code and 202408 and 201706 as new columns, with the average_listing_price as the value
# First, let's handle duplicate entries by taking the mean of duplicate values
price_df = realtor.groupby(['postal_code', 'month_date_yyyymm'])['average_listing_price'].mean().unstack()
# rename date columns to have _price suffix
price_df.columns = [str(col) + '_price' for col in [202408, 201706]]


# Rename columns for clarity
price_df.columns.name = None
price_df = price_df.reset_index()
# make new column for the price multiplier
price_df['price_multiplier'] = price_df['202408_price'] / price_df['201706_price']

# do same for median_days_on_market
days_df = realtor.groupby(['postal_code', 'month_date_yyyymm'])['median_days_on_market'].mean().unstack()
# rename date columns to have _days suffix
days_df.columns = [str(col) + '_days' for col in [202408, 201706]]
days_df.columns.name = None
days_df = days_df.reset_index()
# make new column for the days_on_market multiplier
days_df['days_on_market_multiplier'] = days_df['202408_days'] / days_df['201706_days']

# get head of days_df
print(price_df.head())

# get head
print(days_df.head())

















   postal_code  202408_price  201706_price  price_multiplier
0       1083.0      274365.0      474850.0          0.577793
1       1331.0      220216.0      470785.0          0.467763
2       1373.0      341204.0      533957.0          0.639010
3       1420.0      208032.0      451581.0          0.460675
4       1430.0      298611.0      538866.0          0.554147
   postal_code  202408_days  201706_days  days_on_market_multiplier
0       1083.0         68.0         22.0                   3.090909
1       1331.0         51.0         27.0                   1.888889
2       1373.0         67.0         26.0                   2.576923
3       1420.0         37.0         39.0                   0.948718
4       1430.0         78.0         54.0                   1.444444


In [85]:
# add the multipliers to the CC dataframe based on ZIP by merging on CC[ZIP] to realtor postal_code
CC = CC.merge(price_df[['postal_code', 'price_multiplier']], left_on='ZIP', right_on='postal_code', how='left')
CC = CC.merge(days_df[['postal_code', 'days_on_market_multiplier']], left_on='ZIP', right_on='postal_code', how='left')

# only add the multipliers from price_df and days_df, and all the previous columns from CC
# ZIP, SOLD_PRICE, SQFT, BEDS, BATHS, AGE, DOM, lat, long, zip_cluster, price_multiplier, days_on_market_multiplier
CC = CC[['ZIP', 'SOLDPRICE', 'SQFT', 'BEDS', 'BATHS', 'AGE', 'DOM', 'lat', 'long', 'zip_cluster', 'price_multiplier', 'days_on_market_multiplier']]

# get head
print(CC.head())



















    ZIP  SOLDPRICE  SQFT  BEDS  BATHS  AGE  DOM      lat     long  \
0  2474   410000.0  1368     2    2.0    3  897  42.4202 -71.1565   
1  2124   327000.0  1850     2    2.5    5  658  42.2918 -71.0717   
2  2184   331000.0  1469     2    1.0   38  467  42.2093 -70.9963   
3  1949   580000.0  2945     2    2.5    4  565  42.5942 -71.0130   
4  2139  1000000.0  2536     3    2.5    3   56  42.3647 -71.1042   

   zip_cluster  price_multiplier  days_on_market_multiplier  
0       2474.0          0.492502                   0.352941  
1       2124.0          0.753723                   1.320000  
2       2184.0          0.660431                   0.435897  
3       1949.0          0.621947                   1.500000  
4       2139.0          0.662793                   0.240741  


In [86]:
# impute mean for the price_multiplier and days_on_market_multiplier
CC['price_multiplier'] = CC['price_multiplier'].fillna(CC['price_multiplier'].mean())
CC['days_on_market_multiplier'] = CC['days_on_market_multiplier'].fillna(CC['days_on_market_multiplier'].mean())

# get head
print(CC.head())





    ZIP  SOLDPRICE  SQFT  BEDS  BATHS  AGE  DOM      lat     long  \
0  2474   410000.0  1368     2    2.0    3  897  42.4202 -71.1565   
1  2124   327000.0  1850     2    2.5    5  658  42.2918 -71.0717   
2  2184   331000.0  1469     2    1.0   38  467  42.2093 -70.9963   
3  1949   580000.0  2945     2    2.5    4  565  42.5942 -71.0130   
4  2139  1000000.0  2536     3    2.5    3   56  42.3647 -71.1042   

   zip_cluster  price_multiplier  days_on_market_multiplier  
0       2474.0          0.492502                   0.352941  
1       2124.0          0.753723                   1.320000  
2       2184.0          0.660431                   0.435897  
3       1949.0          0.621947                   1.500000  
4       2139.0          0.662793                   0.240741  


In [87]:
# create a new column for adjusted_sold_price and adjusted_dom
CC['adjusted_sold_price'] = CC['SOLDPRICE'] * CC['price_multiplier']
CC['adjusted_dom'] = CC['DOM'] * CC['days_on_market_multiplier']

# get head
print(CC.head())






    ZIP  SOLDPRICE  SQFT  BEDS  BATHS  AGE  DOM      lat     long  \
0  2474   410000.0  1368     2    2.0    3  897  42.4202 -71.1565   
1  2124   327000.0  1850     2    2.5    5  658  42.2918 -71.0717   
2  2184   331000.0  1469     2    1.0   38  467  42.2093 -70.9963   
3  1949   580000.0  2945     2    2.5    4  565  42.5942 -71.0130   
4  2139  1000000.0  2536     3    2.5    3   56  42.3647 -71.1042   

   zip_cluster  price_multiplier  days_on_market_multiplier  \
0       2474.0          0.492502                   0.352941   
1       2124.0          0.753723                   1.320000   
2       2184.0          0.660431                   0.435897   
3       1949.0          0.621947                   1.500000   
4       2139.0          0.662793                   0.240741   

   adjusted_sold_price  adjusted_dom  
0        201925.858031    316.588235  
1        246467.513766    868.560000  
2        218602.614649    203.564103  
3        360729.317692    847.500000  
4        66

In [88]:
# write to csv
CC.to_csv("../Data/CC_realtor_merged_cleaned_adjusted.csv", index=False)

In [4]:
# show DOM over time for the entire dataset
plt.figure(figsize=(10, 6))
sns.lineplot(x='month_date_yyyymm', y='adjusted_dom', data=realtor)
plt.title('Median Days on Market over Time')
plt.xlabel('Month')
plt.ylabel('Median Days on Market')
plt.show()

NameError: name 'realtor' is not defined

<Figure size 1000x600 with 0 Axes>