In [1]:
import os
import dotenv
import pandas as pd

# Load variables from a local .env file (if one exists) into the environment.
dotenv.load_dotenv()

# Root directory of the downloaded dataset; later cells build file paths from it.
path = os.getenv("HOUSING_DATA_PATH")
if not path:
    # Fail here with an actionable message instead of letting a later cell
    # crash with an opaque TypeError when it tries to build a path from None.
    raise RuntimeError(
        "HOUSING_DATA_PATH is not set; define it in the environment or in a .env file."
    )
print("Loading dataset from:", path)

Loading dataset from: /Users/jcw2016/.cache/kagglehub/datasets/shengkunwang/housets-dataset/versions/2


In [12]:
# Load the tabular dataset. The path is assembled with os.path.join (as the
# photo directory below is) instead of manual "/" string concatenation.
df = pd.read_csv(os.path.join(path, "HouseTS.csv"))

print("Column names:", df.columns.tolist())

# Directory holding the photo data; each sub-directory name is treated as a zipcode.
dmv_multi_data_dir = os.path.join(path, "DMV_Multi_Data", "DMV_Multi_Data")
if os.path.isdir(dmv_multi_data_dir):
    # Keep only entries that are themselves directories (stray files are ignored).
    photo_zipcodes = [
        name for name in os.listdir(dmv_multi_data_dir)
        if os.path.isdir(os.path.join(dmv_multi_data_dir, name))
    ]
else:
    # Photo data is absent: continue with an empty list so later cells still run.
    photo_zipcodes = []

print("Photo zipcodes:", sorted(photo_zipcodes))
print(f"Number of zipcodes with photo data: {len(photo_zipcodes)}")

Column names: ['date', 'median_sale_price', 'median_list_price', 'median_ppsf', 'median_list_ppsf', 'homes_sold', 'pending_sales', 'new_listings', 'inventory', 'median_dom', 'avg_sale_to_list', 'sold_above_list', 'off_market_in_two_weeks', 'city', 'zipcode', 'year', 'bank', 'bus', 'hospital', 'mall', 'park', 'restaurant', 'school', 'station', 'supermarket', 'Total Population', 'Median Age', 'Per Capita Income', 'Total Families Below Poverty', 'Total Housing Units', 'Median Rent', 'Median Home Value', 'Total Labor Force', 'Unemployed Population', 'Total School Age Population', 'Total School Enrollment', 'Median Commute Time', 'price', 'city_full']
Photo zipcodes: ['20001', '20002', '20003', '20004', '20005', '20006', '20007', '20008', '20009', '20010', '20011', '20012', '20015', '20016', '20017', '20018', '20019', '20020', '20024', '20032', '20036', '20037', '20105', '20106', '20109', '20110', '20111', '20112', '20115', '20117', '20119', '20120', '20121', '20124', '20129', '20130', '201

In [16]:
# Gather every distinct file name found in any zipcode folder, then show
# the names in sorted order.
all_files = set()
for zipcode in photo_zipcodes:
    folder_path = os.path.join(dmv_multi_data_dir, zipcode)
    if not os.path.isdir(folder_path):
        continue  # nothing to list for entries that are not directories
    all_files.update(os.listdir(folder_path))

print("Unique file names across all zipcode folders:", sorted(all_files), sep="\n")


Unique file names across all zipcode folders:
['2011.png', '2012.png', '2013.png', '2014.png', '2015.png', '2016.png', '2017.png', '2018.png', '2020.png', '2021.png', '2022.png', 'analysis.json', 'prices.json', 'test_year.txt']


In [21]:

from collections import Counter

# Tallies filled in by a single pass over the zipcode folders.
combo_counter = Counter()       # tuple of available years -> number of zipcodes
zipcodes_with_2022 = []         # zipcodes whose folder contains 2022.png
zipcodes_with_test_year = []    # zipcodes whose folder contains test_year.txt

for zipcode in photo_zipcodes:
    folder_path = os.path.join(dmv_multi_data_dir, zipcode)
    if not os.path.isdir(folder_path):
        continue
    files = os.listdir(folder_path)

    # Computed from the folder contents rather than printed as fixed literals,
    # so the reported figures always match the data on disk.
    if "2022.png" in files:
        zipcodes_with_2022.append(zipcode)
    if "test_year.txt" in files:
        zipcodes_with_test_year.append(zipcode)

    # A file counts as a yearly image when it is named "<digits>.png".
    # splitext strips only the extension, whereas str.replace would also
    # remove ".png" appearing in the middle of a name.
    years = sorted(
        int(stem)
        for stem, ext in map(os.path.splitext, files)
        if ext == ".png" and stem.isdigit()
    )
    if years:
        combo_counter[tuple(years)] += 1

print(f"Number of zipcodes with 2022.png: {len(zipcodes_with_2022)}")
print(f"Zipcodes with test_year.txt: {sorted(zipcodes_with_test_year)}")

print("Counts of year combinations across all zipcodes (as tuples):")
for combo, count in combo_counter.most_common():
    print(f"{combo}: {count}")



Number of zipcodes with 2022.png: 11
Zipcodes with test_year.txt: ['20851']
Counts of year combinations across all zipcodes (as tuples):
(2011, 2012, 2014, 2016, 2018, 2021): 125
(2011, 2013, 2015, 2017, 2018, 2021): 104
(2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2021): 51
(2011, 2013, 2015, 2018, 2021): 16
(2011, 2014, 2016, 2018, 2020, 2022): 2
(2011, 2014, 2016, 2018, 2022): 2
(2011, 2013, 2015, 2018, 2021, 2022): 2
(2011, 2012, 2014, 2016, 2018, 2021, 2022): 2
(2011, 2012, 2014, 2016, 2018, 2020, 2021, 2022): 1
(2011, 2014, 2016, 2018, 2021, 2022): 1
(2011, 2013, 2014, 2015, 2016, 2018, 2021, 2022): 1
(2011, 2012, 2013, 2014, 2015, 2016, 2018, 2021): 1


In [22]:
# Preview the first five rows of the loaded table (5 is DataFrame.head's default).
df.head(n=5)

Unnamed: 0,date,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,median_dom,...,Total Housing Units,Median Rent,Median Home Value,Total Labor Force,Unemployed Population,Total School Age Population,Total School Enrollment,Median Commute Time,price,city_full
0,2012-03-31,46550.0,217450.0,31.813674,110.183666,14.0,23.0,44.0,64.0,59.5,...,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,200773.999557,Atlanta-Sandy Springs-Alpharetta
1,2012-04-30,61870.0,245000.0,40.723982,130.528256,22.0,29.0,56.0,69.0,89.5,...,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,202421.064584,Atlanta-Sandy Springs-Alpharetta
2,2012-05-31,125500.0,217450.0,63.913043,119.919216,24.0,40.0,63.0,60.0,144.5,...,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,202681.309539,Atlanta-Sandy Springs-Alpharetta
3,2012-06-30,153000.0,189900.0,81.59808,105.617353,34.0,46.0,50.0,57.0,126.0,...,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,202998.603897,Atlanta-Sandy Springs-Alpharetta
4,2012-07-31,165500.0,154000.0,81.59808,83.921175,39.0,49.0,42.0,50.0,80.0,...,2677.0,710.0,279500.0,3171.0,460.0,5408.0,5408.0,2492.0,203781.903446,Atlanta-Sandy Springs-Alpharetta
