In [104]:
import pandas as pd
import os
import numpy as np
import string

In [105]:
def search_homes(filename, date, keyword):
    """
    Count the unique homes in one listings file whose text mentions a keyword.

    The file is loaded with ``pd.read_json`` and de-duplicated on the listing
    columns (availability, price, beds, baths, sqft, address, community, URL).
    For every remaining home, each string found in the lists stored under
    ``Data_Dump`` is lower-cased, has line breaks, trademark/footnote symbols
    and all punctuation except '-' deleted, and is split on whitespace.
    A home is counted when the lower-cased keyword equals one of those words.

    Args:
        filename: path of a JSON file readable by ``pd.read_json``; it must
            contain the de-duplication columns and a ``Data_Dump`` column whose
            entries are dicts mapping to lists of strings.
        date: not used by this function; kept so existing callers keep working.
        keyword: word to look for (case-insensitive). Matching is on whole
            whitespace-delimited words, so a keyword containing spaces or
            punctuation other than '-' can never match.

    Returns:
        home_count: # of unique homes where the keyword is present
    """
    lennar = pd.read_json(filename)
    clean = lennar.drop_duplicates(
        subset=['Availability', 'Price', 'Beds', 'Baths', 'Sqft', 'Address', 'Community', 'URL'],
        keep='first',
        ignore_index=True)

    keyword = keyword.lower()

    # Built once instead of once per line: deletes line breaks, the symbols we
    # don't want, and every punctuation character other than the dash.
    unwanted = '\n\r®™¹”' + string.punctuation.replace('-', '')
    delete_unwanted = str.maketrans('', '', unwanted)

    home_count = 0
    for d in clean.Data_Dump:
        words = set()
        for sublist in d.values():
            for item in sublist:
                # split() already discards leading/trailing whitespace
                words.update(item.lower().translate(delete_unwanted).split())

        if keyword in words:
            home_count += 1

    return home_count

In [106]:
def market_analysis(date, keyword):
    """
    Build a per-county table of home counts and keyword-home counts.

    Walks ``./data-{date}/`` for ``.json`` files. The state is taken from the
    first directory level below the data folder and the county from the next
    path component, truncated at ``-final``
    (i.e. ``./data-{date}/<state>/<county>-final*.json``).

    Args:
        date: snapshot label; selects the ``./data-{date}/`` folder and is
            embedded in the output column names.
        keyword: word passed on to ``search_homes``; also embedded in the
            output column name.

    Returns:
        DataFrame with one row per county file and the columns
        ``county``, ``state``, ``n_homes_{date}`` and ``{keyword}_homes_{date}``.
    """
    path = f'./data-{date}/'
    json_files = [
        root + '/' + name
        for root, dirs, files in os.walk(path)
        for name in files
        if name.endswith('.json')
    ]

    columns = ['county', 'state', f'n_homes_{date}', f'{keyword}_homes_{date}']
    rows = []
    for jf in json_files:
        parts = jf.split('/')
        state = parts[2]
        county = parts[3].split('-final')[0]

        # Skip Jupyter autosave copies before spending any I/O on them.
        if '.ipynb_checkpoints' in county:
            continue

        # NOTE(review): n_homes counts every row in the file, while
        # search_homes de-duplicates listings before counting keyword hits --
        # confirm the two are meant to be compared.
        n_homes = len(pd.read_json(jf))
        rows.append([county, state, n_homes, search_homes(jf, date, keyword)])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns)

In [107]:
# Per-county home counts and 'ring' keyword counts from ./data-06-2023/
df1 = market_analysis(date='06-2023', keyword='ring')

In [108]:
# Sanity check: (rows, columns) -- one row per county file, four columns
df1.shape

(70, 4)

In [109]:
# Per-county home counts and 'ring' keyword counts from ./data-10-2022/
df2 = market_analysis(date='10-2022', keyword='ring')

In [110]:
# Sanity check: (rows, columns) -- one row per county file, four columns
df2.shape

(67, 4)

In [111]:
# Outer join on (county, state): a county present in only one snapshot is
# kept, with NaN in the other snapshot's count columns.
counties = pd.merge(df1, df2, how='outer', on=['county', 'state'])

In [112]:
# Display the merged per-county table
counties

Unnamed: 0,county,state,n_homes_06-2023,ring_homes_06-2023,n_homes_10-2022,ring_homes_10-2022
0,raleigh,north-carolina,177.0,157.0,319.0,285.0
1,wilmington,north-carolina,110.0,0.0,103.0,8.0
2,charlotte,north-carolina,242.0,6.0,199.0,171.0
3,greensboro,north-carolina,3.0,3.0,,
4,chicago,illinois,185.0,185.0,192.0,141.0
...,...,...,...,...,...,...
73,inland-northwest,washington,,,23.0,23.0
74,seattle,washington,,,175.0,166.0
75,jefferson-county,west-virginia,,,24.0,5.0
76,madison,wisconsin,,,27.0,0.0


In [113]:
# Aggregate the county counts to state level. Only the four count columns are
# summed: on pandas >= 2.0 a bare .sum() would also "sum" (concatenate) the
# string 'county' column, and selecting the columns explicitly keeps this cell
# re-runnable after the pct-change columns have been added to `counties`.
# NaN counts (county absent from one snapshot) are skipped by the sum.
count_cols = ['n_homes_06-2023', 'ring_homes_06-2023', 'n_homes_10-2022', 'ring_homes_10-2022']
states = counties.groupby('state')[count_cols].sum()

In [114]:
# Display the per-state totals
states

Unnamed: 0_level_0,n_homes_06-2023,ring_homes_06-2023,n_homes_10-2022,ring_homes_10-2022
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
alabama,301.0,39.0,175.0,0.0
arizona,319.0,259.0,343.0,334.0
california,1516.0,1016.0,1281.0,1104.0
colorado,218.0,0.0,277.0,251.0
delaware,148.0,88.0,183.0,180.0
florida,1974.0,1054.0,2109.0,1000.0
georgia,93.0,54.0,105.0,91.0
idaho,74.0,6.0,22.0,19.0
illinois,185.0,185.0,192.0,141.0
indiana,346.0,328.0,296.0,260.0


In [115]:
# % changes and save
def percentage_change(col1,col2):
    """
    Percentage change from ``col1`` (baseline) to ``col2``, rounded to 2 decimals.

    Works on plain numbers and element-wise on pandas Series.  With Series, a
    zero baseline yields inf/-inf (or NaN for 0 -> 0) and a missing value
    yields NaN; with plain numbers a zero baseline raises ZeroDivisionError.
    """
    delta = col2 - col1
    relative = delta / col1
    return round(relative * 100, 2)

In [117]:
# Percent change per county from the 10-2022 snapshot (baseline) to 06-2023.
# Counties missing from either snapshot get NaN.
counties['pct-change-N-homes'] = percentage_change(
    col1=counties['n_homes_10-2022'],
    col2=counties['n_homes_06-2023'],
)
# NOTE(review): this column holds the change in 'ring' homes; the 'usb' label
# looks left over from another keyword run -- confirm before renaming, since
# the name ends up in the exported CSV.
counties['pct-change-usb'] = percentage_change(
    col1=counties['ring_homes_10-2022'],
    col2=counties['ring_homes_06-2023'],
)

In [118]:
# Create the output folder first so this cell also works on a fresh checkout.
os.makedirs('./analysis', exist_ok=True)
counties.to_csv('./analysis/ring-by-county.csv', index=False)

In [119]:
# Percent change per state from the 10-2022 snapshot (baseline) to 06-2023.
states['pct-change-N-homes'] = percentage_change(states['n_homes_10-2022'],states['n_homes_06-2023'])
# NOTE(review): this column holds the change in 'ring' homes; the 'usb' label
# looks left over from another keyword run -- confirm before renaming, since
# the name ends up in the exported CSV.
states['pct-change-usb'] = percentage_change(states['ring_homes_10-2022'],states['ring_homes_06-2023'])
# Create the output folder first so this cell also works on a fresh checkout.
# The index is kept in the CSV because it carries the state name.
os.makedirs('./analysis', exist_ok=True)
states.to_csv('./analysis/ring-by-state.csv')