In [1]:
import pandas as pd
from pycaret.regression import *
from pycaret.regression import load_model, predict_model
import datetime
import os

In [2]:
# Make sure the output folder for the pickled per-county models exists.
# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race of a separate isdir() test followed by mkdir().
os.makedirs('trained_models', exist_ok=True)

In [3]:
# Read the raw housing dataset and add a datetime 'date' column parsed from
# the 'period_end' strings (expected in YYYY-MM-DD form).
housing_csv = "HousingRecommenderFinalDataset.csv"
df_data = pd.read_csv(housing_csv)
df_data = df_data.assign(
    date=pd.to_datetime(df_data['period_end'], format='%Y-%m-%d')
)

In [4]:
# Keep only the rows whose property_type is 'All Residential' and renumber
# the index from zero.
is_all_residential = df_data['property_type'].eq('All Residential')
df_res = df_data.loc[is_all_residential].reset_index(drop=True)

In [5]:
# Narrow the frame down to the date, the county/state identifiers and the
# target (median price per square foot).
columns = ['date', 'county', 'state_code', 'median_ppsf']
df_res = df_res.reindex(columns=columns)

In [6]:
# Combine county and state into a single "<county>-<state_code>" key
# (e.g. "Morgan County-AL") that identifies one series per county, then drop
# the two source columns.
# Vectorized string concatenation replaces the row-wise apply(), which invoked
# a Python lambda once per row.
# NOTE: a row with a missing county or state_code now gets a NaN key instead
# of raising TypeError inside '-'.join().
df_res['county_state'] = df_res['county'] + '-' + df_res['state_code']
df_res = df_res.drop(columns=['county', 'state_code'])

In [7]:
# Derive calendar features from the datetime 'date' column.
# The vectorized .dt accessor replaces a Python-level loop over every
# timestamp and yields the same month / year values.
df_res['month'] = df_res['date'].dt.month
df_res['year'] = df_res['date'].dt.year

In [9]:
# Distinct county-state keys, in order of first appearance
counties = pd.unique(df_res['county_state'])

# Containers filled by the per-county training loop
all_results = list()   # top leaderboard row for each county
final_model = dict()   # county-state key -> finalized model

In [10]:
# Number of distinct county-state keys, i.e. how many models will be trained
len(counties)

1860

In [70]:
# Train one regression model per county-state series.
#
# For every key in `counties` this cell:
#   1. runs PyCaret setup() on that county's rows only (80/20 unshuffled
#      split, 3-fold time-series cross-validation, fixed session_id),
#   2. ranks the candidate models by MAE with compare_models(),
#   3. stores the top leaderboard row, tagged with the county, in all_results,
#   4. refits the winner with finalize_model(), keeps it in final_model and
#      pickles it under trained_models/.
#
# NOTE: the cell appends to all_results, so re-running it without first
# re-initialising that list duplicates leaderboard rows.
for county in counties:

    df_county = df_res[df_res['county_state'] == county]

    # initialize setup
    # NOTE(review): 'month' is listed in BOTH numeric_features and
    # categorical_features. The two declarations conflict and it is not
    # evident from this notebook which one PyCaret honours -- confirm the
    # intended type and remove 'month' from the other list.
    s = setup(df_county, target = 'median_ppsf', train_size = 0.80,
              data_split_shuffle = False, fold_strategy = 'timeseries', fold = 3,
              ignore_features = ['date', 'county_state'],
              numeric_features = ['month', 'year'],
              categorical_features = ['month'],
              silent = True, verbose = False, session_id = 123)

    # get the best model using MAE
    best_model = compare_models(sort = 'MAE', verbose=False)

    # append best model to all_results
    # .copy() detaches the one-row slice from the pulled leaderboard so that
    # adding the county column does not write into a slice
    # (SettingWithCopyWarning on every iteration).
    p = pull().iloc[0:1].copy()
    p['county_state'] = str(county)
    all_results.append(p)

    # fit the best model on the entire county data
    f = finalize_model(best_model)

    # add to final model
    final_model[county] = f

    # save model as pickle file
    save_model(f, model_name='trained_models/' + str(county), verbose=False)

In [107]:
# Stack the per-county leaderboard rows into one summary table
df_best_model = pd.concat(all_results, axis='index')

In [108]:
# Display the best model and its cross-validation metrics for each county
df_best_model

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),county_state
rf,Random Forest Regressor,3.6409,20.7104,4.5442,0.8960,0.0581,0.0469,0.2267,Morgan County-AL
gbr,Gradient Boosting Regressor,1.8069,5.9234,2.3252,0.9672,0.0246,0.0196,0.0400,Tulsa County-OK
lr,Linear Regression,18.9813,672.7615,25.2427,0.7624,0.0942,0.0747,0.0067,Hood River County-OR
omp,Orthogonal Matching Pursuit,27.3163,1564.9580,37.4556,0.0231,0.4271,0.4008,0.0167,Lamar County-MS
et,Extra Trees Regressor,3.3706,73.4086,6.1989,0.9254,0.0497,0.0296,0.1667,Pasco County-FL
...,...,...,...,...,...,...,...,...,...
ada,AdaBoost Regressor,41.0765,2380.7367,41.0765,,0.3824,0.2998,0.0633,Unicoi County-TN
et,Extra Trees Regressor,19.3259,600.3481,19.3259,,0.4292,0.5966,0.4633,Chester County-TN
et,Extra Trees Regressor,18.0819,511.2798,18.0819,,0.4397,0.3542,0.4133,Lanier County-GA
ada,AdaBoost Regressor,4.7259,43.4967,4.7259,,0.0585,0.0563,0.2300,Wayne County-MO


In [110]:
# Persist the summary table; the index is not written to the file
df_best_model.to_csv(path_or_buf="best_model.csv", index=False)

In [88]:
# Month-end dates for the forecast horizon. The end bound 2024-01-01 is not a
# month end, so the range runs from 2022-01-31 through 2023-12-31 (24 dates).
# An explicit MonthEnd offset is used instead of the 'M' string alias, which
# is deprecated in recent pandas releases; the generated dates are identical.
future_dates = pd.date_range(start='2022-01-01', end='2024-01-01',
                             freq=pd.offsets.MonthEnd())
len(future_dates)

In [90]:
# Feature frame for the forecast horizon: one row per future date carrying
# the same 'month' and 'year' features the models were trained on.
# Built directly from future_dates and the vectorized .dt accessor instead of
# column-by-column assignment with Python-level loops.
df_future = pd.DataFrame({'date': future_dates})
df_future['month'] = df_future['date'].dt.month
df_future['year'] = df_future['date'].dt.year


In [92]:
# Collect one forecast frame per county into df_predict
df_predict = []

# For each county, load its saved model (if one was written to disk) and
# score the future feature frame with it.
for county in counties:
    model_path = 'trained_models/' + str(county)

    # counties without a pickled model are skipped
    if not os.path.isfile(model_path + '.pkl'):
        continue

    county_model = load_model(model_path, verbose=False)
    county_forecast = predict_model(county_model, data=df_future)
    county_forecast['time_series'] = county  # tag rows with their county key
    df_predict.append(county_forecast)

In [96]:
# Stack the per-county forecast frames into one long table
df_forecast = pd.concat(df_predict, axis='index')

In [97]:
# (rows, columns) of the combined forecast table
df_forecast.shape

(43608, 5)

In [98]:
# Preview the first rows of the combined forecast table
df_forecast.head()

Unnamed: 0,date,month,year,Label,time_series
0,2022-01-31,1,2022,100.563884,Morgan County-AL
1,2022-02-28,2,2022,98.752071,Morgan County-AL
2,2022-03-31,3,2022,99.923069,Morgan County-AL
3,2022-04-30,4,2022,104.868045,Morgan County-AL
4,2022-05-31,5,2022,108.512793,Morgan County-AL


In [99]:
# Split the "<county>-<state_code>" key back into separate columns.
# rsplit() cuts at the LAST hyphen so that county names which themselves
# contain a hyphen stay intact; splitting at the first hyphen would put part
# of the county name into the state column. (Assumes the state code itself
# contains no hyphen.)
# `n` is passed by keyword: newer pandas releases reject it as a positional
# argument.
df_forecast[['county', 'state']] = df_forecast['time_series'].str.rsplit('-', n=1, expand=True)
df_forecast = df_forecast.drop(columns=['time_series'])

# PyCaret's prediction column 'Label' holds the forecast target
df_forecast = df_forecast.rename(columns={"Label": "median_ppsf"})

df_forecast.head()

Unnamed: 0,date,month,year,median_ppsf,county,state
0,2022-01-31,1,2022,100.563884,Morgan County,AL
1,2022-02-28,2,2022,98.752071,Morgan County,AL
2,2022-03-31,3,2022,99.923069,Morgan County,AL
3,2022-04-30,4,2022,104.868045,Morgan County,AL
4,2022-05-31,5,2022,108.512793,Morgan County,AL


In [105]:
# Write the final forecast table to disk without the index
df_forecast.to_csv(path_or_buf="ppsf_forecast.csv", index=False)