In [39]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from urllib.request import urlopen
import json
import plotly.express as px
from IPython.display import IFrame

# DATA

Data Dictionary: 
* https://www2.census.gov/programs-surveys/cbp/technical-documentation/records-layouts/2017_record_layouts/zip_code_industry_detail_record_layout_2017.txt

NAICS Code Meanings:
* https://www.naics.com/six-digit-naics/

In [2]:
file_path = '../../src/data/temp/processed_zbp_data.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,index,zip,naics,est,n1_4,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,year
0,0,91901,23,88,68,7,6,5,1,0,1,0,0,2012
1,1,91901,31,2,1,1,0,0,0,0,0,0,0,2012
2,2,91901,42,9,9,0,0,0,0,0,0,0,0,2012
3,3,91901,44,67,25,21,16,4,1,0,0,0,0,2012
4,4,91901,48,8,8,0,0,0,0,0,0,0,0,2012


# MODEL

Lit Review Variables: Firm Age, **Business Size**, **Industry**, **Location**, Legal

Goal: Forecast ESTAB for Zipcodes given **Business Size**, **Industry**, **Location** and past **ESTAB**

Next-Steps:
* Research how to apply actual timeseries models, LSTM?

Notes: 
* Assumes Monotonically increasing growth
* Time Series Predictions: https://www.advancinganalytics.co.uk/blog/2021/06/22/10-incredibly-useful-time-series-forecasting-algorithms
    * Training one lin-reg for each zipcode-size-ind combination may suffice for a baseline?
* Since data at county level is much more prevelant, may be better to look at distribution of jobs across all of california rather than just SD County. (may be skewed by large counties such as San Bernardino?)

## BASELINE - LINREG ENSEMBLE

break df by zip/naics (and n's)

train one linreg for each combo, using years as X and est as Y

In [3]:
class ForecastModel():
    
    def __init__(self, data, year_to_forecast):
        self.data = data[data['year'] < year_to_forecast]
        self.year_to_forecast = year_to_forecast
        self.ensemble = None
        self.generated_forecasts = {}
        
    def train(self):
        
        self.zip_codes = data['zip'].unique()
        self.naics_codes = data['naics'].unique()
        
        reg_ensemble = {}

        for zc in self.zip_codes:
            reg_ensemble[zc] = {}
            for nc in self.naics_codes:
                # filter data
                temp = data[data['zip']==zc]
                temp = temp[temp['naics']==nc]
                # check if data exists
                if temp.shape[0] == 0:
                    continue
                X = temp[['year']]
                y = temp['est']
                lr = LinearRegression().fit(X, y)
                reg_ensemble[zc][nc] = lr
                
        self.ensemble = reg_ensemble
        
    def generate_forecast(self, year=None):
        
        if year is None:
            year = self.year_to_forecast
        
        forecasts = []
        for zc in self.zip_codes:
            for nc in self.naics_codes:
                if (zc in self.ensemble) and (nc in self.ensemble[zc]):
                    lr = self.ensemble[zc][nc]
                    pred = lr.predict(pd.DataFrame([year], columns=['year']))[0]
                    forecasts += [[zc, nc, pred]]
                    
        self.generated_forecasts[year] = pd.DataFrame(np.array(forecasts), columns=['zip', 'naics', 'est']).astype({'zip': 'int64', 'naics': 'int64'})
        
        def plot_forecasts(data, year):
            df = data.groupby('zip')['est'].sum().reset_index()[['zip','est']]
            with urlopen('https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/ca_california_zip_codes_geo.min.json') as response:
                zipcodes = json.load(response)
            fig = px.choropleth(df, 
                                geojson=zipcodes, 
                                locations='zip', 
                                color='est',
                                color_continuous_scale='blackbody_r',
                                range_color=(df['est'].min(),df['est'].max()),
                                featureidkey="properties.ZCTA5CE10",
                                scope="usa",
                                labels={'Final_Labels':'Cluster_Category'})
            fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
            fig.write_html(f'../../src/data/out/plots/zbp_forecast_plot_{year}.html')
            display(IFrame(src=f'../../src/data/out/plots/zbp_forecast_plot_{year}.html', width=1000, height=1000))
            
        plot_forecasts(self.generated_forecasts[year], year)
        
        return self.generated_forecasts[year]
    

### Run Model

In [15]:
model = ForecastModel(data, 2021)
model.train()
results = model.generate_forecast()
results.head()

Unnamed: 0,zip,naics,est
0,91901.0,23.0,109.909091
1,91901.0,31.0,5.636364
2,91901.0,42.0,2.055556
3,91901.0,44.0,60.436364
4,91901.0,48.0,13.527273


### Evaluate Model (MSE)

In [41]:
year_to_eval = 2021

preds = model.generated_forecasts[year_to_eval].groupby(['zip', 'naics']).max()
trues = data[data['year']==year_to_eval][['zip', 'naics', 'est']].groupby(['zip', 'naics']).max()
res = trues.merge(preds, how='inner', left_index=True, right_index=True, suffixes=('_trues', '_preds'))

mean_squared_error(res['est_trues'].reset_index(drop=True), res['est_preds'].reset_index(drop=True))

14.430784873430047