In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

from urllib.request import urlopen
import json
import plotly.express as px
from IPython.display import IFrame

# DATA

Data Dictionary: 
* https://www2.census.gov/programs-surveys/cbp/technical-documentation/records-layouts/2017_record_layouts/zip_code_industry_detail_record_layout_2017.txt

NAICS Code Meanings:
* https://www.naics.com/six-digit-naics/

In [None]:
# Location of the processed ZBP extract, relative to this notebook.
file_path = '../../src/data/temp/processed_zbp_data.csv'

# Read the whole CSV into memory and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

# MODEL

Lit Review Variables: Firm Age, **Business Size**, **Industry**, **Location**, Legal

Goal: Forecast ESTAB for Zipcodes given **Business Size**, **Industry**, **Location** and past **ESTAB**

Steps:
* Rebuild ingestion pipeline to accept multi-year datasets
* Build heatmap of establishments by zipcode
    * https://medium.com/@mm.fuenteslopez/using-plotly-express-to-make-zip-code-level-choropleth-maps-a8ac8212b7ed
* Build baseline model
* Research how to apply actual timeseries models, LSTM?

Notes: 
* Assumes Monotonically increasing growth
* Time Series Predictions: https://www.advancinganalytics.co.uk/blog/2021/06/22/10-incredibly-useful-time-series-forecasting-algorithms
    * Training one lin-reg for each zipcode-size-ind combination may suffice for a baseline?
* Since data at county level is much more prevalent, may be better to look at distribution of jobs across all of California rather than just SD County. (may be skewed by large counties such as San Bernardino?)

## BASELINE - LINREG ENSEMBLE

break df by zip/naics (and n's)

Train one LinearRegression for each zip/NAICS combination, using `year` as X and `est` as y.

In [None]:
# Distinct ZIP codes and NAICS codes present in the data, in order of first appearance.
zip_codes, naics_codes = (data[column].unique() for column in ('zip', 'naics'))

In [None]:
%time

# train models

reg_ensemble = {}

for zc in zip_codes:
    reg_ensemble[zc] = {}
    for nc in naics_codes:
        # filter data
        temp = data[data['zip']==zc]
        temp = temp[temp['naics']==nc]
        # check if data exists
        if temp.shape[0] == 0:
            continue
        X = temp[['year']]
        y = temp['est']
        lr = LinearRegression().fit(X, y)
        reg_ensemble[zc][nc] = lr

In [None]:
# compute predict

def forecast_year(year, ensemble=None):
    """Predict ``est`` for ``year`` with every fitted zip/naics model.

    Parameters
    ----------
    year : int
        Year to forecast; passed to each model as a one-row frame with a
        single ``year`` column (the feature name the models were fitted on).
    ensemble : dict, optional
        Nested mapping ``{zip: {naics: fitted_model}}``. Defaults to the
        notebook-level ``reg_ensemble``.

    Returns
    -------
    list of [zip, naics, prediction]
        One row per fitted model, in the ensemble's insertion order.
    """
    if ensemble is None:
        ensemble = reg_ensemble

    # The model input is identical for every model, so build it once.
    X_year = pd.DataFrame([year], columns=['year'])

    forecasts = []
    # Walk only the pairs that actually have a model instead of probing the
    # full zip x naics cross product for membership.
    for zc, models in ensemble.items():
        for nc, lr in models.items():
            forecasts.append([zc, nc, lr.predict(X_year)[0]])
    return forecasts

In [None]:
# Build the frame directly from the [zip, naics, est] rows. Passing the rows
# through np.array first would force all three columns into a single dtype
# (strings if naics is text, floats otherwise), corrupting est or zip.
forecast_2050 = pd.DataFrame(forecast_year(2050), columns=['zip', 'naics', 'est'])
forecast_2050.head()

In [None]:
def plot_forecasts(data, year):
    """Draw, save and display a ZIP-code choropleth of summed ``est`` values.

    Sums ``est`` per ZIP code, renders a plotly choropleth against the
    California ZIP-code GeoJSON (downloaded on every call), writes the figure
    to ``../../src/data/out/plots/zbp_forecast_plot_{year}.html`` and embeds
    that file in the notebook through an IFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``zip`` and ``est`` columns. Both are coerced to numeric;
        rows whose ``zip`` cannot be parsed are dropped.
    year : int
        Only used to name the output HTML file.
    """
    import os  # local import: not part of the notebook's import cell

    geojson_url = 'https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/ca_california_zip_codes_geo.min.json'
    out_path = f'../../src/data/out/plots/zbp_forecast_plot_{year}.html'

    # Normalise dtypes before aggregating: a string `est` would be concatenated
    # rather than summed, and a float zip (92101.0) would not render as "92101".
    df = pd.DataFrame({
        'zip': pd.to_numeric(data['zip'], errors='coerce'),
        'est': pd.to_numeric(data['est'], errors='coerce'),
    }).dropna(subset=['zip'])
    # Presumably the GeoJSON's ZCTA5CE10 ids are 5-character strings, so match
    # that form -- TODO confirm against the downloaded file.
    df['zip'] = df['zip'].astype(int).astype(str).str.zfill(5)
    df = df.groupby('zip')['est'].sum().reset_index()[['zip', 'est']]

    with urlopen(geojson_url) as response:
        zipcodes = json.load(response)

    fig = px.choropleth(df,
                        geojson=zipcodes,
                        locations='zip',
                        color='est',
                        color_continuous_scale='blackbody_r',
                        range_color=(df['est'].min(), df['est'].max()),
                        featureidkey="properties.ZCTA5CE10",
                        scope="usa",
                        # Label the columns that exist in df; the previous
                        # 'Final_Labels' key matched nothing.
                        labels={'est': 'Establishments', 'zip': 'ZIP code'})
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

    # write_html does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.write_html(out_path)

    display(IFrame(src=out_path, width=1000, height=1000))

In [None]:
# Map the 2050 forecast; the year only labels the saved HTML file.
plot_forecasts(data=forecast_2050, year=2050)