In [4]:
import sys
sys.path.insert(0, '../../../src/helper')
import zbp_visualizer

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Data

In [7]:
# Read the prepared zbp totals-with-features CSV into a DataFrame.
file_path = '../../../src/data/temp/zbp_totals_with_features.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick check of the loaded columns.
data.head(n=5)

Unnamed: 0,zip,emp_nf,emp,qp1_nf,qp1,ap_nf,ap,est,year,naics_11_pct,...,n50_99_pct,n100_249_pct,n250_499_pct,n500_999_pct,n1000_pct,median_hh_income,total_population_x,total_population_y,total_retirement,total_midcareer
0,91901,H,4141,H,36304,H,174786,391,2012,0.0,...,0.01023,0.005115,0.002558,0.0,0.002558,76496,17034.0,17034.0,2691.0,3452.0
1,91902,G,2265,G,19111,G,81569,349,2012,0.002865,...,0.017192,0.002865,0.0,0.0,0.0,86099,17659.0,17659.0,3485.0,4031.0
2,91905,G,19,S,0,H,748,9,2012,0.0,...,0.0,0.0,0.0,0.0,0.0,70000,1088.0,1088.0,101.0,225.0
3,91906,D,0,D,0,D,0,27,2012,0.0,...,0.0,0.0,0.037037,0.0,0.0,54135,3679.0,3679.0,395.0,836.0
4,91910,G,19799,G,200767,G,805325,1458,2012,0.000686,...,0.021262,0.013032,0.002743,0.0,0.001372,55875,73761.0,73761.0,9195.0,19426.0


# Train-test Split

In [8]:
# Split by year: rows with year <= end_year are used for training, rows with
# a later year are held out for testing.
end_year = 2020
data_train = data.loc[data['year'].le(end_year)]
data_test = data.loc[data['year'].gt(end_year)]

# Feature columns fed to the model, in order: the zip/year identifiers, the
# `naics_*_pct` columns, then the `n*_pct` columns.
included_feats = (
    ['zip', 'year']
    + [
        'naics_11_pct', 'naics_21_pct', 'naics_22_pct', 'naics_23_pct',
        'naics_31_pct', 'naics_42_pct', 'naics_44_pct', 'naics_48_pct',
        'naics_51_pct', 'naics_52_pct', 'naics_53_pct', 'naics_54_pct',
        'naics_55_pct', 'naics_56_pct', 'naics_61_pct', 'naics_62_pct',
        'naics_71_pct', 'naics_72_pct', 'naics_81_pct', 'naics_99_pct',
    ]
    + [
        'n1_4_pct', 'n5_9_pct', 'n10_19_pct', 'n20_49_pct', 'n50_99_pct',
        'n100_249_pct', 'n250_499_pct', 'n500_999_pct', 'n1000_pct',
    ]
)

# The regression target is the `est` column.
X_train, y_train = data_train.loc[:, included_feats], data_train.loc[:, 'est']
X_test, y_test = data_test.loc[:, included_feats], data_test.loc[:, 'est']

# Run

In [9]:
# One-hot encode the `zip` column and pass every other feature column through
# unchanged. With handle_unknown='ignore', a zip value not seen during fit
# does not raise at predict time.
preproc = ColumnTransformer(
    [('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
    remainder='passthrough',
)

# Preprocessing followed by an ordinary least-squares linear regression.
pl = Pipeline(steps=[('preproc', preproc), ('lr', LinearRegression())])
pl.fit(X_train, y_train)

# Test-set RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
np.sqrt(mean_squared_error(y_test, pl.predict(X_test)))

64.3841117106912

# Visualize Predictions

In [11]:
# NOTE: The visualization code below does not work when run from this notebook;
# it only works when invoked from run.py. The visualizer method must be adjusted
# to accept an output path before it can be used here.
 
# preds = X_test.copy()
# preds['est_pred'] = pl.predict(X_test)
# last_year = preds['year'].max()
# preds_last_year = preds[preds['year'] == last_year][['zip','est_pred']]
# zbp_visualizer.generate_zbp_chloropleth(preds_last_year, 'zip', 'est_pred', f'rf_reg_{last_year}_preds')