In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Relative paths of the two pre-processed CSV inputs.
zbp_features_csv = '../../src/data/temp/zbp_totals_with_features.csv'
unemployment_csv = '../../src/data/temp/processed_unemployment_data.csv'

# zbp totals with features -> `data`, the frame the cells below work on.
data = pd.read_csv(zbp_features_csv)

# Processed unemployment data, kept in its own frame.
unemployment_data = pd.read_csv(unemployment_csv)

# `file_path` names the most recently loaded file.
file_path = unemployment_csv

## Correlation Matrix

In [None]:
# Keep only the numeric columns of `data`; these are the inputs to corr().
selected_df = data.select_dtypes(include=['number'])
numeric_columns = selected_df.columns

# Pairwise correlation between the numeric columns (pandas default method).
# NOTE(review): this uses every row of `data`, not only the training years
# selected further down — confirm that is acceptable for feature selection.
correlation_matrix = selected_df.corr()

In [None]:
# Preview the first five rows of the correlation matrix.
correlation_matrix.head(n=5)

In [None]:
# Rank every numeric column by its correlation with the target column 'est',
# largest (most positive) value first.
correlation_with_est = correlation_matrix['est'].sort_values(ascending=False)

# Remove the target's self-correlation by label rather than assuming it sorts
# first: a column tied with 'est' at 1.0 could be ordered ahead of it, in which
# case slicing off position 0 would discard that column and keep 'est' itself.
# NOTE(review): the ranking is by signed value, so strongly negative
# correlations are never selected — confirm that is intended.
top_5_features = correlation_with_est.drop('est').head(5)

print(top_5_features)

## Using the Features Used in rf_reg_model

In [None]:
# Split on the 'year' column: rows up to and including `end_year` form the
# training set, strictly later rows form the test set.
end_year = 2020
data_train = data.loc[data['year'] <= end_year]
data_test = data.loc[data['year'] > end_year]

# Predictor columns for this experiment (see the section heading above).
included_feats = [
    # zip / year columns
    'zip', 'year',
    # naics_*_pct columns
    'naics_11_pct', 'naics_21_pct', 'naics_22_pct', 'naics_23_pct',
    'naics_31_pct', 'naics_42_pct', 'naics_44_pct', 'naics_48_pct',
    'naics_51_pct', 'naics_52_pct', 'naics_53_pct', 'naics_54_pct',
    'naics_55_pct', 'naics_56_pct', 'naics_61_pct', 'naics_62_pct',
    'naics_71_pct', 'naics_72_pct', 'naics_81_pct', 'naics_99_pct',
    # n*_pct columns
    'n1_4_pct', 'n5_9_pct', 'n10_19_pct', 'n20_49_pct', 'n50_99_pct',
    'n100_249_pct', 'n250_499_pct', 'n500_999_pct', 'n1000_pct',
]

# Predictor frame and 'est' target for each split.
X_train, y_train = data_train[included_feats], data_train['est']
X_test, y_test = data_test[included_feats], data_test['est']

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator.
model = LinearRegression().fit(X_train, y_train)

# Predict on the test split and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

## Using the Top 5 Features From Correlation Matrix

In [None]:
# Five hard-coded predictors.
# NOTE(review): presumably copied from the printed `top_5_features` output;
# re-check this list whenever the underlying data changes.
included_feats = ['emp', 'ap', 'qp1', 'naics_51_pct', 'n10_19_pct']

# Rebuild predictors and the 'est' target for both splits with this subset.
X_train, X_test = data_train[included_feats], data_test[included_feats]
y_train, y_test = data_train['est'], data_test['est']

# Fit on the training split, then score on the test split.
model = LinearRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

## Using the Top 10 Features

In [None]:
# Rank every numeric column by its correlation with the target column 'est',
# largest (most positive) value first.
correlation_with_est = correlation_matrix['est'].sort_values(ascending=False)

# Drop the target's self-correlation by label instead of slicing off the first
# position: if another column ties with 'est' at 1.0 it may sort ahead of it,
# and a positional slice would then keep 'est' in the feature list.
# NOTE(review): ranking is by signed value, so strongly negative correlations
# are never selected — confirm that is intended.
top_10_features = correlation_with_est.drop('est').head(10)

print(top_10_features)

In [None]:
# Ten hard-coded predictors.
# NOTE(review): presumably copied from the printed `top_10_features` output;
# re-check this list whenever the underlying data changes.
included_feats = [
    'emp', 'ap', 'qp1', 'naics_51_pct', 'n10_19_pct',
    'n5_9_pct', 'naics_31_pct', 'n20_49_pct', 'naics_52_pct', 'naics_62_pct',
]

# Predictors and 'est' target for the training split, then the test split.
X_train, y_train = data_train[included_feats], data_train['est']
X_test, y_test = data_test[included_feats], data_test['est']

# Fit on the training split; fit() returns the fitted estimator.
model = LinearRegression().fit(X_train, y_train)

# Score the predictions for the test split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')