In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

In [3]:
### zbp total with features data
file_path = '../../../src/data/temp/lagged_zbp_totals_with_features.csv'
data = pd.read_csv(file_path)

## unemployment data
file_path = '../../../src/data/temp/processed_unemployment_data.csv'
unemployment_data = pd.read_csv(file_path)

## Correlation Matrix

In [4]:
numeric_columns = data.select_dtypes(include=['number']).columns
selected_df = data[numeric_columns]

correlation_matrix = selected_df.corr()

In [5]:
correlation_matrix.head()

Unnamed: 0.1,Unnamed: 0,zip,year,est,emp,qp1,ap,naics_11_pct,naics_21_pct,naics_22_pct,...,naics_99_pct,n1_4_pct,n5_9_pct,n10_19_pct,n20_49_pct,n50_99_pct,n100_249_pct,n250_499_pct,n500_999_pct,n1000_pct
Unnamed: 0,1.0,0.98413,-0.043554,-0.013928,0.053454,0.089344,0.086641,-0.027926,0.030705,-0.156828,...,0.062316,-0.112986,0.019142,-0.018853,0.06322,0.151037,0.093862,0.118209,0.079476,0.12921
zip,0.98413,1.0,-0.042535,0.028743,0.079081,0.101644,0.099606,-0.012732,0.031806,-0.16493,...,0.057043,-0.114769,0.022615,-0.012807,0.084957,0.15751,0.100023,0.110214,0.080705,0.120724
year,-0.043554,-0.042535,1.0,0.134253,0.102156,0.1048,0.102609,-0.069817,-0.047434,-0.098297,...,-0.045732,-0.146599,-0.165712,-0.12089,-0.113593,-0.148267,-0.094668,-0.097185,-0.083039,-0.101533
est,-0.013928,0.028743,0.134253,1.0,0.884509,0.711063,0.719291,-0.068727,-0.008544,-0.103182,...,-0.029179,-0.123372,0.196537,0.242382,0.215104,0.109025,0.176471,0.02273,0.135591,-0.068795
emp,0.053454,0.079081,0.102156,0.884509,1.0,0.920287,0.926252,-0.068618,-0.007644,-0.064477,...,-0.024947,-0.1695,0.152745,0.214355,0.228909,0.153432,0.229785,0.071001,0.202753,-0.03935


In [6]:
correlation_with_est = correlation_matrix['est'].sort_values(ascending=False)

top_5_features = correlation_with_est.head(6)[1:]  # Excluding 'est' 

print(top_5_features)

emp             0.884509
ap              0.719291
qp1             0.711063
naics_51_pct    0.298991
n10_19_pct      0.242382
Name: est, dtype: float64


## Using the Features Used in rf_reg_model

In [7]:
end_year = 2020
data_train = data[data['year'] <= end_year]
data_test = data[data['year'] > end_year]
included_feats = ['zip', 'year', 'naics_11_pct', 'naics_21_pct', 'naics_22_pct', 'naics_23_pct',
                   'naics_31_pct', 'naics_42_pct', 'naics_44_pct', 'naics_48_pct',
                   'naics_51_pct', 'naics_52_pct', 'naics_53_pct', 'naics_54_pct',
                   'naics_55_pct', 'naics_56_pct', 'naics_61_pct', 'naics_62_pct',
                   'naics_71_pct', 'naics_72_pct', 'naics_81_pct', 'naics_99_pct',
                   'n1_4_pct', 'n5_9_pct', 'n10_19_pct', 'n20_49_pct', 'n50_99_pct',
                   'n100_249_pct', 'n250_499_pct', 'n500_999_pct', 'n1000_pct']
X_train = data_train[included_feats]
y_train = data_train['est']
X_test = data_test[included_feats]
y_test = data_test['est']

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 352989.523452053


## Using the Top 5 Features From Correlation Matrix

In [9]:
included_feats = ['emp', 'ap', 'qp1', 'naics_51_pct', 'n10_19_pct']
X_train = data_train[included_feats]
y_train = data_train['est']
X_test = data_test[included_feats]
y_test = data_test['est']

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 99718.6215452039


## Using the Top 10 Features

In [10]:
correlation_with_est = correlation_matrix['est'].sort_values(ascending=False)

top_10_features = correlation_with_est.head(11)[1:]

print(top_10_features)

emp             0.884509
ap              0.719291
qp1             0.711063
naics_51_pct    0.298991
n10_19_pct      0.242382
n20_49_pct      0.215104
naics_31_pct    0.206088
n5_9_pct        0.196537
n100_249_pct    0.176471
naics_52_pct    0.172196
Name: est, dtype: float64


In [11]:
included_feats = ['emp', 'ap', 'qp1', 'naics_51_pct', 'n10_19_pct', 'n5_9_pct', 'naics_31_pct', 'n20_49_pct', 'naics_52_pct','naics_62_pct']
X_train = data_train[included_feats]
y_train = data_train['est']
X_test = data_test[included_feats]
y_test = data_test['est']

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 95162.83418524696


# Fixed Effect Model

In [14]:
# Create dummy variables for each ZIP code
dummies = pd.get_dummies(data['zip'], drop_first=True).astype(int)

# Concatenate the dummy variables with the original data
data_panel = pd.concat([data.drop(columns=['zip']), dummies], axis=1)

X = data_panel.drop(columns=['est', 'emp_nf', 'qp1_nf', 'ap_nf']) 
y = data_panel['est']  # Dependent variable predicting establishment growth

# Constant term to the independent variables
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    est   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                     3061.
Date:                Wed, 07 Feb 2024   Prob (F-statistic):               0.00
Time:                        13:59:30   Log-Likelihood:                -6531.5
No. Observations:                1346   AIC:                         1.348e+04
Df Residuals:                    1138   BIC:                         1.456e+04
Df Model:                         207                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         1.031e+04   7466.658      1.380   

## Statistically Significant Values

- em with a p-value of 0.0000
- ap with a p-value of 0.006
- n1_4_pct with a p-value < 0.000
- n5_9_pct with a p-value < 0.000
- n10_19_pct with a p-value < 0.000
- n20_49_pct with a p-value < 0.000
- n50_99_pct with a p-value < 0.000

## Interesting Zip Code

### - 91910 p-value (0.744)
### - 91911 p-value (0.385)

This output indicates that the model has a high R-squared value **(0.998)**, suggesting that the independent variables explain a large portion of the variance in the dependent variable.

# Random Effect Model

In [16]:
X = data_panel.drop(columns=['est', 'emp_nf', 'qp1_nf', 'ap_nf']) 
y = data_panel['est']  

X = sm.add_constant(X)

model = sm.regression.mixed_linear_model.MixedLM(y, X, groups=data['zip'])

# Specify the optimizer (e.g., 'nm' for Nelder-Mead)
optimizer = 'nm'

iterations = 1000

mixed_model_fit = model.fit(method=optimizer, maxiter=iterations)

print(mixed_model_fit.summary())

                     Mixed Linear Model Regression Results
Model:                     MixedLM        Dependent Variable:        est       
No. Observations:          1346           Method:                    REML      
No. Groups:                175            Scale:                     1136.6659 
Min. group size:           1              Log-Likelihood:            -5812.0706
Max. group size:           9              Converged:                 Yes       
Mean group size:           7.7                                                 
-------------------------------------------------------------------------------
               Coef.       Std.Err.     z    P>|z|     [0.025         0.975]   
-------------------------------------------------------------------------------
const        -41285.423 142617788.021 -0.000 1.000 -279567013.500 279484442.654
Unnamed: 0       17.414         3.945  4.414 0.000          9.681        25.147
year             -5.235         3.896 -1.344 0.179        -12

## Statistically Significant Values

- ap with p-value 0.006
- n1_4_pct with p-value 0.000
- n5_9_pct with p-value 0.000
- n10_19_pct with p-value 0.000
- n20_49_pct with p-value 0.000
- All zipcodes

## Summary 

**Significance of the individual-specific effects**: The p-values for the individual-specific effects in the fixed effects model are statistically significant, indicating that there is likely unobserved heterogeneity at the individual level that affects the outcome variable. Suggesting that there are **individual-specific** characteristics or factors that are important to consider and control for in your analysis.

**Adjusted R-squared**: The adjusted R-squared value for the fixed effects model is higher compared to the random effects model. This suggests that the fixed effects model explains a greater proportion of the variation in the outcome variable.

Overall **fixed effect model** seems better for the data