In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import statsmodels.api as sm

In [None]:
# Load the lagged ZBP totals table together with its feature columns.
file_path = '../../../src/data/temp/lagged_zbp_totals_with_features.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

## Correlation Matrix

In [None]:
# Restrict the frame to its numeric columns before correlating.
selected_df = data.select_dtypes(include=['number'])
numeric_columns = selected_df.columns

# Pairwise correlation between every pair of numeric columns.
correlation_matrix = selected_df.corr()

In [None]:
# Preview the first five rows of the correlation matrix.
correlation_matrix.head(n=5)

In [None]:
# Rank every numeric column by its correlation with the target 'est',
# largest (most positive) correlation first.
correlation_with_est = correlation_matrix['est'].sort_values(ascending=False)

# Remove the target by label instead of slicing off the first row: if any
# other column ties with 'est' at a correlation of 1.0, the sort does not
# guarantee that 'est' comes first, and a positional slice would leave the
# target itself in the feature list.
top_5_features = correlation_with_est.drop('est').head(5)

print(top_5_features)

# DROP NON-NUMERICAL

In [None]:
# Every column that is not stored as int64 or float64 is removed from `data`.
non_numerical_cols = data.select_dtypes(exclude=['int64', 'float64']).columns
data = data.drop(labels=non_numerical_cols, axis=1)
data.head(n=1)

# TRAIN-TEST SPLIT

In [None]:
# Rows up to and including `end_year` are used for training; later rows are
# held out for testing.
end_year = 2020
data_train = data.loc[data['year'].le(end_year)]
data_test = data.loc[data['year'].gt(end_year)]

# STANDARDIZATION

In [None]:
# Per-column statistics computed on the training rows only.
train_mean = data_train.mean()
train_std = data_train.std()

# Force an identity transform for 'zip' (subtract 0, divide by 1) so its
# values are left unchanged by the standardization below.
train_mean.loc['zip'] = 0
train_std.loc['zip'] = 1

In [None]:
# Standardize the training rows with the training statistics.
data_train = data_train.sub(train_mean).div(train_std)
data_train.head(n=1)

In [None]:
# Standardize the test rows with the *training* statistics.
data_test = data_test.sub(train_mean).div(train_std)
data_test.head(n=1)

# MODEL

In [None]:
# One-hot encode 'zip' (categories unseen at fit time are ignored at predict
# time) and pass every other column through untouched.
zip_encoder = OneHotEncoder(handle_unknown='ignore')
preproc = ColumnTransformer(
    transformers=[('onehots', zip_encoder, ['zip'])],
    remainder='passthrough',
)

# Preprocessing followed by an ordinary least-squares regressor.
regressor = LinearRegression(n_jobs=-1)
pl = Pipeline(steps=[('preproc', preproc), ('lr', regressor)])

# TESTING

In [None]:
def unstandardize_series(ser, mean, std):
    """Invert a standardization: scale ``ser`` by ``std``, then add ``mean``."""
    rescaled = ser * std
    return rescaled + mean

In [None]:
def fit_eval(model, data_train, data_test, included_feats):
    """Fit ``model`` on the training frame and report RMSE in original units.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit(X, y)`` and ``predict(X)``
        The object passed in is the one that is fitted and returned.
    data_train, data_test : pandas.DataFrame
        Frames containing the feature columns and the target column ``'est'``.
    included_feats : list-like of column labels
        Columns of the frames used as model inputs.

    Returns
    -------
    tuple
        ``(model, train_rmse, test_rmse)``: the fitted model and the root mean
        squared error on the training and test frames.

    Notes
    -----
    Targets and predictions are mapped back with the notebook-level
    ``train_mean['est']`` and ``train_std['est']`` before the error is
    computed, so those two series must already be defined.
    """
    target_mean = train_mean['est']
    target_std = train_std['est']

    def rmse_in_original_units(X, y):
        # Undo the target scaling on both the truth and the predictions.
        actual = unstandardize_series(y, target_mean, target_std)
        predicted = unstandardize_series(model.predict(X), target_mean, target_std)
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed
        # in 1.6, so this form works on both old and new versions.
        return np.sqrt(mean_squared_error(actual, predicted))

    X_train, y_train = data_train[included_feats], data_train['est']
    X_test, y_test = data_test[included_feats], data_test['est']

    # Fit the estimator that was passed in, not a notebook-level global.
    model.fit(X_train, y_train)

    train_rmse = rmse_in_original_units(X_train, y_train)
    test_rmse = rmse_in_original_units(X_test, y_test)

    return model, train_rmse, test_rmse

# Using All Features

In [None]:
# Every remaining column except the target 'est' is used as a feature.
all_feats = data.columns.drop(['est'])

pl, train_rmse, test_rmse = fit_eval(pl, data_train, data_test, all_feats)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

## Using the Top 5 Features From Correlation Matrix

In [None]:
# Five columns most positively correlated with 'est'. The target is removed
# by label rather than by slicing off the first row, so it cannot end up in
# the feature list if another column ties with it at a correlation of 1.0.
top_5_features = correlation_with_est.drop('est').head(5)

# 'zip' is appended because the pipeline one-hot encodes that column.
included_feats = top_5_features.index.append(pd.Index(['zip']))

pl, train_rmse, test_rmse = fit_eval(pl, data_train, data_test, included_feats)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

## Using the Top 10 Features

In [None]:
# Ten columns most positively correlated with 'est'. The target is removed
# by label rather than by slicing off the first row, so it cannot end up in
# the feature list if another column ties with it at a correlation of 1.0.
top_10_features = correlation_with_est.drop('est').head(10)

# 'zip' is appended because the pipeline one-hot encodes that column.
included_feats = top_10_features.index.append(pd.Index(['zip']))

pl, train_rmse, test_rmse = fit_eval(pl, data_train, data_test, included_feats)
print('train_rmse: ', train_rmse)
print('test_rmse: ', test_rmse)

# Fixed Effect Model

In [None]:
# The fixed-effects regression is fit on the raw (unstandardized) frame.
# The name `std_data` is kept because the mixed-model cell further down reads
# std_data['zip'].
# NOTE(review): a standardized frame used to be computed on the line before
# this one and was immediately overwritten by `data`, so it never reached the
# model. That dead computation is removed here; if standardized inputs are
# actually wanted, replace this line with (data - train_mean) / train_std and
# re-check the written results.
std_data = data

# Create dummy variables for each ZIP code (first level dropped as baseline)
dummies = pd.get_dummies(std_data['zip'], drop_first=True).astype(int)

# Replace the 'zip' column with its dummy columns
data_panel = pd.concat([std_data.drop(columns=['zip']), dummies], axis=1)

X = data_panel.drop(columns=['est'])
y = data_panel['est']  # Dependent variable predicting establishment growth

# Add a constant (intercept) term to the independent variables
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

print(model.summary())

## Statistically Significant Values

- em with a p-value < 0.001 (reported as 0.000)
- ap with a p-value of 0.006
- n1_4_pct with a p-value < 0.001
- n5_9_pct with a p-value < 0.001
- n10_19_pct with a p-value < 0.001
- n20_49_pct with a p-value < 0.001
- n50_99_pct with a p-value < 0.001

## Interesting Zip Codes

These ZIP code dummies are not statistically significant:

- 91910: p-value 0.744
- 91911: p-value 0.385

This output indicates that the model has a high R-squared value **(0.998)**, suggesting that the independent variables explain a large portion of the variance in the dependent variable.

# Random Effect Model

In [None]:
# Optimizer settings for the mixed-model fit ('nm' selects Nelder-Mead).
optimizer = 'nm'
iterations = 1000

# Design matrix: every column of the panel except the target, plus a constant.
# NOTE(review): data_panel still contains the per-ZIP dummy columns while
# 'zip' is also the grouping variable below; confirm that including both is
# intended.
y = data_panel['est']
X = sm.add_constant(data_panel.drop(columns=['est']))

# Linear mixed model grouped by ZIP code.
model = sm.MixedLM(y, X, groups=std_data['zip'])
mixed_model_fit = model.fit(method=optimizer, maxiter=iterations)

print(mixed_model_fit.summary())

## Statistically Significant Values

- ap with p-value 0.006
- n1_4_pct with p-value < 0.001
- n5_9_pct with p-value < 0.001
- n10_19_pct with p-value < 0.001
- n20_49_pct with p-value < 0.001
- All zipcodes

## Summary 

**Significance of the individual-specific effects**: The p-values for the individual-specific (ZIP code) effects in the fixed effects model are statistically significant, indicating that there is likely unobserved heterogeneity at the individual level that affects the outcome variable. This suggests that there are **individual-specific** characteristics or factors that are important to consider and control for in the analysis.

**Adjusted R-squared**: The adjusted R-squared value for the fixed effects model is higher compared to the random effects model. This suggests that the fixed effects model explains a greater proportion of the variation in the outcome variable.

Overall, the **fixed effects model** appears to be the better fit for this data.