In [14]:
import statsmodels.api as sm
import numpy as np
import pandas as pd

In [15]:
# Load the two series from CSV files in the working directory.
cpi_df = pd.read_csv('CPIAUCSL.csv')
unrate_df = pd.read_csv('UNRATE.csv')

# Parse the shared DATE column in both frames so the join below matches on
# real timestamps rather than on raw strings.
for frame in (cpi_df, unrate_df):
    frame['DATE'] = pd.to_datetime(frame['DATE'])

# Keep only dates present in both series, add CPI_RATE as the row-over-row
# percent change of CPIAUCSL (scaled to percent), then drop every row that has
# a missing value -- this includes the first row, whose percent change is
# undefined.
merged_df = (
    cpi_df.merge(unrate_df, on='DATE', how='inner')
    .assign(CPI_RATE=lambda joined: joined['CPIAUCSL'].pct_change() * 100)
    .dropna()
)

In [16]:
def identify_outliers_zscore(df, column, threshold=3):
    """Return the index labels of rows whose z-score in `column` is extreme.

    The z-score is computed against the column's own mean and standard
    deviation (pandas defaults). A row is flagged when the absolute z-score
    is strictly greater than `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to screen.
    column : str
        Name of the column to screen.
    threshold : float, default 3
        Absolute z-score above which a row counts as an outlier.

    Returns
    -------
    pandas.Index
        Index labels of the flagged rows (empty when none qualify).
    """
    values = df[column]
    z_scores = (values - values.mean()) / values.std()
    # NaN z-scores compare as False, so such rows are never flagged.
    is_outlier = z_scores.abs() > threshold
    return df[is_outlier].index

# Locate extreme observations in each series using the default threshold.
cpi_outliers = identify_outliers_zscore(merged_df, 'CPI_RATE')
unrate_outliers = identify_outliers_zscore(merged_df, 'UNRATE')

# For each series: blank out the flagged rows, then fill the gaps with
# pandas' default interpolation. The two columns are treated independently,
# so they can be handled one after the other.
for column, outlier_index in (('CPI_RATE', cpi_outliers), ('UNRATE', unrate_outliers)):
    merged_df.loc[outlier_index, column] = np.nan
    merged_df[column] = merged_df[column].interpolate()

In [17]:
# Regression frame: Y is CPI_RATE, X is UNRATE, and X_lag1 / X_lag2 are X
# shifted down by one and two rows. Rows left incomplete by the shifts (or by
# any remaining missing values) are dropped.
data = (
    merged_df[['CPI_RATE', 'UNRATE']]
    .rename(columns={'CPI_RATE': 'Y', 'UNRATE': 'X'})
    .assign(
        X_lag1=lambda frame: frame['X'].shift(1),
        X_lag2=lambda frame: frame['X'].shift(2),
    )
    .dropna()
)

In [18]:
# First-stage OLS of Y on a constant plus the current and lagged X columns.
initial_exog = sm.add_constant(data[['X', 'X_lag1', 'X_lag2']])
initial_model = sm.OLS(data['Y'], initial_exog).fit()

# Lag-1 autocorrelation of the first-stage residuals, used as the AR(1)
# coefficient estimate for the transformation that follows.
rho = initial_model.resid.autocorr(lag=1)

In [19]:
# Cochrane-Orcutt quasi-differencing with the estimated rho:
# each *_adj column holds x_t - rho * x_{t-1}. The first row has no
# predecessor, so its adjusted values are NaN.
adjusted_names = {
    'Y': 'Y_adj',
    'X': 'X_adj',
    'X_lag1': 'X_lag1_adj',
    'X_lag2': 'X_lag2_adj',
}
for source_col, adjusted_col in adjusted_names.items():
    data[adjusted_col] = data[source_col] - rho * data[source_col].shift(1)

In [20]:
# Drop the row(s) whose quasi-differenced values are missing.
transformed_data = data.dropna()

# Second-stage OLS on the quasi-differenced variables.
# NOTE(review): the constant is fitted on the transformed scale; under the
# AR(1) error model it corresponds to the original intercept times (1 - rho).
adjusted_response = transformed_data['Y_adj']
adjusted_exog = sm.add_constant(
    transformed_data[['X_adj', 'X_lag1_adj', 'X_lag2_adj']]
)
gls_model = sm.OLS(adjusted_response, adjusted_exog).fit()

print(gls_model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Y_adj   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.239
Date:                Sat, 02 Nov 2024   Prob (F-statistic):             0.0823
Time:                        02:16:00   Log-Likelihood:                 55.870
No. Observations:                 915   AIC:                            -103.7
Df Residuals:                     911   BIC:                            -84.46
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0798      0.026      3.012      0.0