In [75]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

In [76]:
import warnings
warnings.filterwarnings('ignore')

In [77]:
USE_PCA = True

In [78]:
if USE_PCA:
    pca_prefix = 'pca_'
    pca_sufix = '_pca'
else:
    pca_prefix = ''
    pca_sufix = ''

df = pd.read_csv(f'datasets/train_set{pca_sufix}.csv')
test_df = pd.read_csv(f'datasets/test_set{pca_sufix}.csv')

if USE_PCA:
    X = df.loc[:, 'pca_0':]
else:
    X = df.loc[:, 'month':]
    
y = df['cpi_pct']
X_shuffled = df.sample(frac=1).loc[:, X.columns]
X_test = test_df.loc[:, X.columns]

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [83]:
model = LinearRegression()

model.fit(X, y)

pred = model.predict(X_test)

pd.DataFrame(pred, index=X_test.index, columns=['pred']).to_csv(f'predictions/{pca_prefix}ols.csv') 

In [84]:
model = Lasso(max_iter=3000)

params = {
    'alpha': np.logspace(-8, -0.5, 100).tolist()
}

cv = GridSearchCV(model, params, cv=10)
cv.fit(X, y)

print(cv.best_estimator_)

pred = cv.predict(X_test)

pd.DataFrame(pred, index=X_test.index, columns=['pred']).to_csv(f'predictions/{pca_prefix}lasso_cv.csv')

Lasso(alpha=0.0001747528400007683, max_iter=3000)


In [85]:
model = Ridge()

params = {
    'alpha': np.linspace(1, 10000, 100).tolist()
}

cv = GridSearchCV(model, params, cv=10)
cv.fit(X, y)

print(cv.best_estimator_)

pred = cv.predict(X_test)

pd.DataFrame(pred, index=X_test.index, columns=['pred']).to_csv(f'predictions/{pca_prefix}ridge_cv.csv')

Ridge(alpha=304.0)


In [86]:
# shuffle Train dataset for CV

In [87]:
model = Lasso(max_iter=3000)

params = {
    'alpha': np.logspace(-8, -0.5, 100).tolist()
}

cv = GridSearchCV(model, params, cv=10)
cv.fit(X_shuffled, y)

print(cv.best_estimator_)

pred = cv.predict(X_test)

pd.DataFrame(pred, index=X_test.index, columns=['pred']).to_csv(f'predictions/{pca_prefix}lasso_cvshuffled.csv')

Lasso(alpha=0.0004977023564332114, max_iter=3000)


In [88]:
lasso_cv_coefs = pd.DataFrame(cv.best_estimator_.coef_, index=cv.feature_names_in_, columns=['coef'])
lasso_cv_coefs.to_csv('models/lasso_cv_coefs.csv')

# Coeficients chosen by our best model Shuffled Lasso CV

In [89]:
lasso_cv_coefs[lasso_cv_coefs.coef != 0]

Unnamed: 0,coef
