In [33]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
# Read the training and test CSVs (in that order) into DataFrames.
df, test_df = (
    pd.read_csv('datasets/train_set.csv'),
    pd.read_csv('datasets/test_set.csv'),
)

In [36]:
# Target is the 'cpi_pct' column; features are every column from 'month'
# through the last column of the training frame.
# NOTE(review): assumes 'cpi_pct' sits to the left of 'month' -- otherwise the
# target itself would be included in X. Confirm against the CSV layout.
y = df['cpi_pct']
X = df.loc[:, 'month':]

# Select exactly the training feature columns, in the same order, from the test frame.
X_test = test_df[X.columns]

In [37]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [38]:
# First we build two benchmark models.
# The first always predicts the mean of the training target variable.
# The second predicts the previous month's CPI.

In [39]:
# Benchmark 1: predict the training-set mean of the target for every test row.
pci_mean = y.mean()

# Index the predictions by X_test.index rather than a fresh 0..n-1 range, so
# this file lines up row-for-row with the other prediction files in this
# notebook (they all use X_test.index) even if the test frame ever carries a
# non-default index.
y_test_pred = pd.DataFrame(pci_mean, columns=['pred'], index=X_test.index)
y_test_pred.to_csv('predictions/benchmark1_mean.csv')

For the second benchmark, the previous month's CPI value (`cpi_lag1`) was already saved as the test prediction:

`test_df[['cpi_lag1']].rename({'cpi_lag1': 'pred'}, axis=1).reset_index(drop=True).to_csv('predictions/benchmark2_prev.csv')`

In [40]:
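# Shuffling led to very high lambda values, which basically meant both models were just predicting the mean (the y-intercept).
# NOTE: the line below shuffles X only; scikit-learn pairs X and y rows by position, not by index, so features and targets would no longer line up -- which may itself explain that result.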
# X = X.sample(frac=1)

In [41]:
# Ordinary least squares baseline: no regularisation, fitted on the full
# training set. fit() returns the estimator itself, so it can be chained.
model = LinearRegression().fit(X, y)

pred = model.predict(X_test)

# Store the predictions under a single 'pred' column, keyed by the test index.
ols_frame = pd.DataFrame({'pred': pred}, index=X_test.index)
ols_frame.to_csv('predictions/ols_pred.csv')

In [42]:
# Lasso regression with the penalty strength picked by grid-search CV.
# Candidate alphas: 100 log-spaced values from 1e-8 up to 10**-0.5.
params = {'alpha': np.logspace(-8, -0.5, 100).tolist()}
model = Lasso(max_iter=3000)

# cv=10 -> 10 folds taken in row order (no shuffling) for a regressor.
cv = GridSearchCV(estimator=model, param_grid=params, cv=10).fit(X, y)

print(cv.best_estimator_)

# predict() delegates to the best estimator, refit on all of X, y (refit=True default).
pred = cv.predict(X_test)

lasso_frame = pd.DataFrame({'pred': pred}, index=X_test.index)
lasso_frame.to_csv('predictions/lassoCV_pred.csv')

Lasso(alpha=7.305271542664449e-05, max_iter=3000)


In [43]:
# Ridge regression with the penalty strength picked by grid-search CV.
# Candidate alphas: 100 evenly spaced values from 1 to 10000 (step 101).
params = {'alpha': np.linspace(1, 10000, 100).tolist()}
model = Ridge()

# cv=10 -> 10 folds taken in row order (no shuffling) for a regressor.
cv = GridSearchCV(estimator=model, param_grid=params, cv=10).fit(X, y)

print(cv.best_estimator_)

# predict() delegates to the best estimator, refit on all of X, y (refit=True default).
pred = cv.predict(X_test)

ridge_frame = pd.DataFrame({'pred': pred}, index=X_test.index)
ridge_frame.to_csv('predictions/ridgeCV_pred.csv')

Ridge(alpha=304.0)


In [44]:
# shuffle Train dataset for CV

In [45]:
# Repeat the Lasso grid search on a row-shuffled copy of the training data, so
# the 10 CV folds are no longer contiguous blocks of the original row order.
model = Lasso(max_iter=3000)

# A fixed random_state makes the shuffle -- and therefore the CV folds, the
# selected alpha and the saved predictions -- identical on every re-run.
# Note: this rebinds df, X and y to the shuffled data for any later cells.
df = pd.read_csv('datasets/train_set.csv').sample(frac=1, random_state=42)

# X and y are both taken from the same shuffled frame, so rows stay paired.
X = df.loc[:, 'month':]
y = df['cpi_pct']

# Same alpha grid as the unshuffled Lasso search: 100 log-spaced values
# from 1e-8 up to 10**-0.5.
params = {
    'alpha': np.logspace(-8, -0.5, 100).tolist()
}

cv = GridSearchCV(model, params, cv=10)
cv.fit(X, y)

print(cv.best_estimator_)

# predict() delegates to the best estimator, refit on all of X, y (refit=True default).
pred = cv.predict(X_test)

pd.DataFrame(pred, index=X_test.index, columns=['pred']).to_csv('predictions/lassoCV_shuffled_pred.csv')

Lasso(alpha=8.697490026177834e-05, max_iter=3000)
