In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error

In [25]:
# Read the training file and one test-day file. Both are parsed with
# header=None, so columns are addressed by integer position downstream.
inp, test_inp = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        "./BlogFeedback/blogData_train.csv",
        "./BlogFeedback/blogData_test-2012.03.31.01_00.csv",
    )
)

## Least Squares Regression

In [27]:
# Ordinary least squares: the first 280 columns are the predictors and
# column 280 is the regression target.
# NOTE(review): the `normalize` keyword is not available in recent
# scikit-learn releases - confirm the pinned version before re-running.
least_squares_reg = LinearRegression(normalize=True)
least_squares_reg.fit(inp.iloc[:, :280], inp[[280]])

In [18]:
# Score the least-squares model on the held-out test file.
least_squares_reg_pred = least_squares_reg.predict(test_inp.iloc[:, :280])

# RMSE is computed as sqrt(MSE) explicitly rather than through the
# `squared=False` keyword, which newer scikit-learn releases no longer accept;
# the resulting value is the same.
least_squares_rmse = np.sqrt(mean_squared_error(test_inp[[280]], least_squares_reg_pred))
print("RMSE with least squares regression", least_squares_rmse)

RMSE with least squares regression 40.3927510536605


## Ridge Regression

In [20]:
# Candidate regularisation strengths: start at 0.01 and advance in steps of
# 0.01, stopping before 2.01. The same grid is reused by the Lasso section.
alphas = np.arange(0.01, 2.01, 0.01)

# Cross-validated ridge regression; candidates are ranked by negative MSE.
# NOTE(review): the `normalize` keyword is not available in recent
# scikit-learn releases - confirm the pinned version before re-running.
ridgecv = RidgeCV(
    alphas=alphas,
    scoring='neg_mean_squared_error',
    normalize=True,
).fit(inp.iloc[:, :280], inp[[280]])
print("Best value of alpha for ridge is ", ridgecv.alpha_)

Best value of alpha for ridge is  0.03


In [21]:
# Score the cross-validated ridge model on the held-out test file.
ridge_pred = ridgecv.predict(test_inp.iloc[:, :280])

# RMSE is computed as sqrt(MSE) explicitly rather than through the
# `squared=False` keyword, which newer scikit-learn releases no longer accept;
# the resulting value is the same.
ridge_rmse = np.sqrt(mean_squared_error(test_inp[[280]], ridge_pred))
print("RMSE with ridge regression", ridge_rmse)

RMSE with ridge regression 40.51754461782588


## Lasso

In [28]:
# 10-fold cross-validated Lasso over the shared `alphas` grid. max_iter is
# raised to 100000 iterations for the solver.
# NOTE(review): the `normalize` keyword is not available in recent
# scikit-learn releases - confirm the pinned version before re-running.
lassocv = LassoCV(alphas=alphas, cv=10, max_iter=100000, normalize=True)

# The target is passed as a 1-D array (column 280 of the training frame).
lassocv.fit(inp.iloc[:, :280], inp[280].to_numpy())

# The message previously said "ridge" although it reports the Lasso alpha.
print("Best value of alpha for lasso is ", lassocv.alpha_)

Best value of alpha for ridge is  0.01


In [30]:
# Score the cross-validated Lasso model on the held-out test file.
lasso_pred = lassocv.predict(test_inp.iloc[:, :280])

# RMSE is computed as sqrt(MSE) explicitly rather than through the
# `squared=False` keyword, which newer scikit-learn releases no longer accept;
# the resulting value is the same. The label previously said "ridge
# regression" although this is the Lasso result.
lasso_rmse = np.sqrt(mean_squared_error(test_inp[[280]], lasso_pred))
print("RMSE with lasso regression", lasso_rmse)

RMSE with ridge regression 41.67205813354152


## Most important features according to LASSO

In [32]:
# Features kept by the Lasso are those whose coefficient is non-zero. The
# earlier `> 0` filter silently dropped every feature with a negative weight.
coeff_array = lassocv.coef_

# Report 1-based column numbers (position in the feature matrix + 1), as plain
# Python ints so the printed list has the same format as before.
imp_features_index = (np.flatnonzero(coeff_array) + 1).tolist()
print("The important Columns according to Lasso are ", imp_features_index)

The important Columns according to Lasso are  [6, 10, 21, 52, 55]
