In [28]:
import pandas as pd
# Import test_train_split ridge regression grid search SGDRegressor Lasso LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression, SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [29]:
dataset = pd.read_csv('NPPE1_ModelBuilding3.csv')
X = dataset.drop(columns=['PRICE'])  # Replace 'target_variable' with the actual name of the target variable column
y = dataset["PRICE"]  # Replace 'target_variable' with the actual name of the target variable column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
ridge = Ridge(alpha=10, solver='saga', tol=1e-4, random_state=42)

ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))  

0.9977279112380488


In [31]:
# What is the index of most important feature? Note the index starts from 0. Ignore the intercept for this question.
print(ridge.coef_.argmax())
# What is the index of least important feature? Note the index starts from 0. Ignore the intercept for this question.
print(ridge.coef_.argmin())

4
11


In [32]:
sgdregressor = SGDRegressor(random_state=42)

grid_search = GridSearchCV(estimator=sgdregressor, param_grid={'penalty': ['l1', 'l2'], 'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], 'tol':[1e-4, 1e-3, 1e-2, 1e-1]}, cv=5, n_jobs=-1, scoring='neg_mean_absolute_error') 
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)
print('mean_absolute_error:', mean_absolute_error(y_test, grid_search.predict(X_test)))

{'alpha': 0.1, 'penalty': 'l2', 'tol': 0.001}
-0.10648834925199839
mean_absolute_error: 0.12854261070389397


In [36]:
# Create a pipeline of the PCA() as transformer and Lasso as an estimator
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Use GridSearchCV for tuning the hyperparameters of the created pipeline on training dataset.
	# Values of n_components for PCA to be [0.9, 0.95]
	# lasso alpha value to be taken as : [10, 1, 0.01, 0.001]
	# scoring : neg_mean_absolute_error.
	# Use 5 fold cross validation.
	# n_jobs = -1 (negative one) [it helps in using all the computational power to run this job]

pipeline = Pipeline([('pca', PCA()), ('lasso', Lasso())])
param_grid = {'pca__n_components': [0.9, 0.95], 'lasso__alpha': [10, 1, 0.01, 0.001]}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print("r2:", r2_score(y_test, grid_search.predict(X_test)))
# How much variance is explained (Eigen value) by the first principle component?
print(grid_search.best_estimator_.named_steps['pca'].explained_variance_ratio_[0])

{'lasso__alpha': 0.001, 'pca__n_components': 0.9}
r2: 0.03178743336842138
0.9831288803433309


In [37]:
# Create a pipeline of the PolynomialFeatures as transformer and Lasso as an estimator with the following parameters:
#   - For PolynomialFeatures:
#     - interaction_only = False
#     - degree = 2
#   - For Lasso:
#     - alpha = 1
#     - warm_start = True
#     - random state as 0

# Fit the pipeline on the training dataset and find the r2 score on the test dataset.
from sklearn.preprocessing import PolynomialFeatures
pipeline = Pipeline([('poly', PolynomialFeatures(interaction_only=False, degree=2)), ('lasso', Lasso(alpha=1, warm_start=True, random_state=0))])
pipeline.fit(X_train, y_train)
print("r2:", r2_score(y_test, pipeline.predict(X_test)))

r2: 0.6483691477206266


In [38]:
# If you eliminate 1 feature with recursive feature elimination, which feature will be eliminated? using linear regression
from sklearn.feature_selection import RFE

rfe = RFE(estimator=LinearRegression(), n_features_to_select=X_train.shape[1]-1)
rfe.fit(X_train, y_train)
print(rfe.support_)



[ True  True  True  True  True False  True  True  True  True  True  True
  True]
