# In [49]:
# importing libraries
import pandas as pd
import boto3
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

# S3 location of the car price dataset
bucket_name = 'webster-data445-bucket'
file_key = 'CarPrice_Assignment.csv'

# connecting to S3 and opening the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

# requesting the object; the 'Body' entry of the response is what pandas reads from
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# loading the csv into a data frame and previewing its first rows
car_price = pd.read_csv(filepath_or_buffer = file_content_stream)
car_price.head()

# predictors and response used for modeling
feature_names = ['wheelbase', 'enginesize', 'compressionratio', 'horsepower',
                 'peakrpm', 'citympg', 'highwaympg']
X = car_price[feature_names]
Y = car_price['price']

# holding out 20% of the rows for testing
# NOTE(review): no random_state is passed, so the split differs on every run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

# LASSO as variable selector: 5-fold cross-validation over the candidate penalties
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so these two estimators need an older release — confirm the target version
candidate_alphas = [0.001, 0.01, 0.1, 1, 10]
lasso_cv = LassoCV(alphas = candidate_alphas, normalize = True, cv = 5)
lasso_cv.fit(X_train, Y_train)

# penalty chosen by cross-validation
cv_alpha = lasso_cv.alpha_
cv_alpha

# refitting LASSO at the chosen penalty and inspecting the coefficients
lasso_md = Lasso(alpha = cv_alpha, normalize = True)
lasso_md.fit(X_train, Y_train)
lasso_md.coef_

# removing highwaympg from both sets
# NOTE(review): the dropped column is hard-coded; presumably it is the variable
# LASSO zeroed out in the original run — verify against lasso_md.coef_
X_train = X_train.drop(columns = ['highwaympg'])
X_test = X_test.drop(columns = ['highwaympg'])

# creating l2 normalization function
def l2_normalization(X):
    """Center a numeric vector on its mean and scale it by its L2 norm.

    Parameters
    ----------
    X : pandas Series or 1-D NumPy array
        Numeric values supporting element-wise arithmetic.

    Returns
    -------
    Same container type as ``X``
        ``(X - mean(X)) / ||X||_2``, where the norm is taken of the original
        (uncentered) values. When the norm is 0 (all-zero or empty input) the
        centered values are returned unscaled instead of the NaN that 0/0
        would produce.
    """
    x_mean = np.mean(X)
    # np.linalg.norm computes the Euclidean norm in one vectorized call,
    # replacing the element-by-element builtin sum of squares
    l2 = np.linalg.norm(X)
    centered = X - x_mean
    if l2 == 0:
        return centered
    return centered / l2

# normalizing train and test dataset, one row at a time
# NOTE(review): axis = 'columns' (same as axis = 1) hands each row — one car's
# feature values — to l2_normalization; confirm that row-wise rather than
# per-feature normalization is intended
X_train = X_train.apply(l2_normalization, axis = 'columns')
X_test = X_test.apply(l2_normalization, axis = 'columns')

# fitting ordinary least squares on the normalized training data
lm_md = LinearRegression()
lm_md.fit(X_train, Y_train)

# test-set predictions
md1_pred = lm_md.predict(X_test)

# mean squared error on the test set
squared_errors = np.power(Y_test - md1_pred, 2)
mse1 = np.mean(squared_errors)
mse1



# Out (mse1): 20306843.22707661