In [46]:
# importing libraries
import pandas as pd
import boto3
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'Fish.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
fish = pd.read_csv(file_content_stream)
fish.head()

# defining the input and target variable
X = fish[['Length1', 'Length2', 'Length3', 'Height', 'Width']]
Y = fish['Weight']

# splitting into train (80%) and test (20%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

# performing LASSO as variable selector
# First we need to estimate lambda
lasso_cv = LassoCV(normalize = True, cv = 5, max_iter = 10000).fit(X_train, Y_train)
lasso_alpha = lasso_cv.alpha_

# building lasso model
lasso_md = Lasso(alpha = lasso_alpha, normalize = True, max_iter = 10000).fit(X_train, Y_train)
lasso_md.coef_

# dropping variables
X_train = X_train.drop(columns = ['Length2', 'Length3'], axis = 1)
X_test = X_test.drop(columns = ['Length2', 'Length3'], axis = 1)

# creating l2 normalization function
def l2_normalization(X):
    
    x_mean = np.mean(X)
    l2 = np.sqrt(sum(X**2))
    return (X - x_mean) / l2

X_train = X_train.apply(l2_normalization, axis = 1)
X_test = X_test.apply(l2_normalization, axis = 1)

# linear regression
md1 = LinearRegression().fit(X_train, Y_train)

# preicting on test
md1_pred = md1.predict(X_test)
md1_pred

# computing the mse
mse1 = np.mean(np.power(md1_pred - Y_test, 2))
mse1

# ridge regression
ridge_cv = RidgeCV(alphas = [0.001, 0.01, 0.1, 1, 10, 100], cv = 5). fit(X_train, Y_train)
ridge_alpha = ridge_cv.alpha_

# Building the ridge model
ridge_md = Ridge(alpha = ridge_alpha).fit(X_train, Y_train)

# predicting on test
md2_pred = ridge_md.predict(X_test)
md2_pred

# computing the mse
mse2 = np.mean(np.power(md2_pred - Y_test, 2))
mse2

print('Lasso = ', mse1, ',','Ridge = ', mse2)

# We would use the Lasso model (model 1) because the mse is a smaller value

Lasso =  63410.43817928265 , Ridge =  68851.82257152862
