In [1]:
import warnings
warnings.filterwarnings('ignore')

# data imports
import pandas as pd
import numpy as np
from plotnine import *

# modeling imports
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV # Linear Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error #model evaluation
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut

# pipeline imports
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer

%matplotlib inline

# Train-test-split
## Let's predict the danceability of Beyonce songs
Load in Beyonce data

In [2]:
# Read the Beyonce track-level audio features straight from the course repo
# and preview the first few rows.
beyonce_url = "https://raw.githubusercontent.com/katherinehansen2/CPSC392Hansen/main/data/Beyonce_data.csv"
beyonce = pd.read_csv(beyonce_url)
beyonce.head()

Unnamed: 0.1,Unnamed: 0,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,track_name
0,1,Beyoncé,0.386,0.288,1,-18.513,1,0.0602,0.533,0.0167,0.141,0.399,43850,balance (mufasa interlude)
1,2,Beyoncé,0.484,0.363,5,-8.094,0,0.0368,0.645,0.0,0.125,0.201,226479,BIGGER
2,3,Beyoncé,0.537,0.247,2,-17.75,1,0.0793,0.199,1e-05,0.423,0.17,46566,the stars (mufasa interlude)
3,4,Beyoncé,0.672,0.696,4,-6.693,0,0.177,0.2,0.0275,0.0736,0.642,162353,FIND YOUR WAY BACK
4,5,Beyoncé,0.0,0.00515,9,-22.612,0,0.0,0.524,0.95,0.114,0.0,13853,uncle scar (scar interlude)


## Check for missing values

In [3]:
# Count missing entries per column (isna is the same check as isnull)
beyonce.isna().sum()

## Split into X and y

In [4]:
# Audio features used as model inputs; danceability is the value being predicted
predictors = [
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
]

X = beyonce.loc[:, predictors]
y = beyonce.loc[:, "danceability"]

## Train test split

In [7]:
# Hold out 20% of the songs for testing. The seed is fixed so the split is
# reproducible on every run; these are the same settings the Amazon books
# split uses later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

## Z-Score

In [9]:
scaler = StandardScaler()

# fit_transform learns each column's mean and sd from the training rows and
# z-scores them; transform then applies those same training statistics to the
# test rows, so nothing about the test set leaks into the scaling.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns) # z-score X Train
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns) # z-score X test using same mu and sd as train

## Fit a LR Model

In [12]:
# Fit an ordinary linear regression on the training predictors;
# fit() returns the estimator itself, so it can be bound in one step.
lr = LinearRegression().fit(X_train, y_train)
lr

In [11]:
# Predictions on the rows the model was fit on and on the held-out rows
y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

In [14]:
# Assess: report the same three error/fit metrics for the training
# partition and then for the test partition, separated by a blank line.
scorers = (mean_squared_error, mean_absolute_error, r2_score)
train_labels = ("Train MSE : ", "Train MAE : ", "Train R2  : ")
test_labels = ("Test MSE  : ", "Test MAE  : ", "Test R2   : ")

for label, scorer in zip(train_labels, scorers):
    print(label, scorer(y_train, y_pred_train))
print()
for label, scorer in zip(test_labels, scorers):
    print(label, scorer(y_test, y_pred_test))

## View Coefficients

In [16]:
intercept, coefficients = lr.intercept_, lr.coef_

# Table with one row for the intercept followed by one row per predictor,
# pairing each name with its fitted coefficient.
params = pd.DataFrame({
    'Feature': ['Intercept', *X_train.columns],
    'Coefficient': [intercept, *coefficients],
})
params

# Ridge/Lasso

In [18]:
# Load the tab-separated Amazon books table, discard every row that has a
# missing value, and renumber the rows. reset_index() is called without
# drop=True, so the pre-filter row labels are kept as an "index" column.
amazon = (
    pd.read_csv(
        "https://raw.githubusercontent.com/katherinehansen2/CPSC392Hansen/main/data/amazon-books.txt",
        sep="\t",
    )
    .dropna()
    .reset_index()
)
amazon.head()

Unnamed: 0,index,Title,Author,List Price,Amazon Price,Hard/ Paper,NumPages,Publisher,Pub year,ISBN-10,Height,Width,Thick,Weight (oz)
0,0,"1,001 Facts that Will Scare the S#*t Out of Yo...",Cary McNeal,12.95,5.18,P,304.0,Adams Media,2010.0,1605506249,7.8,5.5,0.8,11.2
1,1,21: Bringing Down the House - Movie Tie-In: Th...,Ben Mezrich,15.0,10.2,P,273.0,Free Press,2008.0,1416564195,8.4,5.5,0.7,7.2
2,2,100 Best-Loved Poems (Dover Thrift Editions),Smith,1.5,1.5,P,96.0,Dover Publications,1995.0,486285537,8.3,5.2,0.3,4.0
3,3,1421: The Year China Discovered America,Gavin Menzies,15.99,10.87,P,672.0,Harper Perennial,2008.0,61564893,8.8,6.0,1.6,28.8
4,4,1493: Uncovering the New World Columbus Created,Charles C. Mann,30.5,16.77,P,720.0,Knopf,2011.0,307265722,8.0,5.2,1.4,22.4


## Separate into X and y

In [19]:
# Physical and pricing attributes used as inputs; the Amazon price is the target
predictors = [
    "List Price",
    "NumPages",
    "Weight (oz)",
    "Thick",
    "Height",
    "Width",
]

X = amazon.loc[:, predictors]
y = amazon.loc[:, "Amazon Price"]

## Train test split

In [20]:
# Reserve 20% of the books for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

## Z-Score

In [21]:
# Learn each column's mean and standard deviation from the training rows only,
# then apply those same statistics to both partitions.
scaler = StandardScaler().fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

## Fit a LASSO model

In [23]:
# LassoCV fits an L1-penalized linear model and chooses the penalty strength
# by cross-validation on the training data (library defaults are used here).
lasso = LassoCV()

lasso.fit(X_train, y_train)

In [24]:
# Score both partitions with the fitted LASSO model
y_pred_train, y_pred_test = lasso.predict(X_train), lasso.predict(X_test)

In [26]:
# Evaluate: print MSE, MAE and R2 for the training partition, a blank
# separator line, then the same metrics for the test partition.
scorers = (mean_squared_error, mean_absolute_error, r2_score)
train_labels = ("Train MSE : ", "Train MAE : ", "Train R2  : ")
test_labels = ("Test MSE  : ", "Test MAE  : ", "Test R2   : ")

for label, scorer in zip(train_labels, scorers):
    print(label, scorer(y_train, y_pred_train))
print()
for label, scorer in zip(test_labels, scorers):
    print(label, scorer(y_test, y_pred_test))

## Check out the coefficients

In [28]:
intercept, coefficients = lasso.intercept_, lasso.coef_

# Table with one row for the intercept followed by one row per predictor,
# pairing each name with its fitted LASSO coefficient.
params = pd.DataFrame({
    'Feature': ['Intercept', *X_train.columns],
    'Coefficient': [intercept, *coefficients],
})
params

## Now let's try with Ridge

In [30]:
# RidgeCV fits an L2-penalized linear model and selects the penalty strength
# from its default candidate alphas by cross-validation on the training data.
ridge = RidgeCV()

ridge.fit(X_train, y_train)

In [31]:
# predict with the ridge model so the evaluation in this section describes
# ridge; calling lasso.predict here would just re-report the LASSO results

y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

In [33]:
# Evaluate the predictions computed in the previous cell: MSE, MAE and R2
# for the training partition, a blank line, then the test partition.
scorers = (mean_squared_error, mean_absolute_error, r2_score)
train_labels = ("Train MSE : ", "Train MAE : ", "Train R2  : ")
test_labels = ("Test MSE  : ", "Test MAE  : ", "Test R2   : ")

for label, scorer in zip(train_labels, scorers):
    print(label, scorer(y_train, y_pred_train))
print()
for label, scorer in zip(test_labels, scorers):
    print(label, scorer(y_test, y_pred_test))

## Check out the coefficients

In [35]:
intercept, coefficients = ridge.intercept_, ridge.coef_

# Table with one row for the intercept followed by one row per predictor,
# pairing each name with its fitted ridge coefficient.
params = pd.DataFrame({
    'Feature': ['Intercept', *X_train.columns],
    'Coefficient': [intercept, *coefficients],
})
params