In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from ISLP import load_data
from scipy.stats import loguniform

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression, lasso_path, enet_path
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score


In [4]:
df_test=pd.read_csv('sfhousing_lasso_test.csv')
df_test.head()

Unnamed: 0.1,Unnamed: 0,price,br,lsqft,bsqft,county,street,price_log,lsqft_log,bsqft_log,county_code
0,21848,485000.0,5.0,7200.0,2526.0,Contra Costa County,5348 Rockrose Court,5.685742,3.857332,3.402433,1
1,35907,620000.0,3.0,3230.0,1513.0,Contra Costa County,533 Treyburn Circle,5.792392,3.509203,3.179839,1
2,63615,825000.0,5.0,14375.0,2479.0,Santa Clara County,855 Black Walnut Court,5.916454,4.157608,3.394277,6
3,27805,535000.0,5.0,10000.0,2796.0,Contra Costa County,3700 Longhorn Court,5.728354,4.0,3.446537,1
4,278,640000.0,4.0,5280.0,2286.0,Alameda County,211 Central Avenue,5.80618,3.722634,3.359076,0


In [5]:
df_train=pd.read_csv('sfhousing_lasso_train.csv')
df_train

Unnamed: 0.1,Unnamed: 0,price,br,lsqft,bsqft,county,street,price_log,lsqft_log,bsqft_log,county_code,split_0,split_1,split_2
0,68181,254000.0,2.0,9148.0,903.0,Santa Clara County,797 Delaware Avenue \#2,5.404834,3.961326,2.955688,6,train,test,train
1,41376,575000.0,3.0,7841.0,1629.0,Napa County,2730 Indiana Street,5.759668,3.894371,3.211921,3,test,train,train
2,31964,325000.0,3.0,7000.0,1048.0,Contra Costa County,399 Schooner Way,5.511883,3.845098,3.020361,1,test,train,train
3,4771,475000.0,3.0,6480.0,1036.0,Alameda County,4748 Bach Court,5.676694,3.811575,3.015360,0,train,test,train
4,59381,480000.0,3.0,5663.0,1421.0,Santa Clara County,420 Garfield Court,5.681241,3.753047,3.152594,6,test,train,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72817,6265,436000.0,1.0,4500.0,1316.0,Alameda County,1218 Grove Way,5.639486,3.653213,3.119256,0,train,test,train
72818,54886,620000.0,4.0,5227.0,1400.0,Santa Clara County,10708 Culbertson Drive,5.792392,3.718253,3.146128,6,train,train,test
72819,76820,400000.0,4.0,6098.0,1720.0,Solano County,1885 Buena Tierra Street,5.602060,3.785187,3.235528,7,test,train,train
72820,860,563000.0,3.0,5000.0,1495.0,Alameda County,548 Madison Street,5.750508,3.698970,3.174641,0,test,train,train


In [6]:
county_dummies_train= pd.get_dummies(df_train['county_code'], prefix='county_code')
df_train=pd.concat([df_train, county_dummies_train], axis=1)
df_train

Unnamed: 0.1,Unnamed: 0,price,br,lsqft,bsqft,county,street,price_log,lsqft_log,bsqft_log,...,split_2,county_code_0,county_code_1,county_code_2,county_code_3,county_code_4,county_code_5,county_code_6,county_code_7,county_code_8
0,68181,254000.0,2.0,9148.0,903.0,Santa Clara County,797 Delaware Avenue \#2,5.404834,3.961326,2.955688,...,train,0,0,0,0,0,0,1,0,0
1,41376,575000.0,3.0,7841.0,1629.0,Napa County,2730 Indiana Street,5.759668,3.894371,3.211921,...,train,0,0,0,1,0,0,0,0,0
2,31964,325000.0,3.0,7000.0,1048.0,Contra Costa County,399 Schooner Way,5.511883,3.845098,3.020361,...,train,0,1,0,0,0,0,0,0,0
3,4771,475000.0,3.0,6480.0,1036.0,Alameda County,4748 Bach Court,5.676694,3.811575,3.015360,...,train,1,0,0,0,0,0,0,0,0
4,59381,480000.0,3.0,5663.0,1421.0,Santa Clara County,420 Garfield Court,5.681241,3.753047,3.152594,...,train,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72817,6265,436000.0,1.0,4500.0,1316.0,Alameda County,1218 Grove Way,5.639486,3.653213,3.119256,...,train,1,0,0,0,0,0,0,0,0
72818,54886,620000.0,4.0,5227.0,1400.0,Santa Clara County,10708 Culbertson Drive,5.792392,3.718253,3.146128,...,test,0,0,0,0,0,0,1,0,0
72819,76820,400000.0,4.0,6098.0,1720.0,Solano County,1885 Buena Tierra Street,5.602060,3.785187,3.235528,...,train,0,0,0,0,0,0,0,1,0
72820,860,563000.0,3.0,5000.0,1495.0,Alameda County,548 Madison Street,5.750508,3.698970,3.174641,...,train,1,0,0,0,0,0,0,0,0


In [7]:
county_dummies_test= pd.get_dummies(df_test['county_code'], prefix='county_code')
df_test=pd.concat([df_test, county_dummies_test], axis=1)
df_test

Unnamed: 0.1,Unnamed: 0,price,br,lsqft,bsqft,county,street,price_log,lsqft_log,bsqft_log,county_code,county_code_0,county_code_1,county_code_2,county_code_3,county_code_4,county_code_5,county_code_6,county_code_7,county_code_8
0,21848,485000.0,5.0,7200.0,2526.0,Contra Costa County,5348 Rockrose Court,5.685742,3.857332,3.402433,1,0,1,0,0,0,0,0,0,0
1,35907,620000.0,3.0,3230.0,1513.0,Contra Costa County,533 Treyburn Circle,5.792392,3.509203,3.179839,1,0,1,0,0,0,0,0,0,0
2,63615,825000.0,5.0,14375.0,2479.0,Santa Clara County,855 Black Walnut Court,5.916454,4.157608,3.394277,6,0,0,0,0,0,0,1,0,0
3,27805,535000.0,5.0,10000.0,2796.0,Contra Costa County,3700 Longhorn Court,5.728354,4.000000,3.446537,1,0,1,0,0,0,0,0,0,0
4,278,640000.0,4.0,5280.0,2286.0,Alameda County,211 Central Avenue,5.806180,3.722634,3.359076,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18201,28065,560000.0,4.0,6100.0,1927.0,Contra Costa County,115 Catalpa Court,5.748188,3.785330,3.284882,1,0,1,0,0,0,0,0,0,0
18202,10409,453500.0,3.0,7000.0,1025.0,Alameda County,6463 Jasmine Avenue,5.656577,3.845098,3.010724,0,1,0,0,0,0,0,0,0,0
18203,64999,455000.0,3.0,5663.0,1380.0,Santa Clara County,1978 Doxey Drive,5.658011,3.753047,3.139879,6,0,0,0,0,0,0,1,0,0
18204,14398,530000.0,2.0,4300.0,1256.0,Alameda County,3664 Lily Street,5.724276,3.633468,3.098990,0,1,0,0,0,0,0,0,0,0



Data Loading:

Load the training dataset (df_train) containing the predictors (br, bsqft_log, lsqft_log), and the response variable (price_log).
Define Predictors and Response Variable:

Define the list of predictors (['br', 'bsqft_log', 'lsqft_log']) and the response variable ('price_log').
Define Values of 𝜆:

Define the list of lambda values ([0.01, 0.03, 0.1]), representing different levels of regularization.
Splitting Data into Folds:

Use KFold with 3 splits to split the training data into three folds for cross-validation.
Initialize Dictionary for MSE Values:

Initialize a dictionary (mse_values) to store the mean squared error (MSE) values for each lambda and split.
Loop Over Lambda Values:

For each value of 𝜆:
Loop Over Splits:
For each split generated by the cross-validator:
Split Data into Train and Validation Sets:
Extract the training and validation sets based on the current split indices.
Define Pipeline:
Create a Pipeline object consisting of two steps: standardization using StandardScaler and LASSO regression using Lasso with the current 𝜆 value.
Fit Pipeline on Training Data:
Fit the pipeline on the training data to standardize the predictors and perform LASSO regression.
Predict on Validation Set:
Use the fitted pipeline to predict the response variable on the validation set.
Compute MSE:
Calculate the mean squared error (MSE) between the true response values and the predicted values.
Store MSE:
Append the computed MSE to the list of MSE values for the current 𝜆.
Calculate Mean MSE Across Splits for Each 𝜆:

Compute the average MSE across all splits for each 𝜆 value.
Print Mean MSE Values:

Print the mean MSE values in a tabular format, including the MSE values for each split and the mean MSE across all splits for each 𝜆 value.


In [12]:

#
predictors=['br', 'bsqft_log', 'lsqft_log']
response=['price_log']

lambda_values=[0.01, 0.03, 0.1]

#splitting into 3 folds
kfold=KFold(n_splits=3, shuffle=True)

# to make dictionary to put every MSE for each lambda and split
mse_values={l: [] for l in lambda_values}

for l in lambda_values:
    #loop made on splits, it loops over each split generated by the Kfold cross-validator
    for train_index, val_index in kfold.split(df_train):
        #splitting training data into train and validation sets using the indices of train and validation
        train_set, val_set = df_train.iloc[train_index], df_train.iloc[val_index]
        
        #define pipeline consisting two steps
        pipeline=Pipeline([
            ('scaler', StandardScaler()),
            ('lasso', Lasso(alpha=l))
        ])
        
        #fitting pipeline on training data[train_set]
        pipeline.fit(train_set[predictors], train_set[response])
        
        #predicting response varibale on validation set
        val_predictions = pipeline.predict(val_set[predictors])
        
        #calulate mse
        mse=mean_squared_error(val_set[response], val_predictions)
        
        #add mse to list
        mse_values[l].append(mse)
        
#calculate mean MSE across splits for each lambda
mean_mse_values = {l: np.mean(mse_values[l]) for l in lambda_values}

# Print mean MSE values
print("lambda/split", "split_0", "split_1", "split_2", "mean", sep='\t')
for l in lambda_values:
    print(l, *mse_values[l], mean_mse_values[l], sep='\t')
        
        
        
        
        
    


lambda/split	split_0	split_1	split_2	mean
0.01	0.019003582498451587	0.019497745300712323	0.019086892541425128	0.019196073446863013
0.03	0.020229991972532793	0.020278819865406843	0.020592769802964372	0.020367193880301337
0.1	0.03028966752126605	0.029818765608898925	0.030054257232902302	0.03005423012102243


In [9]:
# Print mean MSE values
print("lambda/split", "split_0", "split_1", "split_2", "mean", sep='\t')
for l in lambda_values:
    mean_mse = "{:.3f}".format(mean_mse_values[l])  # Format mean MSE to three decimal places
    mse_values_formatted = [f"{mse:.3f}" for mse in mse_values[l]]  # Format MSE for each split to three decimal places
    print(l, *mse_values_formatted, mean_mse, sep='\t')


lambda/split	split_0	split_1	split_2	mean
0.01	0.019	0.019	0.019	0.019
0.03	0.020	0.021	0.021	0.020
0.1	0.030	0.031	0.029	0.030


In [10]:
#10
lasso=Lasso(alpha=0.01)

#fit lasso model using full training set
lasso.fit(df_train[predictors], df_train[response])

#getting coefficients for each predictor
coefficients= lasso.coef_


#finding coeffiecients for each predictor
print("Coefficients for each predictor:")
for predictor, coefficient in zip(predictors, coefficients):
    print(f"{predictor}: {coefficient:.3f}") #.3f formats in 3 decimals
    
    
# Identify the most important predictor
most_important_predictor = predictors[np.argmax(np.abs(coefficients))]
print(f"\nThe most important predictor in predicting price_log is: {most_important_predictor}")

Coefficients for each predictor:
br: 0.045
bsqft_log: 0.166
lsqft_log: 0.071

The most important predictor in predicting price_log is: bsqft_log


In [96]:
#11 Report the training [1pt] 
train_predictions = lasso.predict(df_train[predictors]) 
#this takes predictors variables as input to give target varibale i.e. price_log for the training dataset

train_mse= mean_squared_error(df_train[response], train_predictions)
#this calculates mse between the actual target values df_train[response] i.e. (price_log) and the predicted values train_predictions

train_mse



0.024289670244516743

In [95]:
#and the test [1pt] MSE.

test_predictions= lasso.predict(df_test[predictors])

test_mse=mean_squared_error(df_test[response], test_predictions)

test_mse

0.024110647855234986