# Imports

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Regularization

In [2]:
# Load the training data that already has the polynomial / interaction columns
train_path = 'datasets/train_with_polynomials.csv'
df = pd.read_csv(train_path)

In [3]:
# Show the loaded frame as a sanity check; the notebook renders only the
# first and last rows and elides the middle columns.
df

Unnamed: 0,Overall Qual,Gr Liv Area,Total Bsmt SF,Garage Area,1st Flr SF,Garage Cars,Year Built,Garage Yr Blt,Year Remod/Add,Full Bath,...,Year Built Garage Yr Blt,Year Built Year Remod/Add,Year Built Full Bath,Garage Yr Blt^2,Garage Yr Blt Year Remod/Add,Garage Yr Blt Full Bath,Year Remod/Add^2,Year Remod/Add Full Bath,Full Bath^2,SalePrice
0,6.0,1479.0,725.0,475.0,725.0,2.0,1976.0,1976.0,2005.0,2.0,...,3904576.0,3961880.0,3952.0,3904576.0,3961880.0,3952.0,4020025.0,4010.0,4.0,130500
1,7.0,2122.0,913.0,559.0,913.0,2.0,1996.0,1997.0,1997.0,2.0,...,3986012.0,3986012.0,3992.0,3988009.0,3988009.0,3994.0,3988009.0,3994.0,4.0,220000
2,5.0,1057.0,1057.0,246.0,1057.0,1.0,1953.0,1953.0,2007.0,1.0,...,3814209.0,3919671.0,1953.0,3814209.0,3919671.0,1953.0,4028049.0,2007.0,1.0,109000
3,5.0,1444.0,384.0,400.0,744.0,2.0,2006.0,2007.0,2007.0,2.0,...,4026042.0,4026042.0,4012.0,4028049.0,4028049.0,4014.0,4028049.0,4014.0,4.0,174000
4,6.0,1445.0,676.0,484.0,831.0,2.0,1900.0,1957.0,1993.0,2.0,...,3718300.0,3786700.0,3800.0,3829849.0,3900301.0,3914.0,3972049.0,3986.0,4.0,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,8.0,1728.0,1884.0,520.0,1728.0,2.0,2007.0,2007.0,2007.0,2.0,...,4028049.0,4028049.0,4014.0,4028049.0,4028049.0,4014.0,4028049.0,4014.0,4.0,298751
2014,4.0,861.0,861.0,539.0,861.0,2.0,1940.0,1961.0,1950.0,1.0,...,3804340.0,3783000.0,1940.0,3845521.0,3823950.0,1961.0,3802500.0,1950.0,1.0,82500
2015,6.0,1913.0,896.0,342.0,1172.0,2.0,1928.0,1929.0,1950.0,1.0,...,3719112.0,3759600.0,1928.0,3721041.0,3761550.0,1929.0,3802500.0,1950.0,1.0,177000
2016,4.0,1200.0,1200.0,294.0,1200.0,1.0,1956.0,1956.0,1956.0,1.0,...,3825936.0,3825936.0,1956.0,3825936.0,3825936.0,1956.0,3825936.0,1956.0,1.0,144000


In [4]:
# Every column except the last one is used as a predictor;
# the target is the 'SalePrice' column.
features = df.columns[:-1].tolist()

X = df[features]
y = df['SalePrice']

# Fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [5]:
# Standardize the features. The scaler learns its statistics from the
# training split only and is then reused, unchanged, on the test split.
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [6]:
# Build the cross-validated Lasso and fit it on the standardized training data.
# max_iter is raised to 100_000 to give the solver more iterations to converge.
lcv = LassoCV(max_iter=100_000).fit(Z_train, y_train)

In [7]:
# Score on the data the model was fit on
train_score = lcv.score(Z_train, y_train)
print('train:', train_score)

# Score on the held-out split
test_score = lcv.score(Z_test, y_test)
print('test:', test_score)

# Average of the per-fold scores from 5-fold cross-validation on the training split
fold_scores = cross_val_score(lcv, Z_train, y_train, cv=5)
print('cross val score:', fold_scores.mean())

train: 0.8886237111772554
test: 0.8856583531854755
cross val score: 0.8786141815568392


This is by far the best model we've seen. Our training, test, and cross-validation scores (R²) are all considerably higher with regularization. For comparison, here are the previous top scores from a multiple linear regression (MLR) with interaction terms but no regularization.

- train: 0.8583114269993146
- test: 0.8589191189525888
- cross val score: 0.8549616952711011

# Looking at coefficients

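Let's take a look at the coefficients of our model to see which of the explanatory variables ended up being the most impactful. Because the features were standardized before fitting, the coefficient magnitudes are comparable to one another. Note that the table below is sorted by signed value, so it surfaces the largest positive coefficients.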

In [8]:
# Pair each feature name with its fitted Lasso coefficient.
coef_table = pd.DataFrame({
    'variables': X.columns,
    'coefficients': lcv.coef_,
})

# Largest coefficients first. The sort is on the signed value, so strongly
# negative coefficients do not appear in this top-20 view.
coef_table.sort_values(by='coefficients', ascending=False).head(20)

Unnamed: 0,variables,coefficients
12,Overall Qual Total Bsmt SF,47805.999666
11,Overall Qual Gr Liv Area,35912.427759
57,Year Built Year Remod/Add,13953.070087
13,Overall Qual Garage Area,12940.150089
28,Gr Liv Area Full Bath,12714.631204
24,Gr Liv Area Garage Cars,4811.53119
62,Year Remod/Add^2,2531.453621
30,Total Bsmt SF Garage Area,2256.848051
14,Overall Qual 1st Flr SF,530.180513
15,Overall Qual Garage Cars,45.314123


# Bring in test data

In [9]:
# Load the test set, which carries the same polynomial columns plus an 'Id' column
test_path = 'datasets/test_with_polynomials.csv'
test = pd.read_csv(test_path)

# Making predictions

In [10]:
# Make the submissions directory if it doesn't already exist.
# exist_ok=True makes the call idempotent, so no try/except is needed.
# The previous bare `except: pass` hid every failure (including a NameError
# from `os` not being imported), so the directory was never created and the
# later to_csv() into 'submissions/' would fail. Real problems such as
# permission errors now surface here instead.
# Requires `import os` in the imports cell.
os.makedirs('submissions', exist_ok=True)

In [11]:
# The model was fit on standardized inputs, so the test features must go
# through the same already-fitted scaler (transform only, no refit).
test_X = test.loc[:, features]
test_Z = ss.transform(test_X)

In [12]:
# Predict sale prices for the standardized test features.
test_preds = lcv.predict(test_Z)

# Two-column submission frame: the test 'Id' next to its predicted 'SalePrice'.
test_preds_df = test[['Id']].assign(SalePrice=test_preds)

# Write without the index so the file contains only Id and SalePrice.
submission_path = 'submissions/regularization.csv'
test_preds_df.to_csv(submission_path, index=False)