In [1]:
# Imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random as random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
#initialize raw dataframe that i can refer back to if needed
df = pd.read_csv('./datasets/train.csv')
ames_test = pd.read_csv('./datasets/test.csv')

#create copy that will be written over
ames = df.copy()
ames

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1587,921126030,20,RL,79.0,11449,Pave,,IR1,HLS,...,0,0,,,,0,1,2008,WD,298751
2047,785,905377130,30,RL,,12342,Pave,,IR1,Lvl,...,0,0,,,,0,3,2009,WD,82500
2048,916,909253010,50,RL,57.0,7558,Pave,,Reg,Bnk,...,0,0,,,,0,3,2009,WD,177000
2049,639,535179160,20,RL,80.0,10400,Pave,,Reg,Lvl,...,0,0,,,,0,11,2009,WD,144000


In [3]:
#apply snake_case to all columns
ames.columns = [col.lower().replace(' ', '_') for col in ames.columns]
ames_test.columns = [col.lower().replace(' ', '_') for col in ames_test.columns]
ames.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
#only keep numeric features
ames = ames._get_numeric_data()
ames_test = ames_test._get_numeric_data()

ames.shape

(2051, 39)

In [5]:
ames.select_dtypes(include='float64')

Unnamed: 0,lot_frontage,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,bsmt_full_bath,bsmt_half_bath,garage_yr_blt,garage_cars,garage_area
0,,289.0,533.0,0.0,192.0,725.0,0.0,0.0,1976.0,2.0,475.0
1,43.0,132.0,637.0,0.0,276.0,913.0,1.0,0.0,1997.0,2.0,559.0
2,68.0,0.0,731.0,0.0,326.0,1057.0,1.0,0.0,1953.0,1.0,246.0
3,73.0,0.0,0.0,0.0,384.0,384.0,0.0,0.0,2007.0,2.0,400.0
4,82.0,0.0,0.0,0.0,676.0,676.0,0.0,0.0,1957.0,2.0,484.0
...,...,...,...,...,...,...,...,...,...,...,...
2046,79.0,0.0,1011.0,0.0,873.0,1884.0,1.0,0.0,2007.0,2.0,520.0
2047,,0.0,262.0,0.0,599.0,861.0,0.0,0.0,1961.0,2.0,539.0
2048,57.0,0.0,0.0,0.0,896.0,896.0,0.0,0.0,1929.0,2.0,342.0
2049,80.0,0.0,155.0,750.0,295.0,1200.0,1.0,0.0,1956.0,1.0,294.0


### Data cleaning: origins

In [6]:
ames.shape, ames_test.shape

((2051, 39), (878, 38))

In [7]:
# find column that's not in test set
set(ames.columns) - set(ames_test.columns)

{'saleprice'}

In [8]:
#check train nulls
ames.isnull().sum().sort_values(ascending=False).loc[lambda x: x>0]

lot_frontage      330
garage_yr_blt     114
mas_vnr_area       22
bsmt_half_bath      2
bsmt_full_bath      2
bsmtfin_sf_1        1
garage_cars         1
garage_area         1
total_bsmt_sf       1
bsmt_unf_sf         1
bsmtfin_sf_2        1
dtype: int64

In [9]:
#check test nulls
ames_test.isnull().sum().sort_values(ascending=False).loc[lambda x: x>0]

lot_frontage     160
garage_yr_blt     45
mas_vnr_area       1
dtype: int64

In [10]:
#drop all nulls from train
ames.dropna(axis = 1, inplace = True)

In [11]:
ames_test = ames_test[[col for col in ames.columns if col != 'saleprice']]

In [12]:
ames.shape, ames_test.shape

((2051, 28), (878, 27))

### Preprocessing: terrible threes

In [13]:
ames.head(1)

Unnamed: 0,id,pid,ms_subclass,lot_area,overall_qual,overall_cond,year_built,year_remod/add,1st_flr_sf,2nd_flr_sf,...,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,saleprice
0,109,533352170,60,13517,6,8,1976,2005,725,754,...,0,44,0,0,0,0,0,3,2010,130500


In [14]:
#set up X and y
X = ames.drop(columns = ['pid', 'saleprice'])
y = ames['saleprice']

ames_test.drop(columns = 'pid', inplace = True)

In [15]:
#split data into training and validation

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 13)

In [16]:
#check shapes for sanity
X_train.shape, X_val.shape, ames_test.shape

((1640, 26), (411, 26), (878, 26))

### Feature Engineering

In [17]:
#stash away ID
tr_id = X_train['id']
val_id = X_val['id']

In [18]:
#instantiate polyfeat to simulate our FE process

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

In [19]:
X_train = poly.fit_transform(X_train.drop(columns='id'))
X_train = pd.DataFrame(X_train, columns=poly.get_feature_names_out())

In [20]:
X_val = poly.transform(X_val.drop(columns='id'))
X_val = pd.DataFrame(X_val, columns=poly.get_feature_names_out())

In [21]:
#permutations/symmetric + original
(25*24)/2 + 25

325.0

In [22]:
X_train.shape, X_val.shape

((1640, 325), (411, 325))

### More preprocessing!

In [23]:
ss = StandardScaler()

Xs_train = ss.fit_transform(X_train)
Xs_train = pd.DataFrame(Xs_train, columns=ss.get_feature_names_out())

In [24]:
Xs_val = pd.DataFrame(ss.transform(X_val), columns=ss.get_feature_names_out())

### Modeling & evaluation!

In [None]:
#create your baseline model first!

In [30]:
lr = LinearRegression()

lr.fit(Xs_train, y_train)

cross_val_score(lr, Xs_train, y_train).mean()

#score it twice!

lr.score(Xs_train, y_train), lr.score(Xs_val, y_val)

mean_squared_error(y_train, lr.predict(Xs_train))**0.5

mean_squared_error(y_val, lr.predict(Xs_val))**0.5

### How to combat overfitting
1. More data (expensive, not always in our control)
2. Feature selection (reduce complexity)
3. Regularization

### Ridge adjustment

In [36]:
ridge = RidgeCV(alphas = np.logspace(0,5,100))

#fit to scaled training data
ridge.fit(Xs_train, y_train)

ridge.alpha_

#score it twice!
ridge.score(Xs_train, y_train)

ridge.score(Xs_val,y_val)

mean_squared_error(y_train, ridge.predict(Xs_train))**0.5

mean_squared_error(y_val, ridge.predict(Xs_val))**0.5

### Lasso adjustment

In [43]:
import warnings

#instantiate it!
lasso = LassoCV(alphas = np.arange(0.001, 10, 1))

#fit it!
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lasso.fit(Xs_train, y_train)

#what's our best alpha?
lasso.alpha_

#score it twice!
lasso.score(Xs_train, y_train)

lasso.score(Xs_val, y_val)

mean_squared_error(y_train, lasso.predict(Xs_train))**0.5

mean_squared_error(y_val, lasso.predict(Xs_val))**0.5

In [49]:
#take the time to look at coefficients, see which ones were zeroed out
used_coeff = [(round(coeff,2), Xs_train.columns[index]) for index, coeff in enumerate(lasso.coef_) if coeff>0]
print(len(used_coeff))
used_coeff

125


[(8474.46, 'lot_area'),
 (62525.76, 'overall_qual'),
 (9160.11, 'year_built'),
 (7546.91, '1st_flr_sf'),
 (3535.17, 'gr_liv_area'),
 (8535.39, 'bedroom_abvgr'),
 (1244.22, 'wood_deck_sf'),
 (237.22, '3ssn_porch'),
 (5984.8, 'ms_subclass lot_area'),
 (3084.91, 'ms_subclass full_bath'),
 (4226.66, 'ms_subclass half_bath'),
 (2537.63, 'ms_subclass bedroom_abvgr'),
 (11356.78, 'ms_subclass kitchen_abvgr'),
 (439.1, 'ms_subclass fireplaces'),
 (344.16, 'ms_subclass open_porch_sf'),
 (145.19, 'ms_subclass screen_porch'),
 (12147.13, 'lot_area overall_qual'),
 (23605.16, 'lot_area overall_cond'),
 (24383.9, 'lot_area totrms_abvgrd'),
 (5983.12, 'lot_area open_porch_sf'),
 (4350.57, 'lot_area screen_porch'),
 (27163.24, 'overall_qual year_built'),
 (93365.85, 'overall_qual 1st_flr_sf'),
 (41530.85, 'overall_qual 2nd_flr_sf'),
 (3588.51, 'overall_qual gr_liv_area'),
 (6151.99, 'overall_qual full_bath'),
 (32144.64, 'overall_qual totrms_abvgrd'),
 (9044.4, 'overall_qual fireplaces'),
 (0.36, 'ov

### at last: prediction time on test set

In [63]:
ames_test.head(1)

#stash id column
test_id = ames_test['id']

#feature engineering simulation
ames_test = poly.transform(ames_test.drop(columns='id'))

#standard scale and recreate dataframe
Xs_test = pd.DataFrame(ss.transform(ames_test), columns = poly.get_feature_names_out())

Xs_test

lasso_sub = Lasso(alpha = lasso.alpha_)

lasso_sub.fit(Xs_train, y_train)

In [70]:
#make predictions
preds = lasso_sub.predict(Xs_test)

preds.shape

preds = pd.DataFrame(preds, columns = ['SalePrice'])

preds.insert(loc=0, column='id', value = test_id)

preds

#save submission csv and drop index
preds.to_csv('./lasso_1_simple.csv', index = False)