In [48]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [25]:
# Read the data
ames = pd.read_csv("/Users/jakeaaknes/Downloads/AmesHousing (1).csv")

# Get rid of columns with mostly NaN values
good_cols = ames.isna().sum() < 100
ames = ames.loc[:,good_cols]

# Drop other NAs
ames = ames.dropna()
ames

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,120,0,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,0,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,7937,Pave,IR1,Lvl,AllPub,CulDSac,...,0,0,0,0,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,8885,Pave,IR1,Low,AllPub,Inside,...,0,0,0,0,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,10441,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,10010,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,4,2006,WD,Normal,170000


Consider four possible models for predicting house prices:

- Using only the size and number of rooms.
- Using size, number of rooms, and building type.
- Using size and building type, and their interaction.
- Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

**Model 1: Size and Number of Rooms**

In [83]:
# A model only using size and number of rooms
lr = LinearRegression()

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.25)

# Transforming columns
ct = ColumnTransformer(
    [('standardize', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])],
    remainder = 'drop'
)

# Creating a pipeline
lr_pipeline_1 = Pipeline(
[
    ('preprocessing', ct),
    ('linear_regression', LinearRegression())]
)

# Testing coefficients
lr_fitted = lr_pipeline_1.fit(X_train, y_train)
lr_fitted.named_steps['linear_regression'].coef_

# Getting predictions and rmse
y_pred_1 = lr_fitted.predict(X_test)
MSE_model_1 = mean_squared_error(y_test, y_pred_1,squared = False)
print('MSE - Size and # of Rooms:', round(MSE_model_1, 2))

MSE - Size and # of Rooms: 51770.58


**Model 2: Size, Number of Rooms, and Building Type**

In [84]:
# Transforming columns
ct = ColumnTransformer(
    [('dummify', OneHotEncoder(sparse_output = False), ['Bldg Type']),
     ('standardize', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])],
    remainder = 'drop'
)

# Creating a pipeline
lr_pipeline_2 = Pipeline(
[
    ('preprocessing', ct),
    ('linear_regression', LinearRegression())]
)

# Testing coefficients
lr_fitted_2 = lr_pipeline_2.fit(X_train, y_train)
lr_fitted_2.named_steps['linear_regression'].coef_

# Getting predictions and rmse
y_pred_2 = lr_fitted_2.predict(X_test)
MSE_model_2 = mean_squared_error(y_test, y_pred_2,squared = False)
print('MSE - Size, # of Rooms, and Building Type:', round(MSE_model_2, 2))

MSE - Size, # of Rooms, and Building Type: 50409.44


**Model 3: Size, Building Type, and their interaction**

In [85]:
# Dummify variables
ct_dummies = ColumnTransformer(
    [('dummify', OneHotEncoder(sparse_output=False), ['Bldg Type'])],
    remainder = 'passthrough'
).set_output(transform = 'pandas')

# Transforming columns
ct_inter = ColumnTransformer(
    [
     ('interaction', PolynomialFeatures(interaction_only = True), ['remainder__Gr Liv Area', 'dummify__Bldg Type_1Fam']),
    ],
    remainder = 'drop'
).set_output(transform = 'pandas')

# Creating a pipeline
lr_pipeline_3 = Pipeline(
[
    ('preprocessing', ct_dummies),
    ('interaction', ct_inter),
    ('linear_regression', LinearRegression())]
)

# Testing coefficients
lr_fitted_3 = lr_pipeline_3.fit(X_train, y_train)
lr_fitted_3.named_steps['linear_regression'].coef_

# Getting predictions and rmse
y_pred_3 = lr_fitted_3.predict(X_test)
MSE_model_3 = mean_squared_error(y_test, y_pred_3,squared = False)
print('MSE - Size, Building Type, and Interaction:', round(MSE_model_3, 2))

MSE - Size, Building Type, and Interaction: 51955.82


**Model 4: 5-degree polynomial on size, 5-degree polynomial on number of rooms, and also building type**

In [97]:
# Transforming columns
ct_poly = ColumnTransformer(
    [('dummify', OneHotEncoder(sparse_output=False), ['Bldg Type']),
     ('polynomial', PolynomialFeatures(5), ['Gr Liv Area', 'TotRms AbvGrd'])
    ],
    remainder = 'drop'
)

# Creating a pipeline
lr_pipeline_4 = Pipeline(
[
    ('preprocessing', ct_poly),
    ('linear_regression', LinearRegression())]
)

# Testing coefficients
lr_fitted_4 = lr_pipeline_4.fit(X_train, y_train)
lr_fitted_4.named_steps['linear_regression'].coef_

# Getting predictions and rmse
y_pred_4 = lr_fitted_4.predict(X_test)
MSE_model_4 = mean_squared_error(y_test, y_pred_4, squared = False)

print('MSE Model 4:', MSE_model_4)

MSE Model 4: 51028.157763009396


In [98]:
print("MSE Model 1: ", MSE_model_1)
print("MSE Model 2: ", MSE_model_2)
print("MSE Model 3: ", MSE_model_3)
print("MSE Model 4: ", MSE_model_4)

MSE Model 1:  51770.57973431293
MSE Model 2:  50409.44308485344
MSE Model 3:  51955.81943659089
MSE Model 4:  51028.157763009396


**It appears that model 2 has performed the best, as it produced the smallest RMSE out of all models.**

**Part 2: Using Cross Validation**

Once again consider four modeling options for house price:

- Using only the size and number of rooms.
- Using size, number of rooms, and building type.
- Using size and building type, and their interaction.
- Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [99]:
# RMSE for all models using cross validation

RMSE_1 = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='neg_root_mean_squared_error')
RMSE_2 = cross_val_score(lr_pipeline_2, X, y, cv=5, scoring='neg_root_mean_squared_error')
RMSE_3 = cross_val_score(lr_pipeline_3, X, y, cv=5, scoring='neg_root_mean_squared_error')
RMSE_4 = cross_val_score(lr_pipeline_4, X, y, cv=5, scoring='neg_root_mean_squared_error')

print("RMSE Model 1:", RMSE_1.mean())
print("RMSE Model 2:", RMSE_2.mean())
print("RMSE Model 3:", RMSE_3.mean())
print("RMSE Model 4:", RMSE_4.mean())

RMSE Model 1: -55483.71795870432
RMSE Model 2: -54084.431949656384
RMSE Model 3: -55534.55100524614
RMSE Model 4: -58320.088859773554


**Yes, this agrees with my conclusion from earlier. Model 2 has performed the best.**

**Last Part: One hundred modeling options**

In [103]:
# Transforming columns
ct_poly = ColumnTransformer(
    [('dummify', OneHotEncoder(sparse_output=False), ['Bldg Type']),
     ('polynomial', PolynomialFeatures(), ['Gr Liv Area', 'TotRms AbvGrd'])
    ],
    remainder = 'drop'
).set_output(transform = 'pandas')

# Creating a pipeline
lr_pipeline_5 = Pipeline(
[
    ('preprocessing', ct_poly),
    ('linear_regression', LinearRegression())]
).set_output(transform='pandas')

degrees = {'preprocessing__polynomial__degree': np.arange(1,10)}

# Testing coefficients
lr_fitted_5 = lr_pipeline_5.fit(X_train, y_train)
lr_fitted_5.named_steps['linear_regression'].coef_

# Getting predictions and rmse
y_pred_5 = lr_fitted_5.predict(X_test)

gscv = GridSearchCV(lr_pipeline_5, degrees, cv = 100, scoring='r2')

gscv_fitted = gscv.fit(X_test, y_test)

pd.DataFrame(data = {"degrees": np.arange(1, 10), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,0.191014
1,2,0.201062
2,3,0.180803
3,4,-0.088129
4,5,-0.210704
5,6,-0.223711
6,7,-0.27418
7,8,-0.292981
8,9,-0.322246


The model that performed the best was with all variables and was a degree 2 polynomial. The corresponding R^2 value was 0.201, which isn't great. That indicates that only 20% of the variation in Sales Price can be explained by the model.

This isn't the most optimal way to go about testing models. It can overfit the data or produce a well fit model by chance alone. 