In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score



In [6]:
from google.colab import files
uploaded = files.upload()


Saving AmesHousing.csv to AmesHousing.csv


In [8]:
df = pd.read_csv("AmesHousing.csv")
df.head()


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [18]:
X = df[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y = df["SalePrice"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)


In [21]:
# MODEL 1
numerical_features_m1 = ["Gr Liv Area", "TotRms AbvGrd"]
preprocessor_m1 = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features_m1)],
    remainder='drop')

m1 = make_pipeline(preprocessor_m1, LinearRegression())
m1.fit(X_train, y_train)
y1_pred = m1.predict(X_test)
rmse1 = np.sqrt(mean_squared_error(y_test, y1_pred))


# MODEL 2
numerical_features = ["Gr Liv Area", "TotRms AbvGrd"]
categorical_features = ["Bldg Type"]

preprocessor_m2 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)])

m2 = make_pipeline(preprocessor_m2, LinearRegression())
m2.fit(X_train, y_train)
y2_pred = m2.predict(X_test)
rmse2 = np.sqrt(mean_squared_error(y_test, y2_pred))

# MODEL 3 (chat helped)
def add_interaction(X):
    X = X.copy()
    X["Interaction"] = X["Gr Liv Area"] * pd.get_dummies(X["Bldg Type"], drop_first=True).iloc[:, 0]
    return X

interaction = FunctionTransformer(add_interaction)
numerical_features = ["Gr Liv Area"]
categorical_features = ["Bldg Type"]

preprocessor_m3 = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)])

m3 = make_pipeline(interaction, preprocessor_m3, LinearRegression())
m3.fit(X_train, y_train)
y3_pred = m3.predict(X_test)
rmse3 = np.sqrt(mean_squared_error(y_test, y3_pred))

# MODEL 4 (chat helped)
numerical_features = ["Gr Liv Area", "TotRms AbvGrd"]
categorical_features = ["Bldg Type"]

preprocessor_m4 = ColumnTransformer(transformers=[
    ('poly', PolynomialFeatures(degree=5, include_bias=False), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)])

m4 = make_pipeline(preprocessor_m4, StandardScaler(with_mean=False), LinearRegression())
m4.fit(X_train, y_train)
y4_pred = m4.predict(X_test)
rmse4 = np.sqrt(mean_squared_error(y_test, y4_pred))


print("Model 1 RMSE:", round(rmse1, 2))
print("Model 2 RMSE:", round(rmse2, 2))
print("Model 3 RMSE:", round(rmse3, 2))
print("Model 4 RMSE:", round(rmse4, 2))

Model 1 RMSE: 54067.23
Model 2 RMSE: 52243.14
Model 3 RMSE: 52689.83
Model 4 RMSE: 52043.91


Model 4 performed the best with lowest RMSE

In [24]:
models = {
    "Model 1": m1,
    "Model 2": m2,
    "Model 3": m3,
    "Model 4": m4
}

rmse_cv = {}

for name, model in models.items():
    cv_rmse = -cross_val_score(
        model,
        X,
        y,
        cv=5,
        scoring="neg_root_mean_squared_error"
    )
    rmse_cv[name] = cv_rmse.mean()

for name, score in rmse_cv.items():
    print(f"{name}: {round(score, 2)}")


Model 1: 55806.33
Model 2: 54168.08
Model 3: 54344.55
Model 4: 70854.54


Model 4 did well on one test split but poorly in cross validation.
This means we probably overfit the training data and does not generalize well to new data.

In [25]:
pre = ColumnTransformer([
    ("size",  PolynomialFeatures(include_bias=False), ["Gr Liv Area"]),
    ("rooms", PolynomialFeatures(include_bias=False), ["TotRms AbvGrd"]),
    ("cat",   OneHotEncoder(handle_unknown="ignore", drop="first"), ["Bldg Type"])
])

pipe = Pipeline([
    ("pre", pre),
    ("scale", StandardScaler(with_mean=False)),
    ("lr", LinearRegression())
])

param_grid = {
    "pre__size__degree":  np.arange(1, 11),
    "pre__rooms__degree": np.arange(1, 11)
}

gscv = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)

gscv.fit(X, y)

best_deg_size  = gscv.best_params_["pre__size__degree"]
best_deg_rooms = gscv.best_params_["pre__rooms__degree"]
best_rmse = -gscv.best_score_

print(best_deg_size, best_deg_rooms)
print(round(best_rmse, 2))


3 1
52781.98
