# Week 7 Notes

In [35]:
import pandas as pd
import numpy as np
import plotnine as plt
from plotnine import *
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import warnings
from sklearn.model_selection import cross_val_score

warnings.simplefilter(action='ignore', category=FutureWarning)

In [28]:
ames = pd.read_csv("C:/Users/hblin/Downloads/AmesHousing.csv")
X = ames.drop("SalePrice", axis=1)
y = ames["SalePrice"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 123)


In [None]:
# Size and number of rooms
ct_dummies = ColumnTransformer(
  [("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
  remainder = "drop"
)

lr_pipeline = Pipeline(
  [("preprocessing", ct_dummies),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#print(f"RMSE: {rmse}")

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean RMSE
mean_rmse = -scores.mean()  # Take the negative to convert back to positive RMSE
print(f"Mean RMSE: {mean_rmse}")

RMSE: 50591.3232703246
Mean RMSE: 55806.32634926364


In [41]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
  ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
  remainder = "drop"
)

lr_pipeline = Pipeline(
  [("preprocessing", ct_dummies),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean RMSE
mean_rmse = -scores.mean()  # Take the negative to convert back to positive RMSE
print(f"Mean RMSE: {mean_rmse}")

RMSE: 49047.62094866008
Mean RMSE: 54153.20862794976


In [None]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
  ("standardize", StandardScaler(), ["Gr Liv Area"])],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct_dummies),
   ("interaction", PolynomialFeatures(interaction_only = True)),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#print(f"RMSE: {rmse}")

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean RMSE
mean_rmse = -scores.mean()  # Take the negative to convert back to positive RMSE
print(f"Mean RMSE: {mean_rmse}")

RMSE: 48427.65178552182


In [42]:

preprocessing = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_size", Pipeline([
            ("standardize", StandardScaler()), 
            ("poly", PolynomialFeatures(degree=5, include_bias=False))
        ]), ["Gr Liv Area"]),
        ("poly_rooms", Pipeline([
            ("standardize", StandardScaler()), 
            ("poly", PolynomialFeatures(degree=5, include_bias=False))
        ]), ["TotRms AbvGrd"])
    ],
    remainder="drop"
).set_output(transform="pandas")

lr_pipeline = Pipeline(
  [("preprocessing", preprocessing),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


lr_pipeline.fit(X_train, y_train)
y_pred = lr_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#print(f"RMSE: {rmse}")

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean RMSE
mean_rmse = -scores.mean()  # Take the negative to convert back to positive RMSE
print(f"Mean RMSE: {mean_rmse}")

Mean RMSE: 55176.96594338035


According to these models, model 3 seems to have performed the best.

With the cross validation, the RMSE's also agree that model 3 has performed the best. 

In [33]:
from sklearn.model_selection import cross_val_score
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores.mean()

np.float64(0.5331485871994233)