In [110]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

In [119]:
ames = pd.read_csv(r"C:\Users\tuke-\Desktop\1_GSB544_Computing_and_Machine_Learning\Week_7\data\AmesHousing.csv")

X = ames.drop('SalePrice', axis = 1)
y = ames['SalePrice']

X_test, X_train, y_test, y_train = train_test_split(X,y)

lr = LinearRegression()
enc = OneHotEncoder(sparse_output = False)


In [120]:
ct = ColumnTransformer(
    [("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
      remainder = "drop"
)

lr_pipeline = Pipeline(
    [("preprocessing", ct),
     ("linear_regression", lr)]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)

y_pred = lr_fitted.predict(X_test)

MSE1 = mean_squared_error(y_test, y_pred)

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
R2_1 = scores.mean()

In [121]:
ct = ColumnTransformer(
    [("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ("dummies", enc, ["Bldg Type"])],
      remainder = "drop"
)
lr_pipeline = Pipeline(
    [("preprocessing", ct),
     ("linear_regression", lr)]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)

y_pred = lr_fitted.predict(X_test)

MSE2 = mean_squared_error(y_test, y_pred)

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
R2_2 = scores.mean()

In [122]:
ct_dummies = ColumnTransformer(
    [("standardize", StandardScaler(), ["Gr Liv Area"]),
    ("dummies", enc, ["Bldg Type"])],
      remainder = "drop"
).set_output(transform = "pandas")

#transformed_data = ct_dummies.fit_transform(X_train)

#print(transformed_data)

ct_interact = ColumnTransformer(
    [("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummies__Bldg Type_1Fam"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummies__Bldg Type_2fmCon"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummies__Bldg Type_Duplex"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummies__Bldg Type_Twnhs"]),
    ("interaction5", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummies__Bldg Type_TwnhsE"])],
    remainder = "passthrough"
).set_output(transform = "pandas")


lr_pipeline = Pipeline(
    [("preprocessing1", ct_dummies),
     ("preprocessing2", ct_interact),
     ("linear_regression", lr)]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)

y_pred = lr_fitted.predict(X_test)

MSE3 = mean_squared_error(y_test, y_pred)

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
R2_3 = scores.mean()

In [123]:
degree = 5
poly = PolynomialFeatures((1,degree))

ct_1 = ColumnTransformer(
    [('poly1', poly, ['Gr Liv Area']),
    ('poly2', poly, ['TotRms AbvGrd']),
    ("dummies", enc, ["Bldg Type"])], 
    remainder = 'drop'
).set_output(transform = "pandas")

ct_2 = ColumnTransformer(
    [("standardize", StandardScaler(), ["poly1__Gr Liv Area","poly1__Gr Liv Area^2","poly1__Gr Liv Area^3","poly1__Gr Liv Area^4","poly1__Gr Liv Area^5", 
                                        "poly2__TotRms AbvGrd","poly2__TotRms AbvGrd^2","poly2__TotRms AbvGrd^3","poly2__TotRms AbvGrd^4","poly2__TotRms AbvGrd^5"])],
      remainder = "passthrough"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
    [("preprocessing1", ct_1),
     ("preprocessing2", ct_2),
     ("linear_regression", lr)]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)

y_pred = lr_fitted.predict(X_test)

MSE4 = mean_squared_error(y_test, y_pred)

scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')
R2_4 = scores.mean()

In [130]:
df = pd.DataFrame({'MSE': [MSE1, MSE2, MSE3, MSE4],
                   'R2': [R2_1, R2_2, R2_3, R2_4]},
                  index=['Model_1','Model_2','Model_3','Model_4'])
print(df)
print("\n")
print('Minimum MSE is:')
print(df['MSE'].idxmin())
print('Maximum R2 is:')
print(df['R2'].idxmax())

                  MSE        R2
Model_1  3.245446e+09  0.504209
Model_2  3.039015e+09  0.532882
Model_3  2.963748e+09  0.544867
Model_4  2.654752e+10  0.510664


Minimum MSE is:
Model_3
Maximum R2 is:
Model_3


Minimum MSE and Maximum R2 is for model 3 Both support the conclusion that model 3 is the best.

In [138]:
#degree = 5
poly = PolynomialFeatures()

ct_1 = ColumnTransformer(
    [('poly1', poly, ['Gr Liv Area']),
    ('poly2', poly, ['TotRms AbvGrd']),
    ("dummies", enc, ["Bldg Type"])], 
    remainder = 'drop'
).set_output(transform = "pandas")



lr_pipeline = Pipeline(
    [("preprocessing", ct_1),
     ("linear_regression", lr)]
)

lr_fitted = lr_pipeline.fit(X_train, y_train)

y_pred = lr_fitted.predict(X_test)

MSE4 = mean_squared_error(y_test, y_pred)

degrees = {'preprocessing__poly1__degree': np.arange(1, 11), 'preprocessing__poly2__degree': np.arange(1, 11) }

gscv = GridSearchCV(lr_pipeline, degrees, cv = 5, scoring='r2')

In [155]:
gscv_fitted = gscv.fit(X, y)

In [159]:
scores = gscv_fitted.cv_results_['mean_test_score']

reshape_scores = scores.reshape(-1,10)

df = pd.DataFrame(reshape_scores)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.532882,0.532383,0.535924,0.541529,0.541066,0.534862,0.080069,-1.090287,0.270155,-184.222187
1,0.537472,0.533567,0.534134,0.535418,0.530267,0.533314,0.352499,-0.177038,0.491276,-189.473644
2,0.557641,0.556857,0.554039,0.550392,0.546549,0.545171,0.4484,0.429703,-1.044729,-395.936023
3,0.549495,0.550068,0.550375,0.556855,0.556531,0.553164,0.554033,0.278806,0.039072,-0.110528
4,0.45186,0.45186,0.505231,0.496658,0.492695,0.522404,0.517853,0.435842,0.41928,0.3075
5,0.333837,0.333837,0.333837,0.333837,0.333837,0.486497,0.491897,0.311996,0.258417,0.054021
6,0.029322,0.029322,0.029322,0.029322,0.029322,0.029322,0.029322,0.029322,0.363047,0.400107
7,-0.968096,-0.968096,-0.968096,-0.968096,-0.968096,-0.968096,-0.968096,-0.968096,-0.968096,-0.968096
8,-4.545599,-4.545599,-4.545599,-4.545599,-4.545598,-4.545598,-4.545598,-4.545598,-4.545598,-4.545598
9,-16.18786,-16.18786,-16.18786,-16.187879,-16.187879,-16.187879,-16.187879,-16.187879,-16.18793,-16.18793


In [160]:
print(df.max())

print(df.max().idxmax())

0    0.557641
1    0.556857
2    0.554039
3    0.556855
4    0.556531
5    0.553164
6    0.554033
7    0.435842
8    0.491276
9    0.400107
dtype: float64
0


Q1: The best model is when we don't use polynomial on square footage and use the 3rd polynomial for the number of rooms.

Q2: There is a lot of models that cause overfitting and unrealistic r2 values. This means that we are doing a decent amount of extra processing as well as complicating the model and possibly causing user errors. It would be smart to form a hypothesis to find a reasonable starting point that could allow for smaller models to be used first.