# Activity 7.1
### Kaden Buckley

In [101]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from plotnine import *

In [21]:
#Load ames data
ames = pd.read_csv("/Users/kadenbuckley/Desktop/MSBA/Fall_Quarter/GSB_544/Data/AmesHousing.csv")

#Rename columns
ames_clean = ames.rename(columns = {'Gr Liv Area': 'Size',
                                    'TotRms AbvGrd': 'Num_Rooms',
                                    'Bldg Type': 'Building_Type'
                                   })

X = ames_clean.drop("SalePrice", axis =1)
y = ames_clean["SalePrice"]

#Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Part 1: Pipelines

Consider four possible models for predicting house prices:

- 1.) Using only the size and number of rooms.
- 2.) Using size, number of rooms, and building type.
- 3.) Using size and building type, and their interaction.
- 4.) Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

## Model 1: Size & Number of Rooms

In [95]:
#Model 1 Column Transformer
ct_1 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), []),
    ("standardize", StandardScaler(), ["Size", "Num_Rooms"])
  ],
  remainder = "drop"
)

In [96]:
#Model 1 Pipeline
lr_pipeline_1 = Pipeline(
  [("preprocessing", ct_1),
  ("linear_regression", LinearRegression())
   ]
).set_output(transform="pandas")

#Fit linear pipeline
lr_pipeline_1.fit(X_train, y_train)

#Get predicitions
model_1_predictions = lr_pipeline_1.predict(X_test)

model_1_rmse = np.sqrt(mean_squared_error(y_test, model_1_predictions))

print(f"RMSE of Model 1: {model_1_rmse}")

RMSE of Model 1: 53527.75931385835


## Model 2: Size, Number of Rooms, & Building Type

In [42]:
#Model 2 Column Transformer
ct_2 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Building_Type"]),
    ("standardize", StandardScaler(), ["Size", "Num_Rooms"])
  ],
  remainder = "drop"
)

In [43]:
#Model 2 Pipeline
lr_pipeline_2 = Pipeline(
  [("preprocessing", ct_2),
  ("linear_regression", LinearRegression())
   ]
).set_output(transform="pandas")

#Fit linear pipeline
lr_pipeline_2.fit(X_train, y_train)

#Get predicitions
model_2_predictions = lr_pipeline_2.predict(X_test)

model_2_rmse = np.sqrt(mean_squared_error(y_test, model_2_predictions))

print(f"RMSE of Model 2: {model_2_rmse}")

RMSE of Model 2: 51309.2585360023


## Model 3: Size, Building Type, & Size * Building Type

In [64]:
#Model 3 Column Transformer
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building_Type"]),
   ("standardize", StandardScaler(), ["Size"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified.all()

#Model 3 Interaction Column Transformer
ct_interaction = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__Num_Rooms", "dummify__Building_Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

In [65]:
lr_pipeline_3 = Pipeline(
  [("preprocessing", ct_dummies),
   ("interaction", ct_interaction),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipeline_3.fit(X_train, y_train)

#Get predicitions
model_3_predictions = lr_pipeline_3.predict(X_test)

model_3_rmse = np.sqrt(mean_squared_error(y_test, model_3_predictions))

print(f"RMSE of Model 3: {model_3_rmse}")

RMSE of Model 3: 62572.88242944864


## Model 4: 5-degree polynomial on size, a 5-degree polynomial on number of rooms, & building type.

In [79]:
#Model 4 Column Transformer
ct_poly = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building_Type"]),
   ("polynomial transformer", PolynomialFeatures(degree = 5), ["Size", "Num_Rooms"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

In [80]:
#Model 4 Pipeline
lr_pipeline_4 = Pipeline(
    [
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression())
    ]
).set_output(transform = "pandas")

lr_pipeline_4.fit(X_train, y_train)

#Get predicitions
model_4_predictions = lr_pipeline_4.predict(X_test)

model_4_rmse = np.sqrt(mean_squared_error(y_test, model_4_predictions))

print(f"RMSE of Model 4: {model_4_rmse}")

RMSE of Model 4: 53604.84193219687


## Model Comparison

In [93]:
#Create Results Dataframe
model_results = pd.DataFrame({'Model': ["Model 1", "Model_2", "Model_3", "Model_4"], 'RMSE': [model_1_rmse, model_2_rmse, model_3_rmse, model_4_rmse]})

model_results = model_results.sort_values(by='RMSE', ascending=True)
print(model_results)

     Model          RMSE
1  Model_2  51309.258536
0  Model 1  53527.759314
3  Model_4  53604.841932
2  Model_3  62572.882429


The sorted dataframe above contains each of the four model's RMSE values. Model 2, using Size, Number of Rooms, & Building Type, performed the best, as it has the lowest RMSE value.

# Part 2: Cross-Validation

Once again consider four modeling options for house price:

- 1.) Using only the size and number of rooms.

- 2.) Using size, number of rooms, and building type.

- 3.) Using size and building type, and their interaction.

- 4.) Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [100]:
#Model 1
scores_1 = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='neg_mean_squared_error')
scores_1_squared = np.sqrt(-scores_1)
rmse_1 = scores_1_squared.mean()
rmse_1

#Model 2
scores_2 = cross_val_score(lr_pipeline_2, X, y, cv=5, scoring='neg_mean_squared_error')
scores_2_squared = np.sqrt(-scores_2)
rmse_2 = scores_2_squared.mean()
rmse_2

#Model 3
scores_3 = cross_val_score(lr_pipeline_3, X, y, cv=5, scoring='neg_mean_squared_error')
scores_3_squared = np.sqrt(-scores_3)
rmse_3 = scores_3_squared.mean()
rmse_3

#Model 4
scores_4 = cross_val_score(lr_pipeline_4, X, y, cv=5, scoring='neg_mean_squared_error')
scores_4_squared = np.sqrt(-scores_4)
rmse_4 = scores_4_squared.mean()
rmse_4

#Create CV Results Dataframe
model_cv_results = pd.DataFrame({'Model': ["Model 1", "Model_2", "Model_3", "Model_4"], 'CV RMSE': [rmse_1, rmse_2, rmse_3, rmse_4]})

model_cv_results = model_cv_results.sort_values(by='CV RMSE', ascending=True)
print(model_cv_results)

     Model       CV RMSE
1  Model_2  54156.048470
0  Model 1  55806.326349
3  Model_4  61197.680682
2  Model_3  65218.895416


The sorted dataframe above contains each of the four model's RMSE values found by using the cross_val_score method. Model 2, still performed the best, as it has the lowest RMSE value.

# Part 3: GridSearchCV

Consider one hundred modeling options for house price:

- House size, trying degrees 1 through 10
- Number of rooms, trying degrees 1 through 10
- Building Type
- Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

**Q1:** Which model performed the best?

**Q2:** What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [145]:
#Column Transformer
ct_poly = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Building_Type"]),
   ("polynomial_size", PolynomialFeatures(), ["Size"]),
   ("polynomial_rooms", PolynomialFeatures(), ["Num_Rooms"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

#Pipeline
lr_pipeline_poly = Pipeline(
    [
    ("preprocessing", ct_poly),
    ("standardize", StandardScaler()),
    ("linear_regression", LinearRegression())]
).set_output(transform = "pandas")

degrees = {'preprocessing__polynomial_size__degree': np.arange(1, 11),
          'preprocessing__polynomial_rooms__degree': np.arange(1, 11)
          }

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring = 'r2')

#Fit grid search tuning procedure to the data
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_['mean_test_score']

#Extracting parameter settings and mean scores
mean_scores = gscv_fitted.cv_results_['mean_test_score']
size_degree = [params['preprocessing__polynomial_size__degree'] for params in gscv_fitted.cv_results_['params']]
rooms_degree = [params['preprocessing__polynomial_rooms__degree'] for params in gscv_fitted.cv_results_['params']]

#Creating a DataFrame with the results
results_df = pd.DataFrame(data={
    "size_degrees": size_degree,
    "rooms_degrees": rooms_degree,
    "scores": mean_scores
})

results_df = results_df.sort_values(by = "scores", ascending = False)
results_df

Unnamed: 0,size_degrees,rooms_degrees,scores
2,3,1,0.557875
12,3,2,0.557099
33,4,4,0.557097
43,4,5,0.555479
22,3,3,0.554256
...,...,...,...
39,10,4,-567.299824
59,10,6,-572.953090
49,10,5,-584.037138
89,10,9,-587.625230


- **Q1:** According to the results, the model with size degrees 3 and rooms degree 1 performed the best. The r^2 score is 0.557875
- **Q2:** Trying all possible model options creates tests a large quantity of models, and therefore the computer needs to work harder to obtain results. This results in up longer processing times than testing fewer models. We could limit the number of degrees, and overall quanitity of models, to speed up processing time.