In [17]:
import sklearn
import pandas as pd
import numpy as np

In [85]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score

In [19]:
from plotnine import ggplot, aes, geom_point, geom_line, geom_histogram, geom_bar, labs

In [20]:
# Load the Ames housing data from the working directory and preview
# the first three rows to sanity-check the columns.
housing = pd.read_csv('AmesHousing.csv')
housing.head(n=3)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000


In [37]:
# Map each raw Ames column to the short name used in the rest of the notebook.
# An explicit old -> new mapping keeps the relabeling correct even if the
# selection order changes (positional `df.columns = [...]` would silently
# mislabel columns), and `rename` returns an independent copy of the slice.
column_names = {
    'SalePrice': 'Price',
    'Lot Area': 'Size',
    'TotRms AbvGrd': 'Rooms',
    'Bldg Type': 'Type',
}
variables = list(column_names)
df = housing[variables].rename(columns=column_names)
df.head(3)

Unnamed: 0,Price,Size,Rooms,Type
0,215000,31770,7,1Fam
1,105000,11622,5,1Fam
2,172000,14267,6,1Fam


# Part 1

In [108]:
def _all_columns(frame):
    """Return every column label of the pandas frame given to a ColumnTransformer."""
    return list(frame.columns)


# Model 1: Size and Rooms only, passed through unchanged.
ct1 = ColumnTransformer(
    [('passed', 'passthrough', ['Size', 'Rooms'])],
    remainder='drop'
).set_output(transform="pandas")

# Model 2: Size and Rooms plus one-hot encoded building Type.
# handle_unknown="ignore" encodes a Type level that was absent from the
# training rows as all zeros instead of raising at predict time (this can
# happen with rare levels in a cross-validation fold).
ct2 = ColumnTransformer(
    [('passed', 'passthrough', ['Size', 'Rooms']),
     ("dummify", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ['Type'])],
    remainder='drop'
).set_output(transform="pandas")

# Model 3, stage A: keep Size and one-hot encode Type; everything else is dropped.
ct3A = ColumnTransformer(
    [('passed', 'passthrough', ['Size']),
     ("dummify", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ['Type'])],
    remainder="drop"
).set_output(transform="pandas")

# Model 3, stage B: pairwise interaction terms over every column produced by
# stage A. Selecting the columns with a callable (rather than a hard-coded
# list of dummy names) keeps this step working when a Type level is missing
# from the rows the pipeline is fitted on.
ct3B = ColumnTransformer(
    [("interaction", PolynomialFeatures(interaction_only=True), _all_columns)]
).set_output(transform="pandas")

# Model 4: degree-5 polynomial in Size and Rooms plus one-hot encoded Type.
ct4 = ColumnTransformer(
    [('poly', PolynomialFeatures(degree=5, include_bias=False), ['Size', 'Rooms']),
     ("dummify", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), ['Type'])]
).set_output(transform="pandas")

# Each pipeline feeds its preprocessing into an ordinary least-squares fit.
pipeline1 = Pipeline(
    [("preprocessing", ct1),
     ("linear_regression", LinearRegression())]
)

pipeline2 = Pipeline(
    [("preprocessing", ct2),
     ("linear_regression", LinearRegression())]
)

pipeline3 = Pipeline(
    [("dummy", ct3A),
     ("interaction", ct3B),
     ("linear_regression", LinearRegression())]
)

pipeline4 = Pipeline(
    [("preprocessing", ct4),
     ("linear_regression", LinearRegression())]
)

In [109]:
def holdout_rmse(pipeline, X_train, X_test, y_train, y_test):
    """Fit `pipeline` on the training split and score it on the test split.

    Returns a tuple `(fitted_pipeline, test_predictions, test_rmse)`, where
    the RMSE is the square root of the mean squared error on the test split.
    """
    fitted = pipeline.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return fitted, predictions, rmse


y = df['Price']
X = df[['Size', 'Rooms', 'Type']]

# A fixed random_state makes the split -- and therefore the RMSE values that
# the surrounding text discusses -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#Model 1
pipeline1_fitted, y_pred1, rmse1 = holdout_rmse(pipeline1, X_train, X_test, y_train, y_test)

#Model 2
pipeline2_fitted, y_pred2, rmse2 = holdout_rmse(pipeline2, X_train, X_test, y_train, y_test)

#Model 3
pipeline3_fitted, y_pred3, rmse3 = holdout_rmse(pipeline3, X_train, X_test, y_train, y_test)

#Model 4
pipeline4_fitted, y_pred4, rmse4 = holdout_rmse(pipeline4, X_train, X_test, y_train, y_test)

(rmse1, rmse2, rmse3, rmse4)

(67420.49210793439, 62429.41648246205, 74563.95610639901, 116779.1280460617)

Model 2 (size, number of rooms, and building type) had the lowest test RMSE, about 62,429 in the output shown above.

# Part 2

In [110]:
# 5-fold cross-validated R^2 for each of the four candidate pipelines,
# evaluated in order on the full data set.
cv_settings = dict(cv = 5, scoring = 'r2')
scores1, scores2, scores3, scores4 = (
    cross_val_score(candidate, X, y, **cv_settings)
    for candidate in (pipeline1, pipeline2, pipeline3, pipeline4)
)

In [111]:
# Mean R^2 across folds, one entry per model.
tuple(fold_scores.mean() for fold_scores in (scores1, scores2, scores3, scores4))

(0.2546019335670011,
 0.34343289518484826,
 0.09456881316588757,
 -1.8194119931732249)

From the average of the cross-validation scores, model 2 is again the best-performing model. However, model 4 has a negative $R^2$, which means it predicts the held-out folds worse than simply predicting the mean price; this suggests the degree-5 polynomial is overfitting badly.

# Part 3

In [118]:
# Response and predictors for the polynomial-degree search.
y = df['Price']
X = df[['Size', 'Rooms', 'Type']]

# One-hot encode Type and expand Size/Rooms polynomially; the degree of the
# expansion is the hyperparameter tuned below.
type_encoder = ("dummify", OneHotEncoder(sparse_output = False), ['Type'])
poly_expansion = ("polynomial", PolynomialFeatures(), ['Size', 'Rooms'])
ct = ColumnTransformer([type_encoder, poly_expansion]).set_output(transform = "pandas")

poly_pipeline = Pipeline([
    ("preprocessing", ct),
    ("linear_regression", LinearRegression()),
])

# Candidate degrees 1 through 9, defined once so the grid and the summary
# table below cannot drift apart.
degree_values = np.arange(1, 10)
degrees = {'preprocessing__polynomial__degree': degree_values}

# 5-fold cross-validated R^2 for every candidate degree.
gscv = GridSearchCV(poly_pipeline, degrees, cv = 5, scoring = 'r2')
gscv_fitted = gscv.fit(X, y)

mean_scores = gscv_fitted.cv_results_['mean_test_score']
pd.DataFrame(data = {"degrees": degree_values, "scores": mean_scores})

Unnamed: 0,degrees,scores
0,1,0.343433
1,2,0.342278
2,3,0.326389
3,4,0.024058
4,5,-1.799999
5,6,-8.672201
6,7,-221.449758
7,8,-1905.647359
8,9,-7659.777395


The degree-1 model performed best, and $R^2$ decreased as the polynomial degree increased. Just as in Part 2, the degree-5 model produces a negative $R^2$.