In [16]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer

Fitting a model with training data

In [18]:
lr = LinearRegression()


ames = pd.read_csv(r"C:\Users\tuke-\Desktop\1_GSB544_Computing_and_Machine_Learning\Week_7\data\AmesHousing.csv")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

      Gr Liv Area  TotRms AbvGrd
2267         1312              5
566          1224              4
1401         1363              5
1904          808              5
805          2020              8


array([ 71947.29217458, -19160.73268936])

Applying to test data

In [5]:
y_preds = lr_fitted.predict(X_test)

r2_score(y_test, y_preds)

-2011553.462650946

Looking at the predictions

In [7]:
y_preds[1:5]

array([6.25669162e+07, 1.23563465e+08, 1.19152217e+08, 6.25669162e+07])

New house w/ 889sf and 6 bedrooms

In [8]:
new_house = pd.DataFrame(data = {"Gr Liv Area": [889], "TotRms AbvGrd": [6]})
new_house

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,889,6


A single house does not havea standard deviation

In [9]:
new_house_s = (new_house - new_house.mean())/new_house.std()
new_house_s

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,,


This data has to go through the same pre-processing as the original data. We are using the taining data to standardize the new data.

In [10]:
X_test_s = (X_test - X_train.mean())/X_train.std()
y_preds = lr_fitted.predict(X_test_s)

r2_score(y_test, y_preds)

0.4717449483043151

In [11]:
new_house_s = (new_house - X_train.mean())/X_train.std()
lr_fitted.predict(new_house_s)

array([99606.02265089])

Pipelines

In [12]:
lr_pipeline = Pipeline(
  [StandardScaler(),
  LinearRegression()]
)

lr_pipeline

In [13]:
lr_pipeline = Pipeline(
  [("standardize", StandardScaler()),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [14]:
lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)

y_preds = lr_pipeline_fitted.predict(X_test)
r2_score(y_test, y_preds)

0.4717449483043151

In [15]:
lr_pipeline_fitted.predict(new_house)

array([99606.02265089])

In [19]:
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

lr_pipeline

In [21]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

lr_fitted = lr_pipeline.fit(X_train, y_train)

In [22]:
ct_fitted = ct.fit(X_train)

ct.transform(X_train)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.21513426, -0.26312058],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.48144066,  0.37918599],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.97141559,  1.02149255],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.91729753, -0.90542714],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.60483393, -0.26312058],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.10329999,  0.37918599]])

In [23]:
ct.transform(X_test)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.22309511,  0.37918599],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
        -0.8993856 , -0.26312058],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.88981681,  1.02149255],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.54512751, -0.26312058],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.11164313, -0.90542714],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.66454035, -0.90542714]])

In [24]:
lr_pipeline_fitted.coef_

AttributeError: 'Pipeline' object has no attribute 'coef_'

In [25]:
lr_pipeline_fitted.named_steps['linear_regression'].coef_

array([ 72690.94704601, -17494.78427471])

In [26]:
type(y_preds)

numpy.ndarray

In [27]:
lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


ct.fit_transform(X_train)

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
2618,1.0,0.0,0.0,0.0,0.0,0.215134,-0.263121
2840,1.0,0.0,0.0,0.0,0.0,-0.481441,0.379186
2378,1.0,0.0,0.0,0.0,0.0,0.971416,1.021493
1866,1.0,0.0,0.0,0.0,0.0,-0.391881,-0.263121
2476,1.0,0.0,0.0,0.0,0.0,0.185281,0.379186
...,...,...,...,...,...,...,...
2069,1.0,0.0,0.0,0.0,0.0,-0.628716,-0.263121
2630,1.0,0.0,0.0,0.0,0.0,-0.286400,-0.263121
648,1.0,0.0,0.0,0.0,0.0,-0.917298,-0.905427
334,1.0,0.0,0.0,0.0,0.0,-0.604834,-0.263121


In [28]:
ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ct_inter.fit_transform(X_train)

Unnamed: 0,interaction__1,interaction__Gr Liv Area,interaction__TotRms AbvGrd,interaction__Gr Liv Area TotRms AbvGrd
2618,1.0,1600.0,6.0,9600.0
2840,1.0,1250.0,7.0,8750.0
2378,1.0,1980.0,8.0,15840.0
1866,1.0,1295.0,6.0,7770.0
2476,1.0,1585.0,7.0,11095.0
...,...,...,...,...
2069,1.0,1176.0,6.0,7056.0
2630,1.0,1348.0,6.0,8088.0
648,1.0,1031.0,5.0,5155.0
334,1.0,1188.0,6.0,7128.0


In [29]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,...,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
2618,1.0,0.0,0.0,0.0,0.0,2619,535426195,20,RL,47.0,...,0,0,,,,0,12,2006,WD,Normal
2840,1.0,0.0,0.0,0.0,0.0,2841,909101050,50,RL,45.0,...,0,0,,,,0,7,2006,WD,Normal
2378,1.0,0.0,0.0,0.0,0.0,2379,528102010,20,RL,129.0,...,0,0,,,,0,9,2006,WD,Normal
1866,1.0,0.0,0.0,0.0,0.0,1867,534127260,20,RL,130.0,...,200,0,,,,0,4,2007,WD,Normal
2476,1.0,0.0,0.0,0.0,0.0,2477,531376050,60,RL,74.0,...,0,0,,,,0,3,2006,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2069,1.0,0.0,0.0,0.0,0.0,2070,905226170,20,RL,85.0,...,0,0,,,,0,10,2007,VWD,Normal
2630,1.0,0.0,0.0,0.0,0.0,2631,535477060,60,RL,69.0,...,0,0,,,,0,7,2006,WD,Normal
648,1.0,0.0,0.0,0.0,0.0,649,535353040,20,RL,60.0,...,0,0,,MnPrv,,0,3,2009,WD,Normal
334,1.0,0.0,0.0,0.0,0.0,335,923251080,20,RL,,...,0,0,,,,0,4,2010,WD,Normal


Trying different Values:

In [30]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [31]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_b

{'mean_fit_time': array([0.01699009, 0.01712098, 0.01587682, 0.01360707, 0.01019654,
        0.01430221, 0.01342597, 0.01392817, 0.01202307]),
 'std_fit_time': array([0.00095258, 0.00385565, 0.00070863, 0.00254173, 0.00758659,
        0.00795732, 0.00459631, 0.0031538 , 0.00373013]),
 'mean_score_time': array([0.01972437, 0.0080977 , 0.00813832, 0.0063694 , 0.00897965,
        0.003127  , 0.00506406, 0.00829306, 0.00649166]),
 'std_score_time': array([0.01784343, 0.0049482 , 0.00135672, 0.0052227 , 0.00612287,
        0.00625401, 0.00643001, 0.00019149, 0.00482863]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'preprocessing__polynomial__degree': 1},
  {'preprocessing__polynomial__degree': 2},
  {'preprocessing__polynomial__degree': 3},
  {'preprocessing__polynomial__degree

In [32]:
gscv_fitted.cv_results_['mean_test_score']

array([ 0.52988868,  0.5314061 ,  0.55123636,  0.5423836 ,  0.45186012,
        0.33383744,  0.02932182, -0.96809611, -4.54559379])

In [33]:
pd.DataFrame(data = {"degrees": np.arange(1, 10), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,0.529889
1,2,0.531406
2,3,0.551236
3,4,0.542384
4,5,0.45186
5,6,0.333837
6,7,0.029322
7,8,-0.968096
8,9,-4.545594
