In [164]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel


In [165]:
data = pd.read_csv(r"C:\Users\Marvin\Documents\WBS\Data-Science-Bootcamp\7_Supervised ML\Data\housing_iteration_6_regression\housing_iteration_6_regression.csv")

In [166]:
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [167]:
X = data.drop(columns=['Id']).copy()
y = X.pop("SalePrice")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [168]:
# Select the numerical columns from X
X_num = X_train.select_dtypes(include="number").copy()

# Select the categorical columns from X
X_cat = X_train.select_dtypes(exclude="number").copy()

Pipelines

In [169]:
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

num_pipe = make_pipeline(
    SimpleImputer()
)

ordinal_cols_names = ["LotShape", "Utilities", "ExterQual", 
                      "ExterCond", "BsmtQual", "BsmtCond", 
                      "BsmtExposure", "BsmtFinType1", "BsmtFinType2", 
                      "HeatingQC", "KitchenQual", "FireplaceQu", 
                      "GarageQual", "GarageCond", "PoolQC"]
ordinal_cols = X_cat.columns.get_indexer(ordinal_cols_names)

non_ordinal_cols_names = [col for col in X_cat.columns if col not in ordinal_cols_names]
onehot_cols = X_cat.columns.get_indexer(non_ordinal_cols_names)

ordinal_rankings = [["NA", "IR3", "IR2","IR1", "Reg"],
                    ["NA", "ELO", "NoSeWa", "NoSewr", "AllPub"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "No", "Mn", "Av", "Gd"],
                    ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
                    ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                    ["NA", "Fa", "TA", "Gd", "Ex"]] 

cat_preprocessor = make_column_transformer(
    (OrdinalEncoder(categories=ordinal_rankings), ordinal_cols),
    (OneHotEncoder(drop="first", sparse=False, handle_unknown="ignore"), onehot_cols )
)

cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"), # play with NA or N_A 
    cat_preprocessor)

# final preprocessor

preprocessor = make_column_transformer(
    (num_pipe, X_num.columns),
    (cat_pipe, X_cat.columns)
)

# Definition des Random Forest Regressors
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Erstellen der Pipeline, die Vorverarbeitung und Modell kombiniert
rf_pipeline = make_pipeline(preprocessor,StandardScaler(), rf_regressor)

# Training des Modells
rf_pipeline.fit(X_train, y_train)




In [170]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

gbr = GradientBoostingRegressor(n_estimators=100, 
                                learning_rate=0.1, 
                                max_depth=3, 
                                min_samples_split=2, 
                                min_samples_leaf=1)

gbr_pipeline = make_pipeline(preprocessor, gbr)

gbr_pipeline.fit(X_train, y_train)



Making Predictions

In [171]:
rfr_pred = rf_pipeline.predict(X_test)
gbr_pred = gbr_pipeline.predict(X_test)



In [172]:
rfr_r2 = r2_score(y_test, rfr_pred)
gbr_r2 = r2_score(y_test, gbr_pred)

performances = pd.DataFrame({'rfr': rfr_r2,
                             'gbr': gbr_r2},
                            index=['full pipes'])

performances

Unnamed: 0,rfr,gbr
full pipes,0.87558,0.904484


Feature Selection

In [173]:
X_num

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
318,60,90.0,9900,7,5,1993,1993,256.0,987,0,...,656,340,60,144,0,0,0,0,4,2009
580,20,,14585,6,6,1960,1987,85.0,594,219,...,572,216,110,0,0,0,0,0,6,2007
961,60,,12227,6,7,1977,1995,424.0,896,0,...,619,550,282,0,0,0,0,0,7,2008
78,90,72.0,10778,4,5,1968,1968,0.0,0,0,...,0,0,0,0,0,0,0,0,4,2010
5,50,85.0,14115,5,5,1993,1995,0.0,732,0,...,480,40,30,0,320,0,0,700,10,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,60,,9130,6,8,1966,2000,252.0,400,64,...,484,0,40,0,0,0,0,0,7,2008
1122,20,,8926,4,3,1956,1956,0.0,0,0,...,288,64,0,0,0,160,0,0,10,2009
1346,20,,20781,7,7,1968,2003,0.0,297,68,...,508,0,80,0,290,0,0,0,6,2006
1406,85,70.0,8445,5,7,1972,2007,0.0,656,0,...,396,58,0,0,0,0,0,0,3,2009


In [174]:
range_var_df = (pd.DataFrame({
                'Range': X_num.max() - X_num.min(),
                'Variance': X_num.var()})
                .sort_values(by='Variance'))

In [175]:
range_var_df.tail()

Unnamed: 0,Range,Variance
TotalBsmtSF,6110.0,200852.6
BsmtFinSF1,5644.0,216907.5
GrLivArea,5308.0,288487.8
MiscVal,15500.0,300135.0
LotArea,213945.0,88582000.0


In [176]:
# Initialize the scaler.
my_scaler = MinMaxScaler().set_output(transform="pandas")

# Fit the scaler to X_train and transform the values.
X_num_scaled = my_scaler.fit_transform(X_num)

In [177]:
(
  pd.DataFrame({
  'Range': X_num_scaled.max() - X_num_scaled.min(),
  'Variance': X_num_scaled.var()})
  .sort_values(by='Variance')
)

Unnamed: 0,Range,Variance
MiscVal,1.0,0.001249
LotArea,1.0,0.001935
3SsnPorch,1.0,0.003677
PoolArea,1.0,0.003701
TotalBsmtSF,1.0,0.00538
LotFrontage,1.0,0.006315
BsmtFinSF1,1.0,0.006809
1stFlrSF,1.0,0.008265
LowQualFinSF,1.0,0.008288
GrLivArea,1.0,0.010239


In [178]:
selector = VarianceThreshold(threshold=0.01)

X_num_var = selector.fit_transform(X_num_scaled)


In [179]:
print("shape before:", X_num_scaled.shape)
print("shape after:", X_num_var.shape)

shape before: (1168, 36)
shape after: (1168, 27)


In [180]:
# Scale the test set
X_test_num = X_test.select_dtypes(include="number").copy()
X_test_scaled = my_scaler.transform(X_test_num)

# Apply the variance threshold to the scaled test set
X_test_var = selector.transform(X_test_scaled)

In [181]:
# rfr
var_rfr = rf_pipeline
var_rfr.fit(X_num_var, y_train)
var_rfr_pred = var_rfr.predict(X_test_var)

# gbr
var_gbr = gbr_pipeline
var_gbr.fit(X_num_var, y_train)
var_gbr_pred = var_gbr.predict(X_test_var)

performances.loc["varThreshold_0_01", "rfr"] = r2_score(y_test, var_rfr_pred)
performances.loc["varThreshold_0_01", "gbr"] = r2_score(y_test, var_gbr_pred)

performances

ValueError: Specifying the columns using strings is only supported for pandas DataFrames