In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import PredictionErrorDisplay
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor

In [2]:
ames = pd.read_csv('ames.csv', index_col=0)
ames.head(10)

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,SaleType,SaleCondition,TotalBath,TotalSF,HQSF,yrsbtwn,BedxBath,RoomsxBath,FireplaceYN,TotalPorchSF
1,909176150,856,126000,30,RL,58.769231,7890,Pave,No Alley,Reg,...,WD,Normal,2.0,1712.0,1712.0,11,4.0,8.0,1,166
2,905476230,1049,139500,120,RL,42.0,4235,Pave,No Alley,Reg,...,WD,Normal,3.0,2098.0,2098.0,0,6.0,15.0,0,105
3,911128020,1001,124900,30,C (all),60.0,6060,Pave,No Alley,Reg,...,WD,Normal,1.0,1838.0,1838.0,77,2.0,5.0,0,282
4,535377150,1039,114000,70,RL,80.0,8146,Pave,No Alley,Reg,...,WD,Normal,1.0,1444.0,1444.0,103,2.0,6.0,0,279
5,534177230,1665,227000,60,RL,70.0,8400,Pave,No Alley,Reg,...,WD,Normal,3.5,2475.0,2475.0,0,10.5,21.0,0,45
6,908128060,1922,198500,85,RL,64.0,7301,Pave,No Alley,Reg,...,ConLD,Normal,3.0,1922.0,1922.0,0,12.0,21.0,1,177
7,902135020,936,93000,20,RM,60.0,6000,Pave,Pave,Reg,...,WD,Normal,1.0,1872.0,1872.0,0,2.0,4.0,0,144
8,528228540,1246,187687,20,RL,53.0,3710,Pave,No Alley,Reg,...,New,Partial,2.0,2392.0,2392.0,1,4.0,10.0,1,124
9,923426010,889,137500,20,RL,74.0,12395,Pave,No Alley,Reg,...,WD,Normal,1.0,1753.0,1753.0,0,3.0,6.0,0,0
10,908186050,1072,140000,180,RM,35.0,3675,Pave,No Alley,Reg,...,WD,Normal,2.0,1619.0,1619.0,0,4.0,10.0,0,44


In [45]:
# Load the dataset and remove columns with missing values
ames = pd.read_csv('ames.csv')

# Identify numeric and categorical features, excluding 'PID' and 'SalePrice'
ames['MSSubClass'] = ames['MSSubClass'].astype(str) #Nominal variable of 'string' integers
numeric_features = ames.select_dtypes(include=['int64', 'float64']).drop(columns=['PID', 'SalePrice', 'Unnamed: 0']).columns
categorical_features = ames.select_dtypes(include=['object']).columns
X = ames[numeric_features.tolist() + categorical_features.tolist()]


# Target variable
y = ames['SalePrice']

In [26]:
# Define the ordinal_categories dictionary
ordinal_categories = {
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    ('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual'): ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ('BsmtQual', 'BsmtCond'): ['No Bsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['No Bsmt', 'No', 'Mn', 'Av', 'Gd'],
    ('BsmtFinType1', 'BsmtFinType2'): ['No Bsmt', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'Electrical': ['FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    'FireplaceQu': ['No Fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['No Garage', 'Unf', 'RFn', 'Fin'],
    ('GarageQual', 'GarageCond'): ['No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['No Pool', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['No Fence', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']
}


# Function to extract categories for each feature
def get_categories_dict(ordinal_categories):
    categories_dict = {}
    for key, value in ordinal_categories.items():
        if isinstance(key, tuple):
            for sub_key in key:
                categories_dict[sub_key] = value
        else:
            categories_dict[key] = value
    return categories_dict

# Extract categories for each feature
categories_dict = get_categories_dict(ordinal_categories)

# Separate feature names and their corresponding categories
feature_names = list(categories_dict.keys())
categories = [categories_dict[feature] for feature in feature_names]

# Define transformers for numerical and categorical features
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
ordinal_categorical_features = feature_names

non_ordinal_categorical_features = [feature for feature in categorical_features if feature not in ordinal_categorical_features]

#ordinal_categorical_features
#non_ordinal_categorical_features

In [27]:
ordinal_encoder = OrdinalEncoder(categories=categories)

categorical_transformer = OneHotEncoder(drop=None, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))
])

# Perform cross-validation and store results in a dictionary
cv_results = {}
scores = cross_val_score(pipeline, X, y)
cv_results = round(scores.mean(), 4)
# Output the mean cross-validation scores
print(cv_results)

#an untuned RandomForestRegressor has an R2 of .688

0.9024


^^ Encode within pipeline

Encode outside of pipeline

In [96]:
X2 = X.copy()

ordinal_categories = {
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    ('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual'): ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ('BsmtQual', 'BsmtCond'): ['No Bsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['No Bsmt', 'No', 'Mn', 'Av', 'Gd'],
    ('BsmtFinType1', 'BsmtFinType2'): ['No Bsmt', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'Electrical': ['FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    'FireplaceQu': ['No Fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['No Garage', 'Unf', 'RFn', 'Fin'],
    ('GarageQual', 'GarageCond'): ['No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['No Pool', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['No Fence', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']
}

#categorical_features is a list of the categorical variables

ordinal_keys = set()
for key in ordinal_categories.keys():
    if isinstance(key, tuple):
        ordinal_keys.update(key)
    else:
        ordinal_keys.add(key)

filtered_categorical_features = [feature for feature in categorical_features if feature not in ordinal_keys]

#print("Filtered categorical features:", filtered_categorical_features)

# Apply OrdinalEncoder based on the specified categories
for features, categories in ordinal_categories.items():
    if isinstance(features, tuple):
        for feat in features:
            X2[feat] = OrdinalEncoder(categories=[categories]).fit_transform(X2[[feat]])
    else:
        # Apply encoding to a single column
        X2[features] = OrdinalEncoder(categories=[categories]).fit_transform(X2[[features]])

In [101]:
non_ordinal_categorical_features

['MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LandContour',
 'Utilities',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'Foundation',
 'Heating',
 'CentralAir',
 'Functional',
 'GarageType',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [102]:
# Combine preprocessing steps
categorical_transformer = OneHotEncoder(drop=None, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Create and evaluate the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model)
])

In [103]:
# Perform cross-validation and store results in a dictionary
cv_results = {}
scores = cross_val_score(pipeline, X2, y)
cv_results = round(scores.mean(), 4)
# Output the mean cross-validation scores
print(cv_results)

#an untuned RandomForestRegressor has an R2 of .688

0.9019
