# DATA PREPARATION

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectorMixin

In [2]:
data_train = pd.read_csv("../Data/train.csv")
data_test = pd.read_csv("../Data/test.csv")

In [3]:
X, y = data_train.drop(columns="SalePrice"), data_train["SalePrice"].copy()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=42)
X_test = data_test.copy()

In [4]:
numerical_cols = X_train.drop(columns='Id').select_dtypes(exclude='object').columns
categorical_cols = X_train.select_dtypes(include='object').columns

In [5]:
all_categories = []
for col in categorical_cols:
    all_categories.append(X_train[col].dropna().unique())

In [6]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(categories='auto', handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
        ],
    remainder='passthrough'
    )

In [7]:
with open('./preprocessor.pkl', 'wb') as pkl_file:
    pickle.dump(preprocessor, pkl_file)

# Testing purposes

In [8]:
X_train_prepared = preprocessor.fit_transform(X_train.drop(columns='Id'))
X_valid_prepared = preprocessor.fit_transform(X_valid.drop(columns='Id'))
X_test_prepared = preprocessor.fit_transform(X_test.drop(columns='Id'))

In [9]:
def get_features_out(estimator, features_in):
    if hasattr(estimator, 'get_feature_names'):
        return estimator.get_feature_names(features_in)
    elif isinstance(estimator, SelectorMixin):
        return np.array(features_in)[estimator.get_support()]
    else:
        return features_in


def get_ct_feature_names(ct):
    output_features = []

    for name, estimator, features in ct.transformers_:
        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_features_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_features_out(estimator, features)
            output_features.extend(features_out)

        # elif estimator == 'passthrough':
        #     print(name, estimator, features)
            # output_features.extend(ct._feature_names_in(features))
                
    return output_features

In [10]:
feature_names = get_ct_feature_names(preprocessor)

In [11]:
if isinstance(X_train_prepared, np.ndarray):
    X_train_prepared = pd.DataFrame(X_train_prepared, columns=feature_names, index=X_train["Id"])
    X_valid_prepared = pd.DataFrame(X_valid_prepared, columns=feature_names, index=X_valid["Id"])
    X_test_prepared = pd.DataFrame(X_test_prepared, columns=feature_names, index=X_test["Id"])
else:
    X_train_prepared = pd.DataFrame.sparse.from_spmatrix(X_train_prepared, columns=feature_names, index=X_train["Id"])
    X_valid_prepared = pd.DataFrame.sparse.from_spmatrix(X_valid_prepared, columns=feature_names, index=X_valid["Id"])
    X_test_prepared = pd.DataFrame.sparse.from_spmatrix(X_test_prepared, columns=feature_names, index=X_test["Id"])

ValueError: Column length mismatch: 270 vs. 284

In [71]:
X_train_prepared

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Con,SaleType_ConLD,SaleType_CWD,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_Partial,SaleCondition_Family,SaleCondition_AdjLand,SaleCondition_Alloca
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1024,1.475911,-0.409689,-0.683950,0.637073,-0.515364,1.107889,1.020374,-0.519303,-0.944261,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
811,-0.871228,0.576699,-0.054883,-0.094926,0.390453,0.094543,0.682585,-0.023289,0.469362,2.166141,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1385,-0.167086,0.069414,-0.152524,-0.094926,-0.515364,-1.049557,-1.681937,-0.601000,-0.533502,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
627,-0.871228,-1.621537,0.144198,-0.826925,-0.515364,-0.363097,-0.330782,-0.601000,-0.979219,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
814,-0.871228,0.492152,-0.090142,-0.094926,0.390453,-0.428474,-1.295893,0.817019,0.349193,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,-0.871228,0.576699,-0.129289,-0.094926,-0.515364,1.140578,1.020374,-0.601000,-0.926782,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1131,-0.167086,0.210326,-0.266078,-1.558925,-2.326999,-1.409132,-1.681937,-0.601000,0.379781,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1295,-0.871228,0.069414,-0.232808,-0.826925,1.296270,-0.526540,0.248285,-0.601000,-0.614343,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
861,-0.167086,-0.071499,-0.280725,0.637073,2.202087,-1.736018,0.634329,-0.601000,-0.979219,-0.284678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [72]:
X_train_prepared.to_csv('X_train_prepared.csv')
X_valid_prepared.to_csv('X_valid_prepared.csv')
X_test_prepared.to_csv('X_test_prepared.csv')