In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import set_config

set_config(display="diagram")

In [59]:
from sklearn.compose import ColumnTransformer

class InvertableColumnTransformer(ColumnTransformer):
    """
    Adds an inverse transform method to the standard sklearn.compose.ColumnTransformer.

    Warning this is flaky and use at your own risk.  Validation checks that the column count in 
    `transformers` are in your object `X` to be inverted.  Reordering of columns will break things!
    """
    def inverse_transform(self, X):
        if isinstance(X,pd.DataFrame):
            X = X.to_numpy()

        arrays = []
        for name, indices in self.output_indices_.items():
            transformer = self.named_transformers_.get(name, None)
            arr = X[:, indices.start: indices.stop]

            if transformer in (None, "passthrough", "drop"):
                pass

            else:
                arr = transformer.inverse_transform(arr)

            arrays.append(arr)

        retarr = np.concatenate(arrays, axis=1)

        if retarr.shape[1] != X.shape[1]:
            raise ValueError(f"Received {X.shape[1]} columns but transformer expected {retarr.shape[1]}")

        return retarr

In [33]:
df = pd.read_csv('../../mapped_df.csv', index_col=0)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 793441 entries, 0 to 793440
Data columns (total 25 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   county_code                     793441 non-null  float64
 1   conforming_loan_limit           793441 non-null  object 
 2   action_taken                    793441 non-null  bool   
 3   purchaser_type                  793441 non-null  object 
 4   loan_type                       793441 non-null  object 
 5   loan_purpose                    793441 non-null  object 
 6   lien_status                     793441 non-null  object 
 7   open-end_line_of_credit         793441 non-null  bool   
 8   business_or_commercial_purpose  793441 non-null  bool   
 9   loan_amount                     793441 non-null  float64
 10  loan_to_value_ratio             793441 non-null  float64
 11  loan_term                       793441 non-null  float64
 12  interest_only_payment

In [35]:
X = df[
    [
        "purchaser_type",
        "loan_type",
        "loan_purpose",
        "lien_status",
        "open-end_line_of_credit",
        "business_or_commercial_purpose",
        "loan_amount",
        "loan_to_value_ratio",
        "loan_term",
        "interest_only_payment",
        "balloon_payment",
        "property_value",
        "construction_method",
        "occupancy_type",
        "total_units",
        "income",
        "debt_to_income_ratio",
        "applicant_ethnicity-1",
        "applicant_race-1",
        "applicant_sex",
        "applicant_age",
    ]
]
y = df["action_taken"]

In [49]:
categorical_columns = X.select_dtypes(include=[bool, object]).columns
numerical_columns = X.select_dtypes(include=[int, float]).columns

print(categorical_columns)
print(numerical_columns)

print(len(categorical_columns)+len(numerical_columns), X.shape)

Index(['purchaser_type', 'loan_type', 'loan_purpose', 'lien_status',
       'open-end_line_of_credit', 'business_or_commercial_purpose',
       'interest_only_payment', 'balloon_payment', 'construction_method',
       'occupancy_type', 'applicant_ethnicity-1', 'applicant_race-1',
       'applicant_sex', 'applicant_age'],
      dtype='object')
Index(['loan_amount', 'loan_to_value_ratio', 'loan_term', 'property_value',
       'total_units', 'income', 'debt_to_income_ratio'],
      dtype='object')
21 (793441, 21)


In [66]:
preprocess = InvertableColumnTransformer([
    ('ohe', OneHotEncoder(sparse_output=False), categorical_columns),
    ('scaler', StandardScaler(), numerical_columns),
], remainder = 'drop')
preprocess.fit(X)

In [67]:
X_transformed = pd.DataFrame(preprocess.fit_transform(X), columns = preprocess.get_feature_names_out())

In [69]:
preprocess.inverse_transform(X)

ValueError: could not convert string to float: 'NotApplicable'

In [37]:
pipeline = Pipeline([
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=1000)),
])
pipeline.fit(X, y)
print(pipeline.score(X,y))

0.8639545977583715


In [39]:
preprocess.named_transformers_

AttributeError: 'ColumnTransformer' object has no attribute 'inverse_transform'