In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import set_config

# Render estimators as HTML diagrams when they are displayed in the notebook.
set_config(display="diagram")

In [None]:
from sklearn.compose import ColumnTransformer

class InvertableColumnTransformer(ColumnTransformer):
    """
    ColumnTransformer with a best-effort ``inverse_transform``.

    Each fitted sub-transformer is asked to invert the slice of columns it
    produced (as recorded in ``output_indices_``); the inverted slices are then
    concatenated side by side.

    Warning: this is flaky, use at your own risk.
      * The input must have exactly the column layout produced by
        ``transform``; only the column *count* is validated, so reordered
        columns silently produce wrong results.
      * Output columns are grouped in transformer order, which is not
        necessarily the column order of the data the transformer was fitted on.
      * Columns removed by ``"drop"`` cannot be recovered.
      * Every non-trivial sub-transformer must implement ``inverse_transform``.
    """
    def inverse_transform(self, X):
        """
        Map transformed data back to the original feature space.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like
            Data laid out exactly like the output of ``transform``.

        Returns
        -------
        numpy.ndarray
            Inverted blocks concatenated along axis 1. Blocks of different
            dtypes are upcast by ``np.concatenate``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its column count differs from the width of
            the fitted transformer's output.
        """
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        X = np.asarray(X)

        if X.ndim != 2:
            raise ValueError(f"Expected a 2D array, got {X.ndim}D input instead")

        # Validate the *input* width against what `transform` emits. The inverted
        # result is legitimately narrower than X whenever a sub-transformer
        # expanded its columns (e.g. one-hot encoding), so comparing output
        # width with input width would reject every valid call.
        expected_width = max(
            (cols.stop for cols in self.output_indices_.values()), default=0
        )
        if X.shape[1] != expected_width:
            raise ValueError(f"Received {X.shape[1]} columns but transformer expected {expected_width}")

        arrays = []
        for name, cols in self.output_indices_.items():
            transformer = self.named_transformers_.get(name, None)
            arr = X[:, cols]

            # Nothing to invert for dropped / passed-through / unknown entries.
            # Zero-width blocks are skipped too: a transformer that was given no
            # columns is never fitted, so calling it would fail.
            is_trivial = transformer is None or (
                isinstance(transformer, str) and transformer in ("passthrough", "drop")
            )
            if not is_trivial and arr.shape[1] > 0:
                arr = transformer.inverse_transform(arr)

            arrays.append(arr)

        return np.concatenate(arrays, axis=1)

In [None]:
# Load the dataset from a CSV two directories up; the first CSV column becomes the index.
df = pd.read_csv('../../mapped_df.csv', index_col=0)

In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Feature matrix: the explicit subset of columns used as model inputs.
X = df.loc[
    :,
    [
        "purchaser_type",
        "loan_type",
        "loan_purpose",
        "lien_status",
        "open-end_line_of_credit",
        "business_or_commercial_purpose",
        "loan_amount",
        "loan_to_value_ratio",
        "loan_term",
        "interest_only_payment",
        "balloon_payment",
        "property_value",
        "construction_method",
        "occupancy_type",
        "total_units",
        "income",
        "debt_to_income_ratio",
        "applicant_ethnicity-1",
        "applicant_race-1",
        "applicant_sex",
        "applicant_age",
    ],
]

# Target: the column the classifier is trained to predict.
y = df.loc[:, "action_taken"]

In [None]:
# Split the feature names by dtype: bool/object columns are treated as
# categorical, int/float columns as numerical.
categorical_columns = X.select_dtypes(include=[bool, object]).columns
numerical_columns = X.select_dtypes(include=[int, float]).columns

print(categorical_columns, numerical_columns, sep="\n")

# Sanity check: the first number should equal the column count in X.shape.
# NOTE(review): a column whose dtype matches neither filter ends up in neither
# group and would then be discarded by a transformer built with remainder='drop'.
print(len(categorical_columns) + len(numerical_columns), X.shape)

In [None]:
# One-hot encode the categorical columns (dense output) and standardise the
# numerical ones; any column in neither group is dropped.
preprocess = InvertableColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(sparse_output=False), categorical_columns),
        ("scaler", StandardScaler(), numerical_columns),
    ],
    remainder="drop",
)
preprocess.fit(X)

In [None]:
# Fit and apply the preprocessing, wrapping the resulting array in a DataFrame
# labelled with the transformer-generated feature names.
# NOTE(review): no `index=` is passed, so the result gets a fresh RangeIndex rather
# than X's index (df was loaded with index_col=0) — confirm that is intended.
X_transformed = pd.DataFrame(preprocess.fit_transform(X), columns = preprocess.get_feature_names_out())

In [None]:
# Round-trip check: undo the encoding/scaling on the *transformed* frame.
# (The raw X is not in the transformed feature space, so it cannot be inverted.)
preprocess.inverse_transform(X_transformed)

In [None]:
# Chain the preprocessing step with a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# NOTE(review): the score is computed on the same X, y used for fitting, so it
# is a training-set score, not an estimate of performance on unseen data.
print(pipeline.fit(X, y).score(X, y))

In [None]:
# Inspect the fitted sub-transformers, keyed by the names given when `preprocess` was built.
preprocess.named_transformers_