In [125]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_array


np.random.seed(0)

In [126]:
X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)

In [127]:
import numpy as np
import pandas as pd
import scipy.stats as ss

def cramers_v(X, y):
    """
    Calculate Cramer's V statistic for categorical-categorical association
    with correction from Bergsma and Wicher.
    
    Parameters:
    - X: 2D numpy array or pandas DataFrame, the feature matrix
    - y: 1D numpy array or pandas Series, the target variable
    
    Returns:
    - float, Cramer's V statistic
    """
    X = check_array(X, accept_sparse="csr", dtype=(np.float64, np.float32))


    scores = []
    for col in X:
        cm_ = pd.crosstab(X[col], y).to_numpy()
        chi2 = ss.chi2_contingency(cm_)[0]
        n = cm_.sum()
        phi2 = chi2 / n
        r, k = cm_.shape
        phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
        rcorr = r - ((r-1)**2)/(n-1)
        kcorr = k - ((k-1)**2)/(n-1)
        score = np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
        scores.append(score)
    
    return np.array(scores)


In [128]:
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(cramers_v, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [129]:
scores = categorical_transformer.fit(X[categorical_features], y)
scores.named_steps['selector'].scores_


IndexError: Indexing with sparse matrices is not supported except boolean indexing where matrix and index are equal shapes.

In [None]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

In [None]:
clf.named_steps

{'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer',
                                                   SimpleImputer(strategy='median')),
                                                  ('scaler', StandardScaler())]),
                                  ['age', 'fare']),
                                 ('cat',
                                  Pipeline(steps=[('encoder',
                                                   OneHotEncoder(handle_unknown='ignore')),
                                                  ('selector',
                                                   SelectPercentile(percentile=50,
                                                                    score_func=<function chi2 at 0x0000013120897380>))]),
                                  ['embarked', 'sex', 'pclass'])]),
 'classifier': LogisticRegression()}

In [None]:
clf.named_steps['preprocessor']