In [2]:
from cleaning import data_new, X_train, X_test, y_train, y_test

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
# Define numeric and categorical features
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

# Define preprocessors
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Define a Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Define Recursive Feature Elimination (RFE) with backward selection
rfe = RFE(estimator=model, n_features_to_select=None, step=1)  # Start with all features

# Create a pipeline with the preprocessor, RFE, and Logistic Regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rfe', rfe),
    ('clf', model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print accuracy and classification report
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# Get the selected features from the RFE
selected_features = X_train.columns[pipeline.named_steps['rfe'].support_]
print("Selected Features:", selected_features)




Accuracy: 0.6487603305785123
              precision    recall  f1-score   support

           0       0.68      0.71      0.69       136
           1       0.60      0.58      0.59       106

    accuracy                           0.65       242
   macro avg       0.64      0.64      0.64       242
weighted avg       0.65      0.65      0.65       242



IndexError: boolean index did not match indexed array along dimension 0; dimension is 223 but corresponding boolean dimension is 692

In [5]:
# Check the shape of the data and mask
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of RFE mask: {pipeline.named_steps['rfe'].support_.shape}")

# Ensure the transformed data matches the shape of RFE support mask
X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train)
print(f"Shape of X_train after transformation: {X_train_transformed.shape}")

# Get the selected features using the support mask from RFE
# Note: The RFE mask is applied to the transformed data, not the original data
selected_indices = pipeline.named_steps['rfe'].support_

# Since RFE is applied to transformed data, get the feature names from the transformed data
# If OneHotEncoder was used, the feature names could have been expanded
transformed_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(input_features=categorical_features)

# Combine numeric and categorical feature names
numeric_feature_names = numeric_features.tolist()
all_feature_names = numeric_feature_names + list(transformed_feature_names)

# Filter feature names based on the RFE mask
selected_features = [name for name, selected in zip(all_feature_names, selected_indices) if selected]

print("Selected Features:", selected_features)


Shape of X_train: (965, 223)
Shape of RFE mask: (692,)
Shape of X_train after transformation: (965, 692)




ValueError: input_features should have length equal to number of features (222), got 223