In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [29]:
# Toy frame with two numeric and two categorical columns. Each numeric column
# and 'cat_feature2' carry one missing entry so the imputers have work to do.
data = pd.DataFrame(
    dict(
        num_feature1=[1, 2, np.nan, 4, 5],
        num_feature2=[10, 20, 30, 40, np.nan],
        cat_feature1=['A', 'B', 'A', 'B', 'C'],
        cat_feature2=['X', 'Y', np.nan, 'X', 'Z'],
    )
)

# One binary class label per row of `data`.
target = pd.Series(data=[1, 0, 1, 0, 1])


In [30]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)


In [31]:
# Categorical branch: fill gaps with each column's most frequent value, then
# one-hot encode (handle_unknown='ignore' so unseen categories do not raise).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column mean, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


In [33]:
# Column groups routed to each preprocessing branch
numerical_features = ['num_feature1', 'num_feature2']
categorical_features = ['cat_feature1', 'cat_feature2']

# Apply the numerical and categorical pipelines to their respective columns
preprocessing = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# End-to-end model: preprocess -> univariate feature selection -> random forest.
# k='all' keeps every feature; lower it to actually prune.
final_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('feature_selection', SelectKBest(f_classif, k='all')),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)


In [34]:
# Fit the whole pipeline on the training split, then predict the held-out rows
# (fit returns the fitted pipeline, so the two calls can be chained).
y_pred = final_pipeline.fit(X_train, y_train).predict(X_test)

# Share of held-out labels that were predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.0


  f = msb / msw
