In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel

# Load your dataset
data = pd.read_csv("dataset.csv")

# Separate the target variable (e.g., 'target_column') from the features
X = data.drop(columns=['target'])
y = data['target']

# Step 1: Automated Feature Selection
# Use a feature selection method, e.g., SelectFromModel with a tree-based model.
# This will select the most important features.
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100))
X_selected = feature_selector.fit_transform(X, y)

# Step 2: Numerical Pipeline
numerical_features = X_selected.select_dtypes(include=['number']).columns.tolist()

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with column mean
    ('scaler', StandardScaler())  # Standardize numerical features
])

# Step 3: Categorical Pipeline
categorical_features = X_selected.select_dtypes(include=['object']).columns.tolist()

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('encoder', OneHotEncoder())  # One-hot encode categorical features
])

# Step 4: Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Step 5: Final Model Building
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))  # You can adjust hyperparameters here
])

# Step 6: Train-Test Split and Model Evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)

print(f"Accuracy on the test dataset: {accuracy}")


AttributeError: 'numpy.ndarray' object has no attribute 'select_dtypes'

In [10]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
