# Creating Pipeline 

In [2]:
# importing required libraries
import os

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [4]:
# Load the heart-disease dataset.
# The CSV location can be overridden with the HEART_DISEASE_CSV environment
# variable so the notebook is not tied to a single machine. When the variable
# is unset the original absolute path is used, so existing runs are unchanged.
DATA_PATH = os.environ.get(
    'HEART_DISEASE_CSV',
    '/home/karush/code/karushp/practice/heart-disease/data/heart-disease.csv',
)
df_heart = pd.read_csv(DATA_PATH)

# Features are every column except the label; the label is the 'target' column
X = df_heart.drop(columns='target')
y = df_heart['target']

In [9]:
# Class balance of the target, shown as a percentage of all rows
y.value_counts(normalize=True).mul(100)

1    54.455446
0    45.544554
Name: target, dtype: float64

In [5]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


In [19]:
# Build the modelling pipeline: scale -> select k best features -> classify.
# Step names are referenced by the grid-search parameter keys ('<step>__<param>').
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest()),
        ('estimator', RandomForestClassifier(random_state=1)),
    ]
)

In [26]:
# Hyper-parameter search space, keyed as '<pipeline step>__<parameter>'
param_grid = {
    # number of features kept by the selector: 2 through 11 inclusive
    'feature_selection__k': list(range(2, 12)),
    # forest size
    'estimator__n_estimators': [10, 50, 100],
    # maximum tree depth (None = no explicit limit)
    'estimator__max_depth': [None, 10, 20, 30],
}


In [27]:
# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split, scored by accuracy and run on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [28]:
# Score the fitted search object on the held-out test split
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning hyper-parameters alongside the test accuracy
print(f'Best parameters: {grid_search.best_params_}')
print(f'Accuracy : {accuracy:.2f}')

Best parameters: {'estimator__max_depth': None, 'estimator__n_estimators': 100, 'feature_selection__k': 10}
Accuracy : 0.74


In [25]:
# Recover which input columns the best pipeline's selector kept
best_selector = grid_search.best_estimator_.named_steps['feature_selection']
support_mask = best_selector.get_support()  # mask over the input columns
selected_features = X_train.columns[support_mask]
print(f'Selected features: {selected_features}')

Selected features: Index(['age', 'sex', 'cp', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
       'ca', 'thal'],
      dtype='object')
