In [56]:
# Use an automated feature selection method to identify the important features in the dataset
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif


# Load the breast-cancer dataset: feature matrix and class labels.
data = load_breast_cancer()
X, y = data.data, data.target

# Score every feature against the target with f_classif and keep the 10 best.
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)

# Translate the positions of the retained columns back into feature names.
kept_idx = selector.get_support(indices=True)
selected_feature_names = list(data.feature_names[kept_idx])
print(selected_feature_names)


['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']


In [None]:


from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and wrap it in pandas containers so the
# feature columns carry their names.
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = pd.Series(cancer.target)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)


In [50]:
# Numerical pipeline

# For the numerical columns, create a pipeline that first imputes the missing values using the mean of the column 
# values, and then scales the numerical columns using standardisation. 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)


In [51]:
# Categorical pipeline

# For the categorical columns,  create a pipeline that first imputes the missing values using the most frequent value of
#  the column, and then one-hot encodes the categorical columns. 

from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from raising on categories not seen during fit.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [52]:
# Combine numerical and categorical pipelines using ColumnTransformer

#  use a ColumnTransformer to apply the numerical pipeline to the numerical columns, and the categorical pipeline 
# to the categorical columns. 
# Split the feature columns by dtype: object columns are treated as categorical,
# everything else as numerical.
# The column lists are taken from X_train (the training frame built in this
# notebook) rather than from `df`, which is not defined anywhere in the notebook
# and would raise NameError on a fresh kernel.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()

from sklearn.compose import ColumnTransformer

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)


In [53]:
# Random Forest Classifier

# Now you can use the preprocessor pipeline and a Random Forest Classifier to build the final model. 

from sklearn.ensemble import RandomForestClassifier

# Final model: the preprocessing step followed by a random forest.
# random_state is pinned to 42 (the same seed the notebook uses for its
# train/test splits) so that re-running the notebook builds the same forest
# and reports the same score.
rf_classifier = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])


In [54]:
# Evaluate the model

# Finally, you can evaluate the accuracy of the model on the test dataset using cross-validation and/or other evaluation
#  metrics such as accuracy, precision, recall, or F1 score. 

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the full pipeline, computed on the held-out split.
# NOTE(review): the pipeline is never fitted on X_train here; every fold trains on
# part of X_test. Confirm this is the intended evaluation protocol.
scores = cross_val_score(
    rf_classifier,
    X_test,
    y_test,
    scoring='accuracy',
    cv=5,
)
print(f"Accuracy: {scores.mean():.4f} ")


Accuracy: 0.9413 


In [55]:
# Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then
# use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its
# accuracy.




from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Load the iris data and labels, then hold out 20% of the samples for testing.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# Numeric branch: standardise the measurements.
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode.
cat_pipeline = Pipeline(steps=[('encoder', OneHotEncoder())])

# Columns 0-3 go through the scaler. The categorical branch is given an empty
# column list, so it selects no columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, [0, 1, 2, 3]),
        ('cat', cat_pipeline, []),
    ]
)

# The two base learners, both seeded so results are repeatable.
rf_classifier = RandomForestClassifier(random_state=42)
lr_classifier = LogisticRegression(random_state=42)

# Combine the learners with hard voting (majority vote on predicted labels).
voting_classifier = VotingClassifier(
    voting='hard',
    estimators=[
        ('rf', rf_classifier),
        ('lr', lr_classifier),
    ],
)

# End-to-end model: preprocessing followed by the voting ensemble.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', voting_classifier),
    ]
)

# Fit on the training split, then score on the held-out test split.
# fit() returns the fitted pipeline, so the two calls can be chained.
accuracy = pipeline.fit(X_train, y_train).score(X_test, y_test)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 1.0000
