# Q1. You are working on a machine learning project where you have a dataset containing numerical and
# categorical features. You have identified that some of the features are highly correlated and there are
# missing values in some of the columns. You want to build a pipeline that automates the feature
# engineering process and handles the missing values.

# a pipeline in Python that automates the feature engineering process and handles missing values

1. Load the dataset and split it into training and testing sets.
2. Identify the numerical and categorical features in the dataset.
3. Handle missing values in the numerical features using mean imputation and in the categorical features using mode imputation.
4. Use an automated feature selection method, such as SelectKBest, to select the most important features in the dataset.
5. Scale the numerical features using StandardScaler to standardize the feature values.
6. One-hot encode the categorical features using OneHotEncoder to convert categorical variables into binary variables.
7. Merge the numerical and categorical features into a single feature matrix.
8. Train a machine learning model on the feature matrix and evaluate its performance on the testing set.

In [None]:
# Python code to implement this pipeline

from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical and categorical features from the training data.
# The breast-cancer columns are continuous measurements, so splitting them by
# name prefix would hand real-valued columns to the one-hot encoder: nearly
# every training value would become its own category and unseen test values
# would be encoded as all zeros.  A column is therefore treated as categorical
# only when it holds at most MAX_CATEGORY_LEVELS distinct values (a heuristic;
# tune the threshold for your data).
MAX_CATEGORY_LEVELS = 10
n_columns = X_train.shape[1]
# `v == v` is False only for NaN, so missing entries are not counted as levels.
cat_features = [
    i for i in range(n_columns)
    if len({v for v in X_train[:, i].tolist() if v == v}) <= MAX_CATEGORY_LEVELS
]
num_features = [i for i in range(n_columns) if i not in cat_features]

# Numerical branch: mean imputation followed by standardisation
num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation followed by one-hot encoding
cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each branch to its own columns and merge the results into one matrix.
# Chaining all steps in a single sequential Pipeline would instead apply every
# step (including the one-hot encoder) to every column.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Define the pipeline: preprocess, then keep the 10 highest-scoring features.
# Selection runs after preprocessing so it sees imputed, fully numeric input.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
])

# Fit the pipeline on the training data only, so no test statistics leak in
pipeline.fit(X_train, y_train)

# Transform the data using the fitted pipeline
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_transformed, y_test))

# This pipeline can be modified based on the specific needs of our machine learning project.


# Create a numerical pipeline that includes the following steps:

1. Load the dataset and split it into training and testing sets.
2. Identify the numerical features in the dataset.
3. Handle missing values in the numerical features using mean imputation.
4. Use an automated feature selection method, such as SelectKBest, to select the most important features in the dataset.
5. Scale the numerical features using StandardScaler to standardize the feature values.
6. Train a machine learning model on the selected features and evaluate its performance on the testing set.

In [None]:
# Python code to implement this numerical pipeline

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical features from the training data.
# A name-prefix test ('mean', 'area', 'worst') silently leaves out most of the
# continuous "... error" columns, so the split is derived from the values
# instead: a column counts as numerical when it has more than
# MAX_CATEGORY_LEVELS distinct values (a heuristic; tune it for your data).
MAX_CATEGORY_LEVELS = 10
# `v == v` is False only for NaN, so missing entries are not counted as levels.
num_features = [
    i for i in range(X_train.shape[1])
    if len({v for v in X_train[:, i].tolist() if v == v}) > MAX_CATEGORY_LEVELS
]

# Define the numerical pipeline
num_pipeline = Pipeline([
    # Handle missing values in numerical features using mean imputation
    ('imputer', SimpleImputer(strategy='mean')),
    # Use SelectKBest to select the most important features
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    # Scale the numerical features using StandardScaler
    ('scaler', StandardScaler()),
])

# Fit the numerical pipeline on the training data only
X_train_num = X_train[:, num_features]
num_pipeline.fit(X_train_num, y_train)

# Transform the numerical data using the fitted pipeline
X_train_num_transformed = num_pipeline.transform(X_train_num)
X_test_num_transformed = num_pipeline.transform(X_test[:, num_features])

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_num_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_num_transformed, y_test))


In [None]:
# Impute the missing values in the numerical columns using the mean of the column values

# Pipeline that handles missing values in the numerical columns using the mean of the column values:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical and categorical features from the training data.
# Splitting by name prefix would label continuous columns as categorical and
# one-hot encode them (one category per distinct training value, all zeros for
# unseen test values).  A column is treated as categorical only when it holds
# at most MAX_CATEGORY_LEVELS distinct values (a heuristic threshold).
MAX_CATEGORY_LEVELS = 10
n_columns = X_train.shape[1]
# `v == v` is False only for NaN, so missing entries are not counted as levels.
cat_features = [
    i for i in range(n_columns)
    if len({v for v in X_train[:, i].tolist() if v == v}) <= MAX_CATEGORY_LEVELS
]
num_features = [i for i in range(n_columns) if i not in cat_features]

# Define the numerical and categorical pipelines
num_pipeline = Pipeline([
    # Replace missing numerical values with the column mean learned on the training set
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine the numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Fit the preprocessor on the training data only and transform it in one pass
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data with the statistics learned from the training data
X_test_transformed = preprocessor.transform(X_test)

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_transformed, y_test))



In this pipeline, we use the SimpleImputer class to impute the missing values in the numerical columns using the mean of the column values. We define separate pipelines for the numerical and categorical features and combine them using ColumnTransformer. The pipeline handles missing values and scales the numerical features using StandardScaler.

Note that we also handle missing values in the categorical features using the most frequent value imputation and use OneHotEncoder to convert the categorical features into numerical features.

You can modify this pipeline based on your specific needs by adding or removing preprocessing steps or using different imputation or scaling methods.

In [None]:
# Scale the numerical columns using standardisation

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical and categorical features from the training data.
# Splitting by name prefix would label continuous columns as categorical, so
# they would be one-hot encoded instead of standardised.  A column is treated
# as categorical only when it holds at most MAX_CATEGORY_LEVELS distinct
# values (a heuristic threshold).
MAX_CATEGORY_LEVELS = 10
n_columns = X_train.shape[1]
# `v == v` is False only for NaN, so missing entries are not counted as levels.
cat_features = [
    i for i in range(n_columns)
    if len({v for v in X_train[:, i].tolist() if v == v}) <= MAX_CATEGORY_LEVELS
]
num_features = [i for i in range(n_columns) if i not in cat_features]

# Define the numerical and categorical pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # Standardise after imputation so the scaler never sees missing values
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine the numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Fit the preprocessor on the training data only and transform it in one pass
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data with the statistics learned from the training data
X_test_transformed = preprocessor.transform(X_test)

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_transformed, y_test))


+ In this modified pipeline, we use the StandardScaler class to scale the numerical columns using standardisation after imputing the missing values using the mean of the column values. The rest of the pipeline remains the same as the previous example.

+ Note that standardisation scales the data to have zero mean and unit variance, which can help the model converge faster and improve its performance. You can modify this pipeline based on your specific needs by adding or removing preprocessing steps or using different scaling or imputation methods.

In [None]:
# Impute the missing values in the categorical columns using the most frequent value of the columns

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical and categorical features from the training data.
# Splitting by name prefix would label continuous columns as categorical, and
# mode imputation / one-hot encoding are not meaningful for real-valued data.
# A column is treated as categorical only when it holds at most
# MAX_CATEGORY_LEVELS distinct values (a heuristic threshold).
MAX_CATEGORY_LEVELS = 10
n_columns = X_train.shape[1]
# `v == v` is False only for NaN, so missing entries are not counted as levels.
cat_features = [
    i for i in range(n_columns)
    if len({v for v in X_train[:, i].tolist() if v == v}) <= MAX_CATEGORY_LEVELS
]
num_features = [i for i in range(n_columns) if i not in cat_features]

# Define the numerical and categorical pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    # Replace missing categorical values with the most frequent training value
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine the numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Fit the preprocessor on the training data only and transform it in one pass
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data with the statistics learned from the training data
X_test_transformed = preprocessor.transform(X_test)

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_transformed, y_test))


+ In this modified pipeline, we use the SimpleImputer class with `strategy='most_frequent'` to impute the missing values in the categorical columns using the most frequent value of the column. The rest of the pipeline remains the same as the previous example.

+ Note that the choice of imputation strategy may depend on the nature of the missing values and the distribution of the data. You can modify this pipeline based on your specific needs by adding or removing preprocessing steps or using different imputation or encoding methods.

In [None]:
# One-hot encode the categorical columns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical and categorical features from the training data.
# Splitting by name prefix would send continuous columns to the one-hot
# encoder: nearly every training value would become its own category and
# unseen test values would be encoded as all zeros.  A column is treated as
# categorical only when it holds at most MAX_CATEGORY_LEVELS distinct values
# (a heuristic threshold).
MAX_CATEGORY_LEVELS = 10
n_columns = X_train.shape[1]
# `v == v` is False only for NaN, so missing entries are not counted as levels.
cat_features = [
    i for i in range(n_columns)
    if len({v for v in X_train[:, i].tolist() if v == v}) <= MAX_CATEGORY_LEVELS
]
num_features = [i for i in range(n_columns) if i not in cat_features]

# Define the numerical and categorical pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine the numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Fit the preprocessor on the training data only and transform it in one pass
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data with the statistics learned from the training data
X_test_transformed = preprocessor.transform(X_test)

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_transformed, y_test))


+ In this modified pipeline, we use the OneHotEncoder class to one-hot encode the categorical columns. The `handle_unknown='ignore'` parameter handles categories that appear in the test data but were not seen during fitting: instead of raising an error, such values are encoded as all zeros for that feature. The rest of the pipeline remains the same as the previous examples.

+ Note that the choice of encoding method may depend on the nature of the categorical data and the specific requirements of your machine learning model. You can modify this pipeline based on your specific needs by adding or removing preprocessing steps or using different encoding or imputation methods.

In [None]:
# Combine the numerical and categorical pipelines using a ColumnTransformer

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Load the dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify the numerical and categorical features from the training data.
# Splitting by name prefix would route continuous columns to the categorical
# branch of the ColumnTransformer.  A column is treated as categorical only
# when it holds at most MAX_CATEGORY_LEVELS distinct values (a heuristic
# threshold).
MAX_CATEGORY_LEVELS = 10
n_columns = X_train.shape[1]
# `v == v` is False only for NaN, so missing entries are not counted as levels.
cat_features = [
    i for i in range(n_columns)
    if len({v for v in X_train[:, i].tolist() if v == v}) <= MAX_CATEGORY_LEVELS
]
num_features = [i for i in range(n_columns) if i not in cat_features]

# Define the numerical and categorical pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine the numerical and categorical pipelines using ColumnTransformer.
# Each tuple is (name, transformer, column indices); the outputs of the
# branches are concatenated column-wise into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ])

# Fit the preprocessor on the training data only and transform it in one pass
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data with the statistics learned from the training data
X_test_transformed = preprocessor.transform(X_test)

# Train a logistic regression model on the transformed data and evaluate its performance
model = LogisticRegression()
model.fit(X_train_transformed, y_train)
print('Accuracy on the testing set:', model.score(X_test_transformed, y_test))


+ In this updated pipeline, we define two separate pipelines: one for the numerical features and one for the categorical features. We then use a 'ColumnTransformer' to combine these two pipelines and apply them to the appropriate columns of the input data.

+ The 'ColumnTransformer' is defined with a list of tuples, where each tuple corresponds to a pipeline for a specific type of column (in this case, numerical or categorical). Each tuple includes the following three elements:

1. A string that identifies the pipeline (in this case, 'num' or 'cat').
2. The pipeline object for that type of column (in this case, 'num_pipeline' or 'cat_pipeline').
3. The indices of the columns to which the pipeline should be applied (in this case, 'num_features' or 'cat_features').


+ Once the 'ColumnTransformer' is defined, we fit it to the training data and use it to transform both the training and testing data. Finally, we train a logistic regression model on the transformed data and evaluate its performance.

+ This pipeline automates the feature engineering process by imputing missing values, scaling numerical features, one-hot encoding categorical features, and combining the transformed features into a single dataset. You can modify this pipeline to suit your specific needs by adding or removing preprocessing steps or using different encoding or imputation methods.

In [None]:
# Use a Random Forest Classifier to build the final model

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# load the dataset (placeholder path: point this at your own CSV file)
data = pd.read_csv('your_dataset.csv')

# separate the target variable from the features
# ('target_variable' is a placeholder for the name of your label column)
X = data.drop('target_variable', axis=1)
y = data['target_variable']

# divide the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# derive the column groups from the dtypes instead of hard-coding names:
# a list ending in a literal `...` contains the Ellipsis object, which is not
# a column label and makes the ColumnTransformer fail when it is fitted.
# every column that is not numeric is treated as categorical.
numerical_columns = X.select_dtypes(include='number').columns.tolist()
categorical_columns = X.select_dtypes(exclude='number').columns.tolist()

# define the numerical pipeline: mean imputation, then standardisation
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# define the categorical pipeline: mode imputation, then one-hot encoding
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

# combine the numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_columns),
        ('cat', cat_pipeline, categorical_columns)
    ]
)

# define the final Random Forest Classifier model
# (random_state is fixed so repeated runs report the same accuracy)
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]
)

# train the model on the training set; preprocessing is fitted on it as well
model.fit(X_train, y_train)

# make predictions on the test set
y_pred = model.predict(X_test)

# calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Model Accuracy:', accuracy)


In [None]:
# Evaluate the accuracy of the model on the test dataset

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# assume X and y are your feature and target variables
# NOTE(review): the dtype-based column selection below assumes X is a pandas
# DataFrame, as built from `data.drop(...)` earlier in this file — confirm if
# X comes from somewhere else.
# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# derive the column groups from the dtypes; these names must exist before the
# ColumnTransformer below refers to them.  every non-numeric column is treated
# as categorical.
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# define the numerical and categorical pipelines
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' keeps predict() from raising when the test set
    # contains a category that was absent from the training set
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# combine the pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# define the final model pipeline
# (random_state is fixed so repeated runs report the same accuracy)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# make predictions on the test data and evaluate the model accuracy
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.2f}')


In [None]:
# Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then
# use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its accuracy.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

# Fetch the iris measurements (features) and species labels (target)
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Each base learner gets its own scaler so it is fitted independently
# inside the ensemble
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(random_state=42)),
])

# Hard voting: the ensemble predicts the class label chosen by the majority
# of the base learners
base_learners = [('rf', rf_pipeline), ('lr', lr_pipeline)]
voting_clf = VotingClassifier(estimators=base_learners, voting='hard')

# Fit the whole ensemble on the training portion
voting_clf.fit(X_train, y_train)

# Report the share of correctly classified held-out samples
accuracy = voting_clf.score(X_test, y_test)
print('Accuracy: {:.2f}'.format(accuracy))
