In [None]:
#Q1.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectFromModel

# Load your dataset (replace 'your_dataset.csv' with your dataset's filename)
data = pd.read_csv('your_dataset.csv')

# Split the data into features (X) and the target variable (y)
X = data.drop('target_column', axis=1)
y = data['target_column']

# Step 1: Feature Selection
# SelectFromModel keeps the features that a random forest ranks as important.
# The selector is only *defined* here; it is fitted inside the final pipeline
# (Step 5), after preprocessing, and only on the training split. Fitting it
# directly on the raw frame would (a) break on string columns, (b) return a
# NumPy array, which has no `select_dtypes`, and (c) leak test-set information
# into the selection.
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))

# Step 2: Numerical Pipeline
# Column types are read from the raw feature frame. 'number' matches every
# numeric dtype, so e.g. int32/float32 columns are not silently dropped by the
# ColumnTransformer below.
numerical_features = X.select_dtypes(include=['number']).columns
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),   # fill missing values with the column mean
    ('scaler', StandardScaler())                   # zero mean, unit variance
])

# Step 3: Categorical Pipeline
categorical_features = X.select_dtypes(include=['object']).columns
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill missing values with the mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # categories unseen in training encode as all zeros
])

# Step 4: Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Step 5: Final Pipeline
# Order matters: preprocess -> select features -> classify. Because every step
# lives in the pipeline, each one is fitted on the training data only.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Step 6: Split the data into training and testing sets
# The raw features are split; all transformations happen inside the pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train and evaluate the model
pipeline.fit(X_train, y_train)
accuracy = pipeline.score(X_test, y_test)  # mean accuracy on the held-out 20%
print(f'Model Accuracy: {accuracy:.2f}')

# Interpretation: The pipeline performs feature selection, numerical and categorical preprocessing, and model building.

#Interpretation:

#   In Step 1, we use a Random Forest Classifier to select important features.
#   Step 2 and Step 3 handle missing values and preprocessing for numerical and categorical features, respectively.
#   Step 4 combines these preprocessing steps.
#   Step 5 builds the final pipeline (preprocessing followed by the Random Forest Classifier).
#   Step 6 splits the dataset into training and testing sets.
#   Step 7 trains the pipeline and evaluates its accuracy on the test data.

#Possible Improvements:

 #   Hyperparameter tuning for the Random Forest Classifier.
#    Cross-validation for more robust model evaluation.
#    Experiment with other feature selection methods.
#    Explore additional data preprocessing steps if needed (e.g., feature engineering, outlier detection).

In [3]:
#Q2.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the Iris dataset and wrap it in pandas objects so the feature names are kept
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')

# Step 1: Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Define the individual classifiers
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
lr_classifier = LogisticRegression(max_iter=1000)

# Step 3: Create a list of (classifier name, classifier) tuples
classifiers = [('Random Forest', rf_classifier), ('Logistic Regression', lr_classifier)]

# Step 4: Create a Voting Classifier that combines the individual classifiers
# 'hard' voting predicts the majority class label across the estimators.
# NOTE(review): with only two estimators every disagreement is a tie; per the
# scikit-learn docs ties are resolved by ascending class order - confirm this is
# acceptable, or consider voting='soft' / adding a third estimator.
voting_classifier = VotingClassifier(estimators=classifiers, voting='hard')

# Step 5: Build a pipeline that includes the Voting Classifier
# The scaler is fitted on the training data only and applied to both estimators' inputs.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('voting', voting_classifier)
])

# Step 6: Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Step 7: Make predictions and evaluate accuracy on the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 1.00
Model Accuracy: 1.00
