# Here's a tabular comparison of TPOT and Random Forest


| Feature               | TPOT (Tree-based Pipeline Optimization Tool) | Random Forest                            |
|-----------------------|----------------------------------------------|-----------------------------------------|
| **Purpose**           | Automated machine learning (AutoML) tool that optimizes pipelines | Ensemble learning algorithm combining multiple decision trees |
| **Functionality**     | Constructs and optimizes pipelines, searches for models, preprocessing, and hyperparameters | Builds multiple decision trees and combines their outputs through averaging or voting |
| **Output**            | Produces a complete optimized pipeline, including preprocessing and model selection | Produces an ensemble model for classification or regression |
| **Usage**             | Automates model selection and tuning, ideal for users wanting quick results | Robust, interpretable model for classification or regression tasks |
| **Complexity**        | More complex, requiring setup and computational resources for optimization | Simpler to use with a well-defined algorithmic approach |
| **Model Variety**     | Can include various models (including Random Forest) as part of the optimization process | A specific model type focusing on decision trees |
| **Export Capability** | Can export the best pipeline to a Python file | Has no pipeline export; it is a single estimator focused on predictions, so any preprocessing must be added separately |
| **Overfitting Handling** | Selects pipelines by their internal cross-validation score, which helps limit overfitting during optimization | Reduces overfitting by averaging outputs from multiple trees |


# **Installing the Requirements**

In [1]:
!pip install ucimlrepo
!pip install tpot
!pip install streamlit


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... done
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.4/87.4 kB 4.1 MB/s eta 0:00:00
Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_

# **Loading the Breast Cancer Wisconsin (Diagnostic) Dataset from the UCI Machine Learning Repository**

In [None]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 17 from the UCI Machine Learning Repository
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Features and targets are exposed as pandas DataFrames
uci_data = breast_cancer_wisconsin_diagnostic.data
X, y = uci_data.features, uci_data.targets

# Print the dataset-level metadata first, then the per-variable information
for section in (
    breast_cancer_wisconsin_diagnostic.metadata,
    breast_cancer_wisconsin_diagnostic.variables,
):
    print(section)


# **Preparing Features and Target Variables for the Breast Cancer Dataset**

In [24]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tpot import TPOTClassifier
import pandas as pd
# Location of the CSV copy of the dataset. The '/content' prefix looks like a Colab
# path — change this one constant when running somewhere else.
DATA_PATH = '/content/breast-cancer.csv'
data = pd.read_csv(DATA_PATH)

# Alternative source: the frames returned by fetch_ucirepo (needs the fetch cell to have run):
#   X = breast_cancer_wisconsin_diagnostic.data.features
#   y = breast_cancer_wisconsin_diagnostic.data.targets

# Every column except the 'diagnosis' target is used as a feature.
# NOTE(review): if the CSV carries a row identifier (e.g. an 'id' column) it is kept
# as a feature here — confirm whether it should be dropped as well.
X = data.drop(columns='diagnosis')
y = data['diagnosis']

# **Data Preprocessing: Preparing the Breast Cancer Dataset for Modeling**

In [25]:
# Inspect the target's dtype to decide whether it needs encoding before modeling
y.dtypes

dtype('O')

In [26]:
# Collapse a single-column DataFrame target into a Series; a multi-element Series is returned as-is
y = y.squeeze()


In [27]:
# Turn the target into integers: booleans are cast to 0/1, object-typed
# (categorical) labels are label-encoded. Any other dtype is left untouched.
target_dtype = y.dtype
if target_dtype == 'bool':
    y = y.astype(int)
elif target_dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

In [28]:
def _columns_with_dtypes(frame, dtypes):
    """Return the names of the columns of `frame` whose dtype is listed in `dtypes`."""
    return frame.select_dtypes(include=dtypes).columns.tolist()


# Group the feature columns by dtype so each group can get its own preprocessing
numeric_cols = _columns_with_dtypes(X, ['float64', 'int64'])
categorical_cols = _columns_with_dtypes(X, ['object'])
boolean_cols = _columns_with_dtypes(X, ['bool'])

# **Building a Machine Learning Pipeline for the Breast Cancer Dataset**

In [29]:
# Preprocessing for numeric data (impute missing values with the column median)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Preprocessing for categorical data (impute with most frequent and one-hot encode).
# sparse_output=False keeps the encoded block dense: with sparse encoder output the
# ColumnTransformer may return a sparse matrix, which TPOT's default configuration
# does not accept.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Preprocessing for boolean data (impute with most frequent and encode as a single 0/1 column)
boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # drop='if_binary' keeps one indicator column for a two-valued feature
    ('boolean_to_int', OneHotEncoder(drop='if_binary', sparse_output=False))
])

# Bundle preprocessing for numeric, categorical, and boolean data.
# Columns that belong to none of the three lists are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('bool', boolean_transformer, boolean_cols)
    ])

In [30]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [31]:
# Display the assembled ColumnTransformer as the cell output
preprocessor

In [32]:
# TPOT's search is the final step, so it is fitted on the output of the preprocessor
tpot_search = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    random_state=42,
)

model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tpot_search),
])

In [33]:
# Display the full pipeline (preprocessor + TPOT classifier) as the cell output
model_pipeline

In [34]:
# Fit the preprocessor and run TPOT's pipeline search on the training split
model_pipeline.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9647784810126583

Generation 2 - Current best internal CV score: 0.9647784810126583

Generation 3 - Current best internal CV score: 0.9672468354430379

Generation 4 - Current best internal CV score: 0.9672468354430379

Generation 5 - Current best internal CV score: 0.9748101265822784

Best pipeline: GradientBoostingClassifier(LinearSVC(SelectFwe(MinMaxScaler(input_matrix), alpha=0.048), C=1.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.0001), learning_rate=0.5, max_depth=2, max_features=0.05, min_samples_leaf=3, min_samples_split=16, n_estimators=100, subsample=1.0)


In [35]:
# Score the fitted pipeline on the held-out test split and report the result
accuracy = model_pipeline.score(X_test, y_test)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 0.9766081871345029


In [36]:
# Pull the fitted TPOT step out of the pipeline and print the best pipeline it selected
tpot_classifier = model_pipeline.named_steps['classifier']
print(tpot_classifier.fitted_pipeline_)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('selectfwe', SelectFwe(alpha=0.048)),
                ('stackingestimator',
                 StackingEstimator(estimator=LinearSVC(dual=False, penalty='l1',
                                                       random_state=42))),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(learning_rate=0.5, max_depth=2,
                                            max_features=0.05,
                                            min_samples_leaf=3,
                                            min_samples_split=16,
                                            random_state=42))])


In [37]:
# Write the winning pipeline out as a standalone Python script, but only when the
# 'classifier' step really is a TPOT estimator (other estimators have no export()).
if not isinstance(tpot_classifier, TPOTClassifier):
    print("The model is not a TPOTClassifier.")
else:
    tpot_classifier.export('best_model_pipeline.py')
    print("Best model pipeline exported to 'best_model_pipeline.py'.")

Best model pipeline exported to 'best_model_pipeline.py'.


# **Training and Evaluating a Random Forest Classifier Pipeline on the Breast Cancer Dataset**

In [39]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Load the CSV copy of the dataset and separate the 'diagnosis' target from the features
data = pd.read_csv('/content/breast-cancer.csv')
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']
# Alternative source: the frames returned by fetch_ucirepo (needs the fetch cell to have run):
#   X = breast_cancer_wisconsin_diagnostic.data.features
#   y = breast_cancer_wisconsin_diagnostic.data.targets

# Automatically identify numeric, categorical, and boolean columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
boolean_cols = X.select_dtypes(include=['bool']).columns.tolist()

# Cast booleans to 0/1 and route them through the numeric branch. Without the second
# step they would be listed in neither transformer below and the ColumnTransformer
# (remainder='drop' by default) would silently discard them.
if boolean_cols:
    X[boolean_cols] = X[boolean_cols].astype(int)
    numeric_cols = numeric_cols + boolean_cols

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocessing for numeric data (impute with median)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Preprocessing for categorical data (impute with most frequent and one-hot encode, with sparse_output=False)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Bundle preprocessing for numeric (including the converted boolean) and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a pipeline with preprocessing and RandomForestClassifier
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model pipeline
model_pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out test split
accuracy = model_pipeline.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2%}")


Model Accuracy: 97.66%
