In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tpot import TPOTClassifier
import joblib  # For saving the model
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that file exists. Consider pointing it at a configurable data
# directory instead.
DATA_PATH = r"C:\Users\Habibur Rahaman\Downloads\data.csv"
raw_df = pd.read_csv(DATA_PATH)

# Column names applied positionally to the loaded frame (adjust if needed)
columns = ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
           'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
           'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
           'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
           'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
           'fractal_dimension_se', 'radius_worst', 'texture_worst',
           'perimeter_worst', 'area_worst', 'smoothness_worst',
           'compactness_worst', 'concavity_worst', 'concave points_worst',
           'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']

# The rename below is positional, so a file with a different number of columns
# cannot be labelled correctly; fail early with an explicit message.
if raw_df.shape[1] != len(columns):
    raise ValueError(
        f"Expected {len(columns)} columns in {DATA_PATH!r}, "
        f"found {raw_df.shape[1]}: {list(raw_df.columns)}"
    )

# Work on a copy so raw_df keeps the data exactly as it was loaded
df = raw_df.copy()
df.columns = columns

# Drop unnecessary columns
df = df.drop(columns=['id', 'Unnamed: 32'])

# Encode the target: 'M' -> 1, 'B' -> 0.
# Series.map yields NaN for any value missing from the mapping, so check the
# result before storing it rather than letting NaN labels reach the model.
diagnosis_codes = {'M': 1, 'B': 0}
diagnosis_encoded = df['diagnosis'].map(diagnosis_codes)
unmapped_rows = diagnosis_encoded.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(df.loc[unmapped_rows, 'diagnosis'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'diagnosis' column: {unexpected_labels}; "
        f"expected only {sorted(diagnosis_codes)}"
    )
df['diagnosis'] = diagnosis_encoded

# Define features and target
# No imputation is applied to the feature columns in this cell.
X = df.drop(columns='diagnosis')
y = df['diagnosis']

# Hold out 20% of the rows for the final evaluation.
# NOTE(review): no `stratify` argument is passed, so the class balance of the
# two partitions is whatever the seeded shuffle produces.
SEED = 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# AutoML search with TPOT: 5 generations, population of 50, same seed as the split
tpot_settings = dict(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=SEED,
)
tpot = TPOTClassifier(**tpot_settings)
tpot.fit(X_train, y_train)

# Predict labels for the held-out rows with the fitted TPOT model
y_pred = tpot.predict(X_test)

# Score the predictions against the true labels; the metrics are evaluated in
# the order listed and bound to the matching names on the left.
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Report the results as plain text
print(f"Accuracy: {accuracy:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

# Export the best pipeline found by TPOT to best_pipeline.py
tpot.export('best_pipeline.py')

# Persist the fitted pipeline object with joblib.
# This is the last expression in the cell, so joblib.dump's return value is
# shown as the cell output.
best_fitted_pipeline = tpot.fitted_pipeline_
joblib.dump(best_fitted_pipeline, 'best_model.pkl')


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9692307692307693

Generation 2 - Current best internal CV score: 0.9692307692307693

Generation 3 - Current best internal CV score: 0.9802197802197803

Generation 4 - Current best internal CV score: 0.9802197802197803

Generation 5 - Current best internal CV score: 0.9802197802197803

Best pipeline: RandomForestClassifier(GradientBoostingClassifier(input_matrix, learning_rate=0.5, max_depth=7, max_features=0.45, min_samples_leaf=11, min_samples_split=14, n_estimators=100, subsample=0.7000000000000001), bootstrap=False, criterion=entropy, max_features=0.9500000000000001, min_samples_leaf=9, min_samples_split=18, n_estimators=100)
Accuracy: 0.9649
Confusion Matrix:
[[65  2]
 [ 2 45]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        67
           1       0.96      0.96      0.96        47

    accuracy                           0.96       114
   macro avg      



['best_model.pkl']

In [5]:
pip install jupyterlab_widgets



