## Part 5 and 6

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw breast-cancer table from disk.
df = pd.read_csv('breast_cancer_data.csv')

# `DataFrame.drop` returns a new frame instead of modifying `df` in place, so
# the result has to be assigned back. Without the assignment the `id` column
# (presumably a record identifier, not a measurement -- verify against the
# dataset description) silently stays in the feature matrix below.
df = df.drop(columns='id')

# Features: every remaining column except the label.
X = df.drop(columns='diagnosis')
# Target: boolean Series, True where the diagnosis label is 'M'.
y = df['diagnosis'] == 'M'

# Display the class balance of the target.
y.value_counts()

diagnosis
False    357
True     212
Name: count, dtype: int64

5. Use GridSearchCV to tune the parameter of each of the above models. Can
you obtain better results in this step for any of the models? Discuss your
observations.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [5]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

Defining the parameter grids to tune each model:

In [6]:
from sklearn.model_selection import GridSearchCV

# Seed for every estimator that draws random numbers while fitting, so that a
# fresh "Restart & Run All" reproduces the same tuning results.
RANDOM_STATE = 42

# Defining a dictionary with all the models (keys are reused by `param_grid`)
models = {
    # The default of 100 solver iterations is often too few for lbfgs on
    # unscaled features (ConvergenceWarning), so allow more iterations.
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SGD': SGDClassifier(random_state=RANDOM_STATE),
    'SVM': SVC()
}

# Defining a parameter grid for each model, keyed by the same names as `models`
param_grid = {
    # C: inverse of the regularisation strength (smaller = stronger penalty)
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10, 100]},

    # Max depth, associated with flexibility of the model (None = unlimited)
    'Decision Tree': {'max_depth': [None, 10, 20, 30, 40, 50]},

    # n_estimators (number of trees to use)
    'Random Forest': {'n_estimators': [10, 50, 100, 200]},

    # alpha: multiplier of the regularisation term
    'SGD': {'alpha': [0.0001, 0.001, 0.01, 0.1]},

    'SVM': {
        # Regularisation parameter
        'C': [0.01, 0.1, 1, 10, 100],
        # Kernel to use
        'kernel': ['linear', 'rbf']}
}

After having defined the parameter grid, we can run a loop across the models to tune each of them, and take a look at the results:

In [7]:
# Tune each estimator in turn and report how the tuned model does on held-out data
for model_name, model in models.items():
    print(f"Working on model: {model_name}")

    # 5-fold cross-validated search over this model's grid, ranked by accuracy
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[model_name],
        scoring='accuracy',
        cv=5,
    ).fit(X_train, y_train)

    # Winning hyper-parameters and their cross-validated score
    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best Score for {model_name}: {grid_search.best_score_}")

    # Score the tuned model on the untouched test split
    y_pred = grid_search.predict(X_test)
    report = classification_report(y_test, y_pred)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Classification Report for {model_name}:\n{report}")
    print(f"Accuracy for {model_name}: {test_accuracy}\n")




6. Randomly (or based on certain hypothesis) remove some features and
re-evaluate the models. Document your observations with respect to models
performances.