<a href="https://colab.research.google.com/github/juantijero/ML_HW/blob/main/HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# 1. Load the dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# 2. Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=69)

print("Data loaded and split successfully.")
print("Training features shape:", X_train.shape)
print("Validation features shape:", X_val.shape)

Data loaded and split successfully.
Training features shape: (455, 30)
Validation features shape: (114, 30)


In [18]:
# 1. Create the scaler object
scaler = StandardScaler()

# 2. Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# 3. Only transform the validation data
X_val_scaled = scaler.transform(X_val)

print("Data scaled successfully.")
print("Original mean of first feature:", X_train[:,0].mean())
print("Scaled mean of first feature:", X_train_scaled[:,0].mean())

Data scaled successfully.
Original mean of first feature: 14.024430769230769
Scaled mean of first feature: -1.3430038522059036e-15


In [19]:
# 1. Train and test LINEAR SVM
print("1. Linear SVM Results")
linear_svm = SVC(kernel='linear', random_state=42)
linear_svm.fit(X_train_scaled, y_train)
y_pred_linear = linear_svm.predict(X_val_scaled)
print(classification_report(y_val, y_pred_linear, target_names=cancer.target_names))

# 2. Train and test NON-LINEAR (RBF) SVM
print("2. Non-Linear (RBF) SVM Results")
rbf_svm = SVC(kernel='rbf', random_state=42)
rbf_svm.fit(X_train_scaled, y_train)
y_pred_rbf = rbf_svm.predict(X_val_scaled)
print(classification_report(y_val, y_pred_rbf, target_names=cancer.target_names))

# 3. Train and test Logistic Regression (for comparison)
print("3. Logistic Regression")
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_val_scaled)
print(classification_report(y_val, y_pred_log_reg, target_names=cancer.target_names))

1. Linear SVM Results
              precision    recall  f1-score   support

   malignant       0.98      0.94      0.96        54
      benign       0.95      0.98      0.97        60

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

2. Non-Linear (RBF) SVM Results
              precision    recall  f1-score   support

   malignant       0.98      0.94      0.96        54
      benign       0.95      0.98      0.97        60

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

3. Logistic Regression
              precision    recall  f1-score   support

   malignant       0.98      0.91      0.94        54
      benign       0.92      0.98      0.95        60

    accuracy                           0.95       114
   macro avg       0.95      0.95      0.95       1

In [20]:
# Experiment with different C and gamma
rbf_svm_tuned = SVC(kernel='rbf', C=10, gamma=0.1, random_state=69)

# Then just fit and predict as normal
rbf_svm_tuned.fit(X_train_scaled, y_train)
y_pred_tuned = rbf_svm_tuned.predict(X_val_scaled)
print(classification_report(y_val, y_pred_tuned, target_names=cancer.target_names))

              precision    recall  f1-score   support

   malignant       0.95      0.96      0.95        54
      benign       0.97      0.95      0.96        60

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



In [21]:
# Try a smaller C to increase regularization
rbf_svm_tuned = SVC(kernel='rbf', C=0.1, random_state=69)
rbf_svm_tuned.fit(X_train_scaled, y_train)
y_pred_tuned = rbf_svm_tuned.predict(X_val_scaled)
print("--- C=0.1 ---")
print(classification_report(y_val, y_pred_tuned, target_names=cancer.target_names))

# Try a smaller gamma to make the boundary smoother
rbf_svm_tuned = SVC(kernel='rbf', gamma=0.01, random_state=69)
rbf_svm_tuned.fit(X_train_scaled, y_train)
y_pred_tuned = rbf_svm_tuned.predict(X_val_scaled)
print("--- gamma=0.01 ---")
print(classification_report(y_val, y_pred_tuned, target_names=cancer.target_names))

--- C=0.1 ---
              precision    recall  f1-score   support

   malignant       0.94      0.87      0.90        54
      benign       0.89      0.95      0.92        60

    accuracy                           0.91       114
   macro avg       0.92      0.91      0.91       114
weighted avg       0.91      0.91      0.91       114

--- gamma=0.01 ---
              precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        54
      benign       0.94      0.98      0.96        60

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.96       114
weighted avg       0.96      0.96      0.96       114



In [22]:
from sklearn.model_selection import GridSearchCV

# 1. Define the parameter "grid" you want to search
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# 2. Create the GridSearch object
# This will test an RBF SVM
# It will use 5-fold cross-validation (cv=5) to test all combinations
grid_search = GridSearchCV(
    SVC(kernel='rbf', random_state=69), # The model to tune
    param_grid,                         # The parameters to test
    cv=5,                               # Number of folds
    scoring='accuracy',                 # The metric to optimize
    verbose=1                           # Shows progress
)

# 3. Run the search
# This will take a minute or two to run
print("Running Grid Search...")
grid_search.fit(X_train_scaled, y_train)

# 4. Print the best results
print("\nGrid Search Complete.")
print("The best parameters found are:", grid_search.best_params_)
print(f"The best accuracy found was: {grid_search.best_score_:.4f}")

Running Grid Search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Grid Search Complete.
The best parameters found are: {'C': 10, 'gamma': 0.01}
The best accuracy found was: 0.9780


In [26]:
from sklearn.model_selection import GridSearchCV

# 1. Define the parameter "grid" you want to search
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}

# 2. Create the GridSearch object
# This will test an RBF SVM
# It will use 5-fold cross-validation (cv=5) to test all combinations
grid_search = GridSearchCV(
    SVC(kernel='linear', random_state=69), # The model to tune
    param_grid,                         # The parameters to test
    cv=5,                               # Number of folds
    scoring='accuracy',                 # The metric to optimize
    verbose=1                           # Shows progress
)

# 3. Run the search
# This will take a minute or two to run
print("Running Grid Search...")
grid_search.fit(X_train_scaled, y_train)

# 4. Print the best results
print("\nGrid Search Complete.")
print("The best parameters found are:", grid_search.best_params_)
print(f"The best accuracy found was: {grid_search.best_score_:.4f}")

Running Grid Search...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Grid Search Complete.
The best parameters found are: {'C': 1, 'gamma': 0.001}
The best accuracy found was: 0.9758
