## Problem
Accurate and early detection of breast cancer plays a vital role in improving patient outcomes and survival rates. However, existing detection methods often have limitations in terms of accuracy and efficiency. The aim of this project is to develop an advanced breast cancer detection system using Support Vector Machines (SVM) that can effectively classify breast tissue samples as malignant or benign, enabling timely intervention and improved patient care.

## Objective(s)
Develop a breast cancer detection system using Support Vector Machines (SVM) that can accurately classify breast tissue samples as malignant or benign.

In [2]:
# Load libraries
import numpy as np 
import pandas as pd
import researchpy as rp 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [26]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI ML Repository dataset id=544 (estimation of obesity levels based
# on eating habits and physical condition).
# NOTE(review): the notebook introduction and several later cells (the
# 'diagnosis' target, the *_mean / *_se / *_worst columns, the
# '../models/breast_cancer' path) describe a breast cancer dataset, which this
# is not -- confirm which dataset is intended.
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# data (as pandas dataframes)
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

# Every later cell operates on a single frame named `data`, which was never
# created; join features and target column-wise so those cells can run.
data = pd.concat([X, y], axis=1)
  

In [27]:
# display the features frame (`.data.features` of the fetched dataset)
X

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation


In [28]:
# display the targets frame (`.data.targets` of the fetched dataset)
y

Unnamed: 0,NObeyesdad
0,Normal_Weight
1,Normal_Weight
2,Normal_Weight
3,Overweight_Level_I
4,Overweight_Level_II
...,...
2106,Obesity_Type_III
2107,Obesity_Type_III
2108,Obesity_Type_III
2109,Obesity_Type_III


## Exploring Data 

In [None]:
# check shape of data (displayed as the cell output)
data.shape

In [None]:
# dtype of every column; the later select_dtypes cells split on 'object' vs. non-object
data.dtypes

In [None]:
# info: print the frame summary
data.info()

In [None]:
# check missing data: number of null entries per column
data.isnull().sum()

## Descriptive statistics

In [None]:
# select numeric data: keep every column whose dtype is not 'object'
# (num_cols is reused by the summary-statistics cells below)
num_cols = data.select_dtypes(exclude = 'object')
num_cols.head()

In [None]:
# list the names of the non-object columns selected above
num_cols.columns

In [None]:
# summary statistics of numerical variables
# The 30 requested columns are ten measurements, each with a `_mean`, `_se`
# and `_worst` variant; the names are generated suffix-by-suffix so the column
# order matches an explicit listing.
# NOTE(review): none of these names appear among the columns displayed for `X`
# earlier in the notebook -- confirm they match the dataset actually loaded.
measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
summary_columns = [
    f"{measurement}_{suffix}"
    for suffix in ('mean', 'se', 'worst')
    for measurement in measurements
]
rp.summary_cont(num_cols[summary_columns])

In [None]:
# select categorical data: keep only the columns whose dtype is 'object'
# (cat_cols is reused by the categorical summary cell below)
cat_cols = data.select_dtypes(include = 'object')
cat_cols.head()

In [None]:
# list the names of the object-dtype columns selected above
cat_cols.columns

In [None]:
# summary statistics of categorical variables
# NOTE(review): 'diagnosis' is not among the columns displayed for X / y
# earlier in the notebook (the displayed target column is 'NObeyesdad') -- confirm.
rp.summary_cat(cat_cols['diagnosis'])

## Correlations between Variables 

In [None]:
# correlation: Pearson's (also the pandas default method)
# Restrict to numeric columns first: since pandas 2.0 `DataFrame.corr` no
# longer drops non-numeric columns silently and raises on string data instead.
data.select_dtypes(include="number").corr(method='pearson')

## Skewness 

In [None]:
# skew of each numeric column
# Non-numeric columns are excluded explicitly; since pandas 2.0 `DataFrame.skew`
# raises on string columns instead of dropping them.
data.select_dtypes(include="number").skew()

## Data visualizations

In [None]:
# Univariate distributions with histogram
# One histogram per non-object column, drawn on a single 20x10 figure.
numeric_frame = data.select_dtypes(exclude="object")
numeric_frame.hist(figsize=(20, 10), edgecolor='black')
plt.show()

In [None]:
# Univariate distributions with density plot
numeric_data = data.select_dtypes(exclude="object")

# Derive the subplot grid from the number of plotted columns (5 per row,
# rounded up) instead of hard-coding (6, 5): a fixed 30-slot grid raises when
# there are more than 30 columns and leaves mostly empty space when there are
# few. For exactly 30 columns this still yields (6, 5).
n_plot_cols = 5
n_plot_rows = max(1, -(-numeric_data.shape[1] // n_plot_cols))

numeric_data.plot(kind='density', subplots=True, sharex=False,
                  figsize=(20, 10), layout=(n_plot_rows, n_plot_cols))
plt.show()

In [None]:
# Univariate distributions with box plots
numeric_data = data.select_dtypes(exclude="object")

# Derive the subplot grid from the number of plotted columns (5 per row,
# rounded up) instead of hard-coding (6, 5): a fixed 30-slot grid raises when
# there are more than 30 columns and leaves mostly empty space when there are
# few. For exactly 30 columns this still yields (6, 5).
n_plot_cols = 5
n_plot_rows = max(1, -(-numeric_data.shape[1] // n_plot_cols))

numeric_data.plot(kind='box', subplots=True, sharex=False,
                  figsize=(20, 10), layout=(n_plot_rows, n_plot_cols))
plt.show()

In [None]:
# Multivariate plots with correlations
plt.figure(figsize=(20,6))
# Correlate numeric columns only: since pandas 2.0 `DataFrame.corr` raises on
# string columns instead of dropping them.
corr = data.select_dtypes(include="number").corr()
# annot=True writes each coefficient into its heatmap cell
sns.heatmap(corr, annot=True)
plt.show()

## Setup 

In [None]:
# examine first few rows of data
data.head()

In [None]:
# import pycaret classification and init setup
# The wildcard import supplies the bare names used by the following cells
# (setup, compare_models, create_model, tune_model, plot_model, ...).
from pycaret.classification import *
# session_id fixes the seed for the experiment.
# NOTE(review): 'diagnosis' is not among the columns displayed for X / y
# earlier in the notebook (the displayed target column is 'NObeyesdad') -- confirm.
setup(data, target = 'diagnosis', session_id = 123)

## Compare Models 

In [None]:
# compare baseline models; the result is kept as `best` and reused in the
# prediction cell further down
best = compare_models()

## Create Model 

In [None]:
# create model with the estimator id 'et'; `et` is the model that is tuned,
# plotted, evaluated and saved in the cells below
et = create_model('et')

In [None]:
# print model parameters (the printed representation of `et`)
print(et)

## Tune Model 

In [None]:
# tune hyperparameters of et (the model created above)
tuned_et = tune_model(et)

In [None]:
# to access the tuner object you can set return_tuner = True
# The call then unpacks into a (model, tuner) pair. Note that this calls
# tune_model a second time and rebinds tuned_et.
tuned_et, tuner = tune_model(et, return_tuner=True)

In [None]:
# display the tuned model
tuned_et

In [None]:
# display the tuner object returned alongside the tuned model
tuner

## Analyze Model 

In [None]:
# plot confusion matrix
# NOTE(review): this plots `et` (from create_model), not `tuned_et` -- confirm intended.
plot_model(et, plot = 'confusion_matrix')

In [None]:
# plot AUC for `et` (the model from create_model)
plot_model(et, plot = 'auc')

In [None]:
# plot class report for `et` (the model from create_model)
plot_model(et, plot = 'class_report')

In [None]:
# plot feature importance for `et` (the model from create_model)
plot_model(et, plot = 'feature')

## Evaluate Model 

In [None]:
# evaluate model `et` (the model from create_model)
evaluate_model(et)

## Finalize Model 

In [None]:
# finalize a model
# finalize_model returns the finalized model rather than updating `et` in
# place, so keep a reference to it instead of discarding the result.
final_et = finalize_model(et)
final_et

## Prediction

In [None]:
# predict on test set (predict_model is called without a `data` argument)
holdout_pred = predict_model(et)

In [None]:
# show predictions df (first rows of the hold-out predictions)
holdout_pred.head()

In [None]:
# Build a feature-only frame: `data` without its 'diagnosis' column.
# drop() returns a new frame, so `data` itself is left unchanged.
new_data = data.drop(columns='diagnosis')
new_data.head()

In [None]:
# predict model on new_data
# NOTE(review): this uses `best` (from compare_models) whereas the surrounding
# cells analyse, evaluate and save `et` -- confirm which model is intended.
predictions = predict_model(best, data = new_data)
predictions.head()

## Save Model 

In [None]:
# save pipeline: writes `et` under the relative path '../models/breast_cancer'
# NOTE(review): presumably the '../models' directory must already exist -- verify.
save_model(et, '../models/breast_cancer')

In [None]:
# load pipeline back from the path used by save_model above, then display it
loaded_best_pipeline = load_model('../models/breast_cancer')
loaded_best_pipeline