# Breast Cancer Classification

## Attribute Information:

- ID number
- Diagnosis (M = malignant, B = benign)

### Ten real-valued features are computed for each cell nucleus:

- radius (mean of distances from center to points on the perimeter)
- texture (standard deviation of gray-scale values)
- perimeter
- area
- smoothness (local variation in radius lengths)
- compactness (perimeter^2 / area - 1.0)
- concavity (severity of concave portions of the contour)
- concave points (number of concave portions of the contour)
- symmetry
- fractal dimension ("coastline approximation" - 1)

##  Importing libraries

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below - confirm that silencing them is intended.
warnings.filterwarnings('ignore')


# Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')

## Load the data

In [None]:
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
df.head()

## Data Preprocessing

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns, which are not used as features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the columns are gone no longer raises
# a KeyError.
df = df.drop(columns = ['id', 'Unnamed: 32'], errors = 'ignore')

In [None]:
df.diagnosis.unique()

In [None]:
# Encode the target as integers: malignant ('M') -> 1, anything else -> 0.
# The dtype guard keeps the cell idempotent: once the column is numeric, a
# re-run must not compare 1/0 against 'M', which would relabel every row as 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype('int64')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# checking for null values

df.isna().sum()

In [None]:
# visualizing null values

msno.bar(df)

#### There are no missing values in the data.

## Exploratory Data Analysis (EDA)

In [None]:
# Distribution of every column of df, one panel per column.
# The grid is sized from the number of columns, so no column is silently
# skipped when the frame holds more columns than a fixed 5 x 6 grid has panels.
n_grid_cols = 6
n_grid_rows = -(-len(df.columns) // n_grid_cols)  # ceiling division

fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize = (20, 3 * n_grid_rows), squeeze = False)
axes = axes.ravel()

for ax, column in zip(axes, df.columns):
    # histplot with a KDE overlay on a density scale is the supported
    # replacement for the deprecated sns.distplot.
    sns.histplot(df[column], kde = True, stat = 'density', ax = ax)
    ax.set_xlabel(column)

# Hide the unused panels left over at the end of the grid.
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap, lower triangle only

corr = df.corr()

# True on and above the diagonal: those cells are hidden because they repeat
# the information shown below the diagonal.
mask = np.triu(np.ones_like(corr, dtype = bool))

fig, ax = plt.subplots(figsize = (20, 12))
sns.heatmap(data = corr, mask = mask, annot = True, fmt = ".2f", linewidths = 1, ax = ax)
plt.show()

### Many of the features are very highly correlated with one another, which causes multicollinearity, so we remove the highly correlated features.

In [None]:
# removing highly correlated features

# Correlations are computed between the predictors only. Leaving the target out
# guarantees that 'diagnosis' itself can never end up in the drop list.
corr_matrix = df.drop(columns = 'diagnosis').corr().abs()

# Mask the diagonal and everything above it so each feature pair is looked at once.
mask = np.triu(np.ones_like(corr_matrix, dtype = bool))
tri_df = corr_matrix.mask(mask)

# A column is dropped when its absolute correlation with any column listed
# after it exceeds 0.92.
to_drop = [x for x in tri_df.columns if (tri_df[x] > 0.92).any()]

df = df.drop(columns = to_drop)

print(f"The reduced dataframe has {df.shape[1]} columns.")

In [None]:
# Separate the target vector (y) from the feature matrix (X).

y = df['diagnosis']
X = df.drop(columns = 'diagnosis')

In [None]:
# Hold out 30% of the samples as the test set; the fixed seed makes the split
# identical on every run.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.30,
    random_state = 0,
)

In [None]:
# scaling data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# The scaler is fitted on the training split only and then reused for the test
# split, so the test data does not influence the scaling statistics.
# Both names are overwritten with the scaled arrays.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [None]:
# fitting data to model

from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings, fitted on the
# scaled training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
# model predictions

y_pred = log_reg.predict(X_test)

In [None]:
# accuracy score

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(accuracy_score(y_train, log_reg.predict(X_train)))

log_reg_acc = accuracy_score(y_test, log_reg.predict(X_test))
print(log_reg_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# K Neighbors Classifier (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with the library's default settings, fitted
# on the scaled training split.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
# model predictions 

y_pred = knn.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, knn.predict(X_train)))

knn_acc = accuracy_score(y_test, knn.predict(X_test))
print(knn_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Support Vector Classifier (SVC)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the kernel coefficient (gamma) and the regularisation
# parameter (C) of a support vector classifier, using GridSearchCV's default
# cross-validation settings. The candidate order is kept as written.
svc = SVC()
parameters = {
    'gamma' : [0.0001, 0.001, 0.01, 0.1],
    'C' : [0.01, 0.05, 0.5, 0.1, 1, 10, 15, 20],
}

grid_search = GridSearchCV(estimator = svc, param_grid = parameters)
grid_search.fit(X_train, y_train)

In [None]:
# best parameters

grid_search.best_params_

In [None]:
# best accuracy 

grid_search.best_score_

In [None]:
# Refit using the hyper-parameters selected by the SVC grid search instead of
# hand-copied values, which go stale whenever the search result changes.
# `grid_search` must still hold the SVC search here (run the cells in order).
svc = SVC(**grid_search.best_params_)
svc.fit(X_train, y_train)

In [None]:
# model predictions 

y_pred = svc.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, svc.predict(X_train)))

svc_acc = accuracy_score(y_test, svc.predict(X_test))
print(svc_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# SGD Classifier

In [None]:
import sklearn
from sklearn.linear_model import SGDClassifier

# The logistic loss is called 'log_loss' from scikit-learn 1.1 onwards; the old
# name 'log' was removed in 1.3. Pick whichever name the installed version
# accepts, so that no candidate is rejected during the search.
sklearn_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
log_loss_name = 'log_loss' if sklearn_version >= (1, 1) else 'log'

# SGD shuffles the training data, so a fixed seed is needed for the search to
# return the same result on every run.
sgd = SGDClassifier(random_state = 0)
parameters = {
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'loss' : ['hinge', log_loss_name],
    'penalty' : ['l1', 'l2']
}

grid_search = GridSearchCV(sgd, parameters, cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
# best parameter 

grid_search.best_params_

In [None]:
# Refit with the parameters chosen by the SGD grid search. Passing best_params_
# avoids hand-copied values such as loss = 'log', which newer scikit-learn
# releases no longer accept; the seed makes the fit reproducible.
sgd = SGDClassifier(**grid_search.best_params_, random_state = 0)
sgd.fit(X_train, y_train)

In [None]:
# model predictions 

y_pred = sgd.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, sgd.predict(X_train)))

sgd_acc = accuracy_score(y_test, sgd.predict(X_test))
print(sgd_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded so that candidates using splitter = 'random' receive the same score on
# every run; without it the selected parameters change between runs.
dtc = DecisionTreeClassifier(random_state = 0)

# Exhaustive grid over split criterion, tree depth, leaf/split sizes and
# splitter strategy.
parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : range(2, 32, 1),
    'min_samples_leaf' : range(1, 10, 1),
    'min_samples_split' : range(2, 10, 1),
    'splitter' : ['best', 'random']
}

grid_search_dt = GridSearchCV(dtc, parameters, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dt.fit(X_train, y_train)

In [None]:
# best parameters

grid_search_dt.best_params_

In [None]:
# best score

grid_search_dt.best_score_

In [None]:
# Refit with the parameters selected by the decision-tree grid search instead
# of values copied from one earlier run. The seed keeps the fitted tree
# reproducible even when the chosen splitter is 'random'.
dtc = DecisionTreeClassifier(**grid_search_dt.best_params_, random_state = 0)
dtc.fit(X_train, y_train)

In [None]:
y_pred = dtc.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, dtc.predict(X_train)))

dtc_acc = accuracy_score(y_test, dtc.predict(X_test))
print(dtc_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# max_features = 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
# deprecated in scikit-learn 1.1 and later removed, while 'sqrt' is accepted by
# old and new releases alike. random_state fixes the bootstrap samples and
# feature draws so the reported accuracy is reproducible.
rand_clf = RandomForestClassifier(
    criterion = 'entropy',
    max_depth = 11,
    max_features = 'sqrt',
    min_samples_leaf = 2,
    min_samples_split = 3,
    n_estimators = 130,
    random_state = 0,
)
rand_clf.fit(X_train, y_train)

In [None]:
y_pred = rand_clf.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, rand_clf.predict(X_train)))

ran_clf_acc = accuracy_score(y_test, y_pred)
print(ran_clf_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs that take part in the vote.
classifiers = [
    ('Logistic Regression', log_reg),
    ('K Nearest Neighbours', knn),
    ('Support Vector Classifier', svc),
    ('Decision Tree', dtc),
]

# No `voting` argument is given, so the library's default voting scheme is used.
vc = VotingClassifier(estimators = classifiers)

vc.fit(X_train, y_train)

In [None]:
y_pred = vc.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, vc.predict(X_train)))

vc_acc = accuracy_score(y_test, y_pred)
print(vc_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boost the decision tree `dtc` for 180 rounds. The tree is passed positionally
# because the keyword name differs between scikit-learn versions
# (`base_estimator` before 1.2, `estimator` afterwards); the positional form
# works with both.
ada = AdaBoostClassifier(dtc, n_estimators = 180)
ada.fit(X_train, y_train)

In [None]:
y_pred = ada.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, ada.predict(X_train)))

ada_acc = accuracy_score(y_test, y_pred)
print(ada_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

parameters = {
    # gbc.loss is the estimator's default loss. Reading it from the instance
    # avoids hard-coding its name, which was 'deviance' before scikit-learn 1.1
    # and is 'log_loss' in later releases ('deviance' is no longer accepted).
    'loss': [gbc.loss, 'exponential'],
    'learning_rate': [0.001, 0.1, 1, 10],
    'n_estimators': [100, 150, 180, 200]
}

grid_search_gbc = GridSearchCV(gbc, parameters, cv = 5, n_jobs = -1, verbose = 1)
grid_search_gbc.fit(X_train, y_train)

In [None]:
# best parameters 

grid_search_gbc.best_params_

In [None]:
# best score

grid_search_gbc.best_score_

In [None]:
# Refit with the parameters selected by the gradient-boosting grid search
# instead of hand-copied values, so the model always matches the search result.
gbc = GradientBoostingClassifier(**grid_search_gbc.best_params_)
gbc.fit(X_train, y_train)

In [None]:
y_pred = gbc.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, gbc.predict(X_train)))

gbc_acc = accuracy_score(y_test, y_pred)
print(gbc_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Stochastic Gradient Boosting (SGB)

In [None]:
# Gradient boosting with row subsampling (subsample < 1) and a restricted
# share of features per split (max_features < 1); seeded for reproducibility.
sgbc = GradientBoostingClassifier(
    n_estimators = 200,
    max_depth = 4,
    subsample = 0.9,
    max_features = 0.75,
    random_state = 0,
)

sgbc.fit(X_train, y_train)

In [None]:
y_pred = sgbc.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, sgbc.predict(X_train)))

sgbc_acc = accuracy_score(y_test, y_pred)
print(sgbc_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

# Extreme Gradient Boosting

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with a binary logistic objective, fitted on the scaled
# training split.
xgb = XGBClassifier(
    objective = 'binary:logistic',
    n_estimators = 180,
    max_depth = 5,
    learning_rate = 0.5,
)

xgb.fit(X_train, y_train)

In [None]:
y_pred = xgb.predict(X_test)

In [None]:
# accuracy score

print(accuracy_score(y_train, xgb.predict(X_train)))

xgb_acc = accuracy_score(y_test, y_pred)
print(xgb_acc)

In [None]:
# confusion matrix

print(confusion_matrix(y_test, y_pred))

In [None]:
# classification report

print(classification_report(y_test, y_pred))

In [None]:
# Test-set accuracy of every model, ranked from best to worst.
model_scores = [
    ('Logistic Regression', log_reg_acc),
    ('KNN', knn_acc),
    ('SVC', svc_acc),
    ('SGD Classifier', sgd_acc),
    ('Decision Tree Classifier', dtc_acc),
    ('Random Forest Classifier', ran_clf_acc),
    ('Voting Classifier', vc_acc),
    ('Ada Boost Classifier', ada_acc),
    ('Gradient Boosting Classifier', gbc_acc),
    ('Stochastic Gradient Boosting', sgbc_acc),
    ('XgBoost', xgb_acc),
]

models = pd.DataFrame(model_scores, columns = ['Model', 'Score'])

models.sort_values(by = 'Score', ascending = False)

### The best model for diagnosing breast cancer is the "Gradient Boosting Classifier", with a test accuracy of 98.8%.