<a href="https://colab.research.google.com/github/lamyeamaha/CSE475_Breast_Cancer_Detection/blob/main/Breast_Cancer_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

**Loading Dataset:**

In [None]:
dataset=pd.read_csv("/content/drive/MyDrive/Dataset/breast-cancer.data")

**Adding attribute's names:**

In [None]:
dataset.columns=['class','age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat']
print(dataset.head())

                  class    age menopause tumor-size inv-nodes node-caps  \
0  no-recurrence-events  40-49   premeno      20-24       0-2        no   
1  no-recurrence-events  40-49   premeno      20-24       0-2        no   
2  no-recurrence-events  60-69      ge40      15-19       0-2        no   
3  no-recurrence-events  40-49   premeno        0-4       0-2        no   
4  no-recurrence-events  60-69      ge40      15-19       0-2        no   

   deg-malig breast breast-quad irradiat  
0          2  right    right_up       no  
1          2   left    left_low       no  
2          2  right     left_up       no  
3          2  right   right_low       no  
4          2   left    left_low       no  


**Replacing missing values:**

In [None]:
dataset["node-caps"].replace({"?":""}, inplace=True)
dataset["breast-quad"].replace({"?":""}, inplace=True)

**Data Encoding:**

In [None]:
label_encoder = preprocessing.LabelEncoder()

dataset['class']= label_encoder.fit_transform(dataset['class'])
dataset['age']= label_encoder.fit_transform(dataset['age'])
dataset['menopause']= label_encoder.fit_transform(dataset['menopause'])
dataset['tumor-size']= label_encoder.fit_transform(dataset['tumor-size'])
dataset['inv-nodes']= label_encoder.fit_transform(dataset['inv-nodes'])
dataset['node-caps']= label_encoder.fit_transform(dataset['node-caps'])
dataset['breast-quad']= label_encoder.fit_transform(dataset['breast-quad'])
dataset['breast']= label_encoder.fit_transform(dataset['breast'])
dataset['irradiat']= label_encoder.fit_transform(dataset['irradiat'])

print(dataset.head())

   class  age  menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  \
0      0    2          2           3          0          1          2       1   
1      0    2          2           3          0          1          2       0   
2      0    4          0           2          0          1          2       1   
3      0    2          2           0          0          1          2       1   
4      0    4          0           2          0          1          2       0   

   breast-quad  irradiat  
0            5         0  
1            2         0  
2            3         0  
3            4         0  
4            2         0  


**Filling missing values:**

In [None]:
dataset.fillna(dataset.median(), inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import metrics

In [None]:
train, test = train_test_split(dataset, test_size=0.3)
train_features = train.iloc[:,1:10]
train_target = train["class"]
test_features = test.iloc[:,1:10]
test_target = test["class"]

In [None]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()

train_features = std_scaler.fit_transform(train_features)

test_features = std_scaler.fit_transform(test_features)

**Hyperparamets tuning for Random Forest Classifier:**

In [None]:
from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(train_features, train_target)


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [None]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 500}

**Random forest classifier:**

In [None]:
rfc = RandomForestClassifier(n_estimators=500,criterion='gini',max_depth= 4,max_features='auto')
rfc.fit(train_features,train_target)

predicted_target = rfc.predict(test_features)

print("Confusion matrix: \n")
print(confusion_matrix(test_target,predicted_target))
print("\nClassification report: \n")
print(classification_report(test_target,predicted_target))
print("Accuracy: {}".format(accuracy_score(test_target, predicted_target)))

Confusion matrix: 

[[60  5]
 [11 10]]

Classification report: 

              precision    recall  f1-score   support

           0       0.85      0.92      0.88        65
           1       0.67      0.48      0.56        21

    accuracy                           0.81        86
   macro avg       0.76      0.70      0.72        86
weighted avg       0.80      0.81      0.80        86

Accuracy: 0.813953488372093


**Decision Tree Classifier:**

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy',max_depth= 4)
dtc = dtc.fit(train_features,train_target)
y_pred = dtc.predict(test_features)

print("Confusion matrix: \n")
print(confusion_matrix(test_target,y_pred))
print("\nClassification report: \n")
print(classification_report(test_target,y_pred))
print("Accuracy: {}".format(accuracy_score(test_target, y_pred)))

Confusion matrix: 

[[57  8]
 [11 10]]

Classification report: 

              precision    recall  f1-score   support

           0       0.84      0.88      0.86        65
           1       0.56      0.48      0.51        21

    accuracy                           0.78        86
   macro avg       0.70      0.68      0.68        86
weighted avg       0.77      0.78      0.77        86

Accuracy: 0.7790697674418605


**Support Vector Machine Classifier:**

In [None]:
from sklearn import svm

SVM = svm.SVC(kernel = 'rbf', C = 15, probability = True)
SVM.fit(train_features,train_target)
y_pred2 = SVM.predict(test_features)

print("Confusion matrix: \n")
print(confusion_matrix(test_target,y_pred2))
print("\nClassification report: \n")
print(classification_report(test_target,y_pred2))
print("Accuracy: {}".format(accuracy_score(test_target, y_pred2)))

Confusion matrix: 

[[53 12]
 [ 8 13]]

Classification report: 

              precision    recall  f1-score   support

           0       0.87      0.82      0.84        65
           1       0.52      0.62      0.57        21

    accuracy                           0.77        86
   macro avg       0.69      0.72      0.70        86
weighted avg       0.78      0.77      0.77        86

Accuracy: 0.7674418604651163


**Hyperparameter tuning for XGBoost**

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

clf = xgb.XGBClassifier()
parameters = {
    'n_estimators': [100, 250, 500],
    'max_depth': [6, 9, 12],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.9, 1.0],
}

grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(train_features, train_target)


GridSearchCV(cv=3, estimator=XGBClassifier(), n_jobs=4,
             param_grid={'colsample_bytree': [0.9, 1.0],
                         'max_depth': [6, 9, 12],
                         'n_estimators': [100, 250, 500],
                         'subsample': [0.9, 1.0]},
             scoring='neg_log_loss')

In [None]:
grid.best_params_

{'colsample_bytree': 0.9,
 'max_depth': 9,
 'n_estimators': 100,
 'subsample': 1.0}

**XGBoost (gradient-boosted) classifier:**

In [None]:
from xgboost import XGBClassifier

xgc=XGBClassifier(n_estimators=100,max_depth= 9,subsample=1.0,colsample_bytree=0.9)
xgc.fit(train_features,train_target)
y_pred3 = xgc.predict(test_features)

print("Confusion matrix: \n")
print(confusion_matrix(test_target,y_pred3))
print("\nClassification report: \n")
print(classification_report(test_target,y_pred3))
print("Accuracy: {}".format(accuracy_score(test_target, y_pred3)))

Confusion matrix: 

[[49 16]
 [ 7 14]]

Classification report: 

              precision    recall  f1-score   support

           0       0.88      0.75      0.81        65
           1       0.47      0.67      0.55        21

    accuracy                           0.73        86
   macro avg       0.67      0.71      0.68        86
weighted avg       0.78      0.73      0.75        86

Accuracy: 0.7325581395348837
