# Importing required libraries

In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, recall_score
from sklearn.metrics import precision_score,classification_report,accuracy_score
from sklearn import metrics

# Loading Data

In [3]:
df = pd.DataFrame(load_breast_cancer()['data'],columns=load_breast_cancer()['feature_names'])
df['y'] = load_breast_cancer()['target']
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
df.shape

(569, 31)

In [48]:
print("Count of Zero's in label are",len(df[df.y==0]))
print("Count of one's in label are",len(df[df.y==1]))

Count of Zero's in label are 212
Count of one's in label are 357


# Check for missing values

In [5]:
df.isna().any()

mean radius                False
mean texture               False
mean perimeter             False
mean area                  False
mean smoothness            False
mean compactness           False
mean concavity             False
mean concave points        False
mean symmetry              False
mean fractal dimension     False
radius error               False
texture error              False
perimeter error            False
area error                 False
smoothness error           False
compactness error          False
concavity error            False
concave points error       False
symmetry error             False
fractal dimension error    False
worst radius               False
worst texture              False
worst perimeter            False
worst area                 False
worst smoothness           False
worst compactness          False
worst concavity            False
worst concave points       False
worst symmetry             False
worst fractal dimension    False
y         

# Information about all the variables

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

# Feature Selection

In [7]:
X = df.iloc[:, :-1] # Features
y = df.iloc[:, -1] # Target variable

# Spliting into train and test 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state = 1)

# Decision tree

In [9]:
d_clf = DecisionTreeClassifier()
d_clf.fit(X_train,y_train)

DecisionTreeClassifier()

#### Model performance

In [10]:
dy_pred = d_clf.predict(X_test)
print("confustion matrix\n",confusion_matrix(y_test, dy_pred))
print("\nAccuracy score = ",accuracy_score(y_test, dy_pred))
print("Recall score = ",recall_score(y_test, dy_pred))
print("Precision score = ",precision_score(y_test, dy_pred))
print("F1 score = ",f1_score(y_test, dy_pred))

confustion matrix
 [[39  3]
 [ 2 70]]

Accuracy score =  0.956140350877193
Recall score =  0.9722222222222222
Precision score =  0.958904109589041
F1 score =  0.9655172413793104


# Random forest

In [14]:
r_clf = RandomForestClassifier()
r_clf.fit(X_train,y_train)

RandomForestClassifier()

#### Model performance

In [15]:
ry_pred = r_clf.predict(X_test)
print("confustion matrix \n",confusion_matrix(y_test, ry_pred))
print("\nAccuracy score = ",accuracy_score(y_test, ry_pred))
print("Recall score = ",recall_score(y_test, ry_pred))
print("Precision score = ",precision_score(y_test, ry_pred))
print("F1 score = ",f1_score(y_test, ry_pred))

confustion matrix 
 [[37  5]
 [ 1 71]]

Accuracy score =  0.9473684210526315
Recall score =  0.9861111111111112
Precision score =  0.9342105263157895
F1 score =  0.9594594594594595


# Gradient Boosting

In [16]:
gradient_booster = GradientBoostingClassifier()
gradient_booster.fit(X_train,y_train)

GradientBoostingClassifier()

#### Model performance

In [17]:
gy_pred = gradient_booster.predict(X_test)
print("\n confusion matrix \n",confusion_matrix(y_test, gy_pred))
print("\nAccuracy score = ",accuracy_score(y_test, gy_pred))
print("Recall score = ",recall_score(y_test, gy_pred))
print("Precision score = ",precision_score(y_test, gy_pred))
print("F1 score = ",f1_score(y_test, gy_pred))
#print(classification_report(y_test,gradient_booster.predict(X_test)))


 confusion matrix 
 [[38  4]
 [ 0 72]]

Accuracy score =  0.9649122807017544
Recall score =  1.0
Precision score =  0.9473684210526315
F1 score =  0.972972972972973


# K-Nearest Neighbour

In [18]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

#### Model performance

In [19]:
ky_pred = knn.predict(X_test)
print("confustion matrix \n",confusion_matrix(y_test, ky_pred))
print("\nAccuracy score = ",accuracy_score(y_test, ky_pred))
print("Recall score = ",recall_score(y_test, ky_pred))
print("Precision score = ",precision_score(y_test, ky_pred))
print("F1 score = ",f1_score(y_test, ky_pred))

confustion matrix 
 [[37  5]
 [ 2 70]]

Accuracy score =  0.9385964912280702
Recall score =  0.9722222222222222
Precision score =  0.9333333333333333
F1 score =  0.9523809523809524


# Hyper-parameters of the Decision Tree model

In [20]:
clf = DecisionTreeClassifier()
tuned_parameters = {"criterion": ["gini", "entropy"], "max_depth": [10, 20, 50, 100],"min_samples_leaf":[10, 20, 50]}
cv_grid = GridSearchCV(clf, param_grid = tuned_parameters, scoring = 'roc_auc', verbose = 5)
cv_grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] criterion=gini, max_depth=10, min_samples_leaf=10 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=10, score=0.972, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=10 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=10, score=0.945, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=10 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=10, score=0.972, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=10 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=10, score=0.980, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=10 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=10, score=0.969, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=20 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=20, score=0.940, total=   0.0s
[CV] cri

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s



[CV] criterion=gini, max_depth=10, min_samples_leaf=20 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=20, score=0.988, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=20 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=20, score=0.980, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=50 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=50, score=0.946, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=50 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=50, score=0.956, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=50 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=50, score=0.980, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=50 ...............
[CV]  criterion=gini, max_depth=10, min_samples_leaf=50, score=0.981, total=   0.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=50 ..............

[CV]  criterion=entropy, max_depth=10, min_samples_leaf=10, score=0.953, total=   0.0s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=10 ............
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=10, score=0.980, total=   0.0s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=10 ............
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=10, score=0.965, total=   0.0s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=10 ............
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=10, score=0.968, total=   0.0s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=20 ............
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=20, score=0.931, total=   0.0s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=20 ............
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=20, score=0.948, total=   0.0s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=20 ............
[CV]  criterion=entropy, max_depth=10, min_samples_l

[CV]  criterion=entropy, max_depth=100, min_samples_leaf=50, score=0.984, total=   0.0s
[CV] criterion=entropy, max_depth=100, min_samples_leaf=50 ...........
[CV]  criterion=entropy, max_depth=100, min_samples_leaf=50, score=0.976, total=   0.0s
[CV] criterion=entropy, max_depth=100, min_samples_leaf=50 ...........
[CV]  criterion=entropy, max_depth=100, min_samples_leaf=50, score=0.972, total=   0.0s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    2.0s finished


GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 50, 100],
                         'min_samples_leaf': [10, 20, 50]},
             scoring='roc_auc', verbose=5)

#### Model performance

In [21]:
y_pred = cv_grid.predict(X_test)
print("confustion matrix \n",confusion_matrix(y_test, y_pred))
print("\nAccuracy score = ",accuracy_score(y_test, y_pred))
print("Recall score = ",recall_score(y_test, y_pred))
print("Precision score = ",precision_score(y_test, y_pred))
print("F1 score = ",f1_score(y_test, y_pred))

confustion matrix 
 [[29 13]
 [ 3 69]]

Accuracy score =  0.8596491228070176
Recall score =  0.9583333333333334
Precision score =  0.8414634146341463
F1 score =  0.8961038961038961


# Comparing the results of various models

In [22]:
models_scores_table = pd.DataFrame({  'Decision Tree':[ accuracy_score(y_test, dy_pred),
                                                        recall_score(y_test, dy_pred),
                                                        precision_score(y_test, dy_pred),
                                                        f1_score(y_test, dy_pred)],
                                       
                                      'Random Forest':[ accuracy_score(y_test,ry_pred),
                                                        recall_score(y_test, ry_pred),
                                                        precision_score(y_test, ry_pred),
                                                        f1_score(y_test, ry_pred)],
                                       
                                      'Gradient Boosting':[ accuracy_score(y_test, gy_pred),
                                                        recall_score(y_test, gy_pred),
                                                        precision_score(y_test, gy_pred),
                                                        f1_score(y_test, gy_pred)],
                                      'KNN':[ accuracy_score(y_test,ky_pred),
                                                        recall_score(y_test, ky_pred),
                                                        precision_score(y_test, ky_pred),
                                                        f1_score(y_test, ky_pred)],
                                      'Decision Tree(tuned)':[ accuracy_score(y_test,y_pred),
                                                        recall_score(y_test, y_pred),
                                                        precision_score(y_test, y_pred),
                                                        f1_score(y_test, y_pred)]},                                      
                                   index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
models_scores_table.head()

Unnamed: 0,Decision Tree,Random Forest,Gradient Boosting,KNN,Decision Tree(tuned)
Accuracy,0.95614,0.947368,0.964912,0.938596,0.859649
Precision,0.972222,0.986111,1.0,0.972222,0.958333
Recall,0.958904,0.934211,0.947368,0.933333,0.841463
F1 Score,0.965517,0.959459,0.972973,0.952381,0.896104


From the above data frame, we can observe the 'Accuracy', 'Precision', 'Recall', and 'F1 Score'  of different classification models. Almost all classification models are giving good results but Gradient boosting is outperformed.