<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [56]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,MinMaxScaler,MaxAbsScaler,RobustScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,f1_score, precision_score,recall_score,roc_auc_score,roc_curve
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
#Load the data and show the dataset 
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', 
    sep=',', 
    header=None)
df.columns=['Id','CumpThick','UnicellSize','UniCellShape','MargAdh','SingEpiCelSize','Bare Nuc','BlandChr','NormalNuc','Mitoses','Class']
print("Shape of the data:", df.shape)
df.head()

Shape of the data: (699, 11)


Unnamed: 0,Id,CumpThick,UnicellSize,UniCellShape,MargAdh,SingEpiCelSize,Bare Nuc,BlandChr,NormalNuc,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
df['Bare Nuc'] = df['Bare Nuc'].str.replace('?', '1')
df['Bare Nuc'].value_counts()

  df['Bare Nuc'] = df['Bare Nuc'].str.replace('?', '1')


1     418
10    132
2      30
5      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuc, dtype: int64

In [5]:
# Convert the data type to int
df['Bare Nuc'] = df['Bare Nuc'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Id              699 non-null    int64
 1   CumpThick       699 non-null    int64
 2   UnicellSize     699 non-null    int64
 3   UniCellShape    699 non-null    int64
 4   MargAdh         699 non-null    int64
 5   SingEpiCelSize  699 non-null    int64
 6   Bare Nuc        699 non-null    int32
 7   BlandChr        699 non-null    int64
 8   NormalNuc       699 non-null    int64
 9   Mitoses         699 non-null    int64
 10  Class           699 non-null    int64
dtypes: int32(1), int64(10)
memory usage: 57.5 KB


In [6]:
# Split data into X and y
X = df.drop(['Id','Class'], axis=1)
y = df['Class']

In [7]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Normalization
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [9]:
# Create a model dictionary
models = {"Logistic Regression": LogisticRegression(),
       
          "Support Vector Machine": SVC(probability=True),
          "Decision Tree": DecisionTreeClassifier(),
          
         }

In [10]:
for name, model in models.items():
    model.fit(X_train, y_train)

In [11]:
# Accuracy
print("Accuracy")
for name, model in models.items():
    print(name + ": {:.3f}".format(accuracy_score(y_test, model.predict(X_test))))

Accuracy
Logistic Regression: 0.977
Support Vector Machine: 0.989
Decision Tree: 0.949


In [54]:
def BestClassifier (X,y,to_scale =None,t_encode=None):
    """"   
     Find the best combination of scaler, encoder, fitting algorithm
    print best score and best combination
    Parameters --------------------------------
    X: DataFrame of predictors 
    y: DataFrame of labels 
    to_scale: an option whether the data is to be scaled = True if t needs scaling ., None if there is no need of encoding 
    to_encode: an option whether the data is to be encoded if t needs ecncoding, None if there is no need of encoding 
    scalers: list of scalers
            None: [StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler()]
            if you want to scale other ways, then put the sclaer in list
    encoders: list of encoders
        None: [OrdinalEncoder(), OneHotEncoder(),SVC(),]
        if you want to use only one, put a encoder in list
    models: list of models
        None: [DecisionTreeClassifier(criterion='entropy'),DecisionTreeClassifier(criterion='gini'),LogisticRegression(),SVC()]
        if you want to fit other ways, then put the model in the list
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    scalers_to_test =[StandardScaler(),MinMaxScaler(),MaxAbsScaler(),RobustScaler()]
    encoders_o_est=[OrdinalEncoder(),OneHotEncoder()]
    seed=42
    pipe = Pipeline(
    steps = [("scaler", StandardScaler()),("clf",LogisticRegression())])
    params = [
   
    {"scaler":scalers_to_test,
     "clf": [DecisionTreeClassifier()],
      "clf__criterion": ["gini","entropy"],
      "clf__splitter": ["best","random"],
      "clf__random_state": [seed],},
     {"scaler":scalers_to_test,
      "clf": [LogisticRegression()],
      "clf__solver": ['newton-cg','lbfgs', 'sag', 'saga'],
      "clf__penalty": ['l2']},
     {"scaler":scalers_to_test,
      "clf": [SVC()],
      "clf__C": [0.001,0.01,0.1],
      "clf__kernel": ['linear','poly', 'rbf', 'sigmoid'],
      "clf__degree": [1,2,3],
      "clf__gamma": [1,0.1,0.10,0.01]}
      ]
      # Only bootstrap == True.
    
    # Defining StratifiedKFold object -> 10-StratifiedFolds
    kfold = KFold(n_splits=10,
                        random_state=seed, shuffle=True)
    best_model = GridSearchCV(pipe, params, verbose=1).fit(X_train, y_train)
    
    print("Best: %f using %s" %(best_model.best_score_, best_model.best_params_)
    )
    predict = best_model.predict(X_test)
    print("Accuracy of testing: ",
            accuracy_score(y_test, predict), "\n")
    print("Confusion Matrix:\n",
            confusion_matrix(y_test,predict), "\n")
    print("Classification report:\n",
            classification_report(y_test,predict))

In [57]:
BestClassifier(X,y, to_scale=True)

Fitting 5 folds for each of 608 candidates, totalling 3040 fits
Best: 0.967546 using {'clf': SVC(C=0.01, degree=1, gamma=1, kernel='sigmoid'), 'clf__C': 0.01, 'clf__degree': 1, 'clf__gamma': 1, 'clf__kernel': 'sigmoid', 'scaler': StandardScaler()}
Accuracy of testing:  0.9714285714285714 

Confusion Matrix:
 [[116   2]
 [  3  54]] 

Classification report:
               precision    recall  f1-score   support

           2       0.97      0.98      0.98       118
           4       0.96      0.95      0.96        57

    accuracy                           0.97       175
   macro avg       0.97      0.97      0.97       175
weighted avg       0.97      0.97      0.97       175

