Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#set column names
column_name = ['id', 'clump_thickness', 'size_uniformity', 'shape_uniformity', 
            'marginal_adhesion', 'epithelial_size', 'bare_nucleoli', 'bland_chromatin'
             , 'normal_nucleoli', 'mitoses', 'target']

df=pd.read_csv('C:/Users/windows10/Desktop/breast-cancer-wisconsin.csv',names=column_name)
print('Shape before dropna: ',df.shape)
#df.replace({"?":np.nan},inplace=True)
#df.dropna(axis=0,inplace=True)
for c in column_name:
  df=df[pd.to_numeric(df[c],errors='coerce').notnull()]
print('Shape after dropna: ',df.shape)
df.head()

Shape before dropna:  (699, 11)
Shape after dropna:  (683, 11)


Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,target
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Dataset Scaling

In [2]:
def sscaler(data):
    scaler_train =  data.drop(['target','id'], axis=1)
    std = StandardScaler()
    Stand_scale = std.fit_transform(scaler_train)
    
    Stand_scale = pd.DataFrame(Stand_scale, columns = scaler_train.columns)

    return Stand_scale

Decision Tree

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def D_tree(X_train, X_test, y_train, y_test, kf, mode):
    
    criterion='gini'
    if mode==0:
      #Using Gini 
      gini = DecisionTreeClassifier(criterion = criterion, random_state = 1, max_depth =2, min_samples_leaf = 5)
      gini.fit(X_train, y_train)
    
      y_pred = gini.predict(X_test)
    
      gini_accuracy = accuracy_score(y_pred, y_test)
      gini_training = accuracy_score(gini.predict(X_train), y_train)
    
      print('test_accuracy(gini):', accuracy_score(y_pred, y_test))
      print('training_accuracy(gini):', accuracy_score(gini.predict(X_train), y_train))
      print()
    
    else:
      #Using Entropy
      criterion='entropy'
      entropy = DecisionTreeClassifier(criterion = criterion, random_state = 1, max_depth=5, min_samples_leaf = 5)
      entropy.fit(X_train, y_train)

      y_pred = entropy.predict(X_test)
    
      entropy_accuracy = accuracy_score(y_pred, y_test)
      entropy_training = accuracy_score(entropy.predict(X_train), y_train)
    
      print('test_accuracy(entropy):', accuracy_score(y_pred, y_test))
      print('training_accuracy(entropy):', accuracy_score(entropy.predict(X_train), y_train))
      print()
     
    from sklearn.model_selection import GridSearchCV
    
    param_grid = {
            "max_depth": [2, 3, 4, 5, 6, 7, 10],
            'max_features': [None, 'sqrt', 'log2', 3],
            'min_samples_leaf': [1, 2, 3],
            "min_samples_split": [2, 3, 4, 5, 6, 10]
            }
    
    grid_dtree = GridSearchCV(DecisionTreeClassifier(criterion=criterion,random_state=42),
                              param_grid=param_grid,cv=kf)
    
    grid_dtree.fit(X_train, y_train)
    
    best_params = grid_dtree.best_params_
    best_score = round(grid_dtree.best_score_, 4)
    return best_params, best_score

SVM

In [None]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def svm(X_train, X_test, y_train, y_test, kf):
    
    #svm hyperparameter
    #C: penalty for misclassified data
    #gamma: controls the distance of influence of a single training point
    #kernel: default='rbf'
      
    param_grid = {
            "C": [10,1,0.1,0.01],
            'gamma':[10,1,0.1,0.01,0.001],
            'kernel': ['linear','poly','rbf','sigmoid']
            }
    grid_svm = GridSearchCV(SVC(random_state=42),
                              param_grid=param_grid,cv=kf)
    
    grid_svm.fit(X_train, y_train)
    
    best_params = grid_svm.best_params_               
    best_score = round(grid_svm.best_score_, 4) 
    
    return best_params, best_score

Logistic Regression

In [5]:
def LogisticRegressor(X_train, X_test, y_train, y_test,kf):
    #reg = LogisticRegression(solver = 'lbfgs', max_iter=1000)
    #reg.fit(X_train, y_train)
    #train_score = reg.score(X_train, y_train)
    #test_score = reg.score(X_test, y_test)
    #cv_scores = cross_val_score(reg, X_test, y_test, cv=5)
    param_grid={
        #"penalty":['l1','l2','elasticnet','none'],
        "C":[10,1,0.1,0.01,0.001],
        "solver": ['newton-cg','lbfgs','liblinear','sag','saga'],
        "max_iter":[1000,1500,2000]
        }
    grid_logistic=GridSearchCV(LogisticRegression(),
                            param_grid=param_grid,cv=kf)
    grid_logistic.fit(X_train, y_train)
    best_params=grid_logistic.best_params_
    best_score=round(grid_logistic.best_score_,4)
    return best_params, best_score

Run models

In [6]:
from sklearn.model_selection import KFold
def run_model(df, n_split, model,scale):
    #if scale parameter is 1, scale the dataset
    if scale==1:
        X=sscaler(df)
    else:
        X=df.drop(columns=['target','id'])
    y=df['target']

    kf=KFold(n_splits=n_split,shuffle=True,random_state=10)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,train_size=0.7)
    
    # Decision Tree with Gini
    if model==1:
        best_param, best_score=D_tree(X_train, X_test, y_train, y_test,kf,0)
    
    # Decision Tree with Entrophy
    elif model==2:
        best_param, best_score=D_tree(X_train, X_test, y_train, y_test,kf,1)
    
    #SVM
    elif model==3:
        best_param, best_score=svm(X_train, X_test, y_train, y_test,kf)
  
    #Logistic Regression
    else:
        best_param, best_score=LogisticRegressor(X_train, X_test, y_train, y_test,kf)
    print('best param: {}\nbest score: {}\n'.format(best_param, best_score))
    return best_param, best_score

Giant function

In [7]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

#model parameter: 
# 1: Decision Tree with Gini
# 2: Decision Tree with Entropy
# 3: SVM
# 4: Logistic Regression

model_list=['Decision Tree with Gini','Decision Tree with Entropy','SVM','Logistic Regression']
for s in (0,1):
    if s==1:
        print('Dataset is scaled\n')
    for model in (1,2,3,4):
        print('Model: {}\n'.format(model_list[model-1]))
        for k in (3,5,7):
            print('k value:',k)
            run_model(df, k, model,s)


Model: Decision Tree with Gini

k value: 3
test_accuracy(gini): 0.9414634146341463
training_accuracy(gini): 0.9665271966527197

best param: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 10}
best score: 0.9708

k value: 5
test_accuracy(gini): 0.9414634146341463
training_accuracy(gini): 0.9560669456066946

best param: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 10}
best score: 0.9602

k value: 7
test_accuracy(gini): 0.9317073170731708
training_accuracy(gini): 0.9497907949790795

best param: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
best score: 0.9518

Model: Decision Tree with Entropy

k value: 3
test_accuracy(entropy): 0.9560975609756097
training_accuracy(entropy): 0.9728033472803347

best param: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
best score: 0.9373

k value: 5
test_accuracy(entropy): 0.9512195121951219
trainin