# Intro

This notebook was created for learning purposes.

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

Dataset source: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory and preview the
# first five rows.
dataset = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')
dataset.head(n=5)

# EDA

In [None]:
# Summary statistics of the dataset; the percentiles are the pandas defaults
# (quartiles), spelled out explicitly.
dataset.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Summary of the `diagnosis` column on its own (it is used as the target later).
dataset.loc[:, 'diagnosis'].describe()

In [None]:
# Spearman rank correlation between the numeric columns.
# Numeric columns are selected explicitly: older pandas versions silently
# ignored non-numeric columns in `corr`, while pandas >= 2.0 raises on them.
# NOTE(review): `diagnosis` is presumably a string label ("M"/"B") — confirm;
# if so, the original call fails on current pandas.
get_corr = (
    dataset
    .drop(columns=["id", "Unnamed: 32"])
    .select_dtypes(include="number")
    .corr(method="spearman")
)

plt.figure(figsize=(20,8))
sns.heatmap(get_corr, annot=True, cmap='BrBG')
plt.show()

# Preprocessing

## Check for null values

In [None]:
# Missing values per column: absolute count and share of all rows (in percent).
null_counts = dataset.isnull().sum()
cols_null = pd.DataFrame({'Title': null_counts.index, 'Total': null_counts.values})
# Vectorised over the whole column instead of a row-by-row apply.
cols_null['Percentage'] = cols_null['Total'] / dataset.shape[0] * 100
cols_null

In [None]:
# The null-count summary is not used again; remove the name from the namespace.
del cols_null

## Check for duplicated values

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_mask = dataset.duplicated(keep='first')
dataset[duplicate_mask]

## Label encoding

In [None]:
# use it later
# from sklearn.preprocessing import OrdinalEncoder

# # define ordinal encoding
# encoder = OrdinalEncoder()
# # transform data
# result = encoder.fit_transform(dataset[['diagnosis']])
# encoder.inverse_transform(result)

## Check for imbalanced dataset 

In [None]:
# Number of rows per diagnosis class.
dataset.value_counts(subset=['diagnosis'])

In [None]:
# Diagnosis classes as a percentage of all rows in the dataset.
class_counts = dataset.value_counts(subset=['diagnosis'])
class_counts / dataset.shape[0] * 100

## Train-test split (TTS)

In [None]:
# Feature matrix: every column except the row id, the target and "Unnamed: 32".
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = dataset.drop(columns=["id", "diagnosis", "Unnamed: 32"])
# Target vector: the diagnosis label.
y = dataset['diagnosis']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

## Standardization

In [None]:
import time

# Imported here because this cell runs before the modeling cell that imports
# StandardScaler; without it a fresh "Restart & Run All" stops with a NameError.
from sklearn.preprocessing import StandardScaler

start_time = time.time()

# Standardize all feature columns in one call. StandardScaler scales every
# column independently, so the values match fitting one scaler per column.
# NOTE(review): X (and the train/test split) was created from `dataset` before
# this cell via `drop`, which returns a new frame, so this only changes
# `dataset` itself; the models get their scaling from the StandardScaler step
# inside each pipeline.
feature_columns = list(X.columns)
dataset[feature_columns] = StandardScaler().fit_transform(dataset[feature_columns])

print("--- %s seconds ---" % (time.time() - start_time))
dataset.head()

# Handle Imbalanced data

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Build two resampled variants of the training split: SMOTE oversampling and
# random undersampling of the majority class. Only the training split is
# resampled; X_test / y_test are not touched here.
oversample = SMOTE(random_state=1)
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=1)

X_train_o, y_train_o = oversample.fit_resample(X_train, y_train)
X_train_u, y_train_u = undersample.fit_resample(X_train, y_train)

# Training sets keyed by resampling strategy; the modeling cells iterate over
# this dict.
# NOTE(review): cross-validating on data that was resampled up front lets
# resampled rows land in validation folds — consider moving the sampler into
# the imblearn Pipeline instead.
data_train = {
    "origin": dict(X_train=X_train, y_train=y_train),
    "oversample": dict(X_train=X_train_o, y_train=y_train_o),
    "undersample": dict(X_train=X_train_u, y_train=y_train_u),
}

# Modeling

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Function to evaluate model

In [None]:
# ## function to evaluate model
from sklearn.metrics import accuracy_score
def eval_model(model,X_test,y_test, X_train, y_train):
    """Compare a fitted model's accuracy on the test data and the training data.

    Args:
        model: An already-fitted estimator exposing ``predict``.
        X_test, y_test: Features and true labels of the test data.
        X_train, y_train: Features and true labels of the training data.

    Returns:
        A one-row DataFrame with the columns "test score" and "train score",
        each holding the corresponding accuracy.
    """
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    scores = {"test score": [test_accuracy], "train score": [train_accuracy]}
    return pd.DataFrame(scores)

from sklearn.model_selection import cross_val_score
def get_cross_val(model, X_train, y_train):
    """Return the mean accuracy of ``model`` over 10-fold cross-validation.

    Args:
        model: Estimator (or pipeline) passed to ``cross_val_score``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The mean of the ten per-fold accuracy scores.
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='accuracy',
        cv=10,
    )
    return fold_scores.mean()

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# data_train
# Baseline: cross-validated accuracy of an unscaled SVC on each training variant.
for dt, split in data_train.items():
    score = get_cross_val(SVC(), split['X_train'], split['y_train'])
    print(dt, score)

In [None]:
# Scaled SVC: cross-validate on every training variant and remember the best one.
pipeline_svc = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', SVC())])
highest_score, highest_score_dt = 0, ""
for dt, split in data_train.items():
    score = get_cross_val(pipeline_svc, split['X_train'], split['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if score > highest_score:
        highest_score, highest_score_dt = score, dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# Fit the SVC pipeline on the best-scoring training variant, then report its
# accuracy on the held-out test set next to its accuracy on that training data.
best_split = data_train[highest_score_dt]
pipeline_svc.fit(best_split['X_train'], best_split['y_train'])
eval_model(pipeline_svc, X_test, y_test, best_split['X_train'], best_split['y_train'])

In [None]:
# Predictions on the training variant the SVC pipeline was fitted on.
# This used to predict on X_test, which made it a duplicate of the test-set
# prediction cell despite the variable's name.
y_pred_train = pipeline_svc.predict(data_train[highest_score_dt]['X_train'])
y_pred_train

In [None]:
# Labels predicted by the SVC pipeline for the held-out test set.
y_pred = pipeline_svc.predict(X=X_test)
y_pred

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Scaled logistic regression: cross-validate on every training variant and
# remember which variant scored best.
pipeline_LogRes = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', LogisticRegression())])
highest_score, highest_score_dt = 0, ""
for dt, split in data_train.items():
    score = get_cross_val(pipeline_LogRes, split['X_train'], split['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if score > highest_score:
        highest_score, highest_score_dt = score, dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# pipeline_LogRes = Pipeline(steps=[('smote', SMOTE(random_state=1)),("stdScaller", StandardScaler()), ('model', LogisticRegression())], verbose=True)
# pipeline_LogRes.fit(X_train, y_train)
# eval_model(pipeline_LogRes,X_test,y_test, X_train, y_train)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Scaled k-nearest-neighbours: cross-validate on every training variant and
# remember which variant scored best.
pipeline_KNN = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', KNeighborsClassifier())])
highest_score, highest_score_dt = 0, ""
for dt, split in data_train.items():
    score = get_cross_val(pipeline_KNN, split['X_train'], split['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if score > highest_score:
        highest_score, highest_score_dt = score, dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# # ('over', RandomOverSampler(random_state=1)),('under', RandomUnderSampler(random_state=1)),('smote', SMOTE(random_state=1)),
# pipeline_KNN = Pipeline(steps=[('over', RandomOverSampler(random_state=1)), ("stdScaller", StandardScaler()), ('model', KNeighborsClassifier())])
# pipeline_KNN.fit(X_train, y_train)
# eval_model(pipeline_KNN,X_test,y_test, X_train, y_train)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Scaled decision tree: cross-validate on every training variant and remember
# which variant scored best.
# The tree is seeded (same seed as the train/test split) so the reported score
# and the selected variant are reproducible across runs; an unseeded tree can
# produce different results each time the cell is executed.
pipeline_tree = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', DecisionTreeClassifier(random_state=123))])
highest_score = 0
highest_score_dt = ""
for dt in data_train:
    score = get_cross_val(pipeline_tree,data_train[dt]['X_train'], data_train[dt]['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if (score > highest_score):
        highest_score = score
        highest_score_dt = dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# pipeline_tree = Pipeline(steps=[("stdScaller", StandardScaler()),('smote', SMOTE()),('over', RandomOverSampler()),('under', RandomUnderSampler()), ('model', DecisionTreeClassifier())])
# pipeline_tree.fit(X_train, y_train)
# eval_model(pipeline_tree,X_test,y_test, X_train, y_train)

## Ensemble Method - Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
# Voting ensemble over the four pipelines defined in the earlier cells.
list_model = [("svc", pipeline_svc), ("lr", pipeline_LogRes), ("knn", pipeline_KNN), ('dtree', pipeline_tree)]
# The ensemble gets its own name. It used to be assigned to `pipeline_svc`,
# which replaced the SVC pipeline in the namespace and, when the cell was run
# a second time, put the voting pipeline inside its own estimator list.
pipeline_voting = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', VotingClassifier(list_model))])
highest_score = 0
highest_score_dt = ""
for dt in data_train:
    score = get_cross_val(pipeline_voting,data_train[dt]['X_train'], data_train[dt]['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if (score > highest_score):
        highest_score = score
        highest_score_dt = dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# from sklearn.ensemble import VotingClassifier

# list_model = [("svc", pipeline_svc), ("lr", pipeline_LogRes), ("knn", pipeline_KNN), ('dtree', pipeline_tree)]
# voting = VotingClassifier(list_model)
# voting.fit(X_train, y_train)
# eval_model(voting,X_test,y_test, X_train, y_train)

## Ensemble Method - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
# Scaled bagging ensemble: cross-validate on every training variant and
# remember which variant scored best.
# The estimator is seeded (same seed as the train/test split) so the bootstrap
# samples, and therefore the reported score, are reproducible across runs.
pipeline_bagging = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', BaggingClassifier(random_state=123))])
highest_score = 0
highest_score_dt = ""
for dt in data_train:
    score = get_cross_val(pipeline_bagging,data_train[dt]['X_train'], data_train[dt]['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if (score > highest_score):
        highest_score = score
        highest_score_dt = dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# from sklearn.ensemble import BaggingClassifier

# pipeline_bagging = Pipeline(steps=[("StdScaler", StandardScaler()), ("smote", SMOTE()), ("under", RandomUnderSampler()), ("over", RandomOverSampler()), ("model", BaggingClassifier())])
# pipeline_bagging.fit(X_train, y_train)
# eval_model(pipeline_bagging,X_test,y_test, X_train, y_train)

## Ensemble Method - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Scaled random forest: cross-validate on every training variant and remember
# which variant scored best.
# The forest is seeded (same seed as the train/test split) so the reported
# score and the selected variant are reproducible across runs.
pipeline_rf = Pipeline(steps=[("stdScaller", StandardScaler()), ('model', RandomForestClassifier(random_state=123))])
highest_score = 0
highest_score_dt = ""
for dt in data_train:
    score = get_cross_val(pipeline_rf,data_train[dt]['X_train'], data_train[dt]['y_train'])
    # Strict comparison: on a tie the earlier variant stays selected.
    if (score > highest_score):
        highest_score = score
        highest_score_dt = dt
print("highest score:",highest_score_dt, highest_score)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# pipeline_rf = Pipeline(steps=[("StdScaler", StandardScaler()), ("smote", SMOTE()), ("under", RandomUnderSampler()), ("over", RandomOverSampler()), ("model", RandomForestClassifier())])
# pipeline_rf.fit(X_train, y_train)
# eval_model(pipeline_rf,X_test,y_test, X_train, y_train)

## Hyperparameter Tuning

I chose Logistic Regression because it achieved the highest scores on both the training and the test data.

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the logistic-regression step of the pipeline.
# The grid is split by penalty because not every solver supports every
# penalty: 'l1' only works with 'liblinear' and 'saga'. A single crossed grid
# pairs 'l1' with 'newton-cg', 'lbfgs' and 'sag' as well, and each of those
# candidates fails to fit and only adds warnings and wasted work.
C_values = np.logspace(-4, 4, 20)
param_grid = [
    {
        'model__penalty': ['l1'],
        'model__C': C_values,
        'model__solver': ['liblinear', 'saga'],
        'model__max_iter': [4000],
    },
    {
        'model__penalty': ['l2'],
        'model__C': C_values,
        'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'model__max_iter': [4000],
    },
]
# 3-fold cross-validated grid search on the original (non-resampled) training split.
grid = GridSearchCV(estimator=pipeline_LogRes, param_grid=param_grid, scoring = 'accuracy',n_jobs = 2,cv=3)
grid.fit(X_train, y_train)

grid.best_params_


## Final Model

Note: the final model still needs to be recreated using a scikit-learn pipeline.

In [None]:
# Hard-voting ensemble of the pipelines in `list_model`, fitted on the
# original (non-resampled) training split and scored on both splits.
voting = VotingClassifier(estimators=list_model, voting="hard")
voting.fit(X_train, y_train)
eval_model(
    model=voting,
    X_test=X_test,
    y_test=y_test,
    X_train=X_train,
    y_train=y_train,
)


In [None]:
# Labels predicted by the voting ensemble for the held-out test set.
voting.predict(X=X_test)