# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost
import lightgbm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the Wisconsin dataset CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../content/wisconsin_data.csv')
df.head()

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns in place, then list the distinct
# label values. ('Unnamed: 32' is presumably an empty artifact of the CSV
# export -- verify against the raw file.)
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)
df['diagnosis'].unique()

In [None]:
# Encode the label as an integer: 1 where the value is 'M', 0 for anything else.
df['diagnosis'] = (df['diagnosis'] == 'M').astype('int64')
df.head()

In [None]:
# Summary statistics for the numeric columns of df.
df.describe()

In [None]:
# Print the column dtypes and non-null counts of df.
df.info()

In [None]:
# Correlation heatmap of every column in df, lower triangle only.
corr = df.corr()

# True marks cells to hide: the upper triangle, diagonal included.
mask = np.triu(np.ones(corr.shape, dtype=bool))

plt.figure(figsize=(20, 12))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", linewidths=1)
plt.show()

In [None]:
# Measurement names shared by the three column groups below; each group is
# 'diagnosis' followed by these names with one suffix appended.
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]

# 'diagnosis' plus the *_mean columns
m_col = ['diagnosis'] + [f'{name}_mean' for name in base_features]

# 'diagnosis' plus the *_se columns
s_col = ['diagnosis'] + [f'{name}_se' for name in base_features]

# 'diagnosis' plus the *_worst columns
w_col = ['diagnosis'] + [f'{name}_worst' for name in base_features]

# Pairwise plots of the columns listed in m_col, coloured by the diagnosis label.
sns.pairplot(data=df[m_col], hue='diagnosis', palette='Greens')

In [None]:
# Separate the label vector from the feature matrix.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with entropy splits chosen by the random splitter.
dtc = DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    max_depth=28,
    min_samples_split=8,
    min_samples_leaf=1,
).fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [None]:
# Accuracy of the decision tree on both splits; dtc_acc holds the test score.
dtc_train_pred = dtc.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, dtc_train_pred))

dtc_test_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(y_test, dtc_test_pred)
print("Test Accuracy:", dtc_acc)

# Random Forest

## 문제1. Random Forest를 이용하여 유방암 예측 모델을 만들고 학습 데이터와 테스트 데이터의 정확도를 출력하시오.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier; the seed is fixed so the reported scores are repeatable.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=0)
rf_clf.fit(X_train, y_train)

print("Train Accuracy:", accuracy_score(y_train, rf_clf.predict(X_train)))

# rf_acc holds the test score and is read by the model comparison table.
rf_acc = accuracy_score(y_test, rf_clf.predict(X_test))
print("Test Accuracy:", rf_acc)

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with the `dtc` estimator as its base learner and 200 boosting rounds.
ada = AdaBoostClassifier(dtc, n_estimators=200).fit(X_train, y_train)
y_pred = ada.predict(X_test)

In [None]:
# Accuracy of the AdaBoost model on both splits; y_pred is reused for the test score.
ada_train_pred = ada.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, ada_train_pred))

ada_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", ada_acc)

# GBM

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with exponential loss, 200 stages and a learning rate of 1.
gbm = GradientBoostingClassifier(
    loss='exponential',
    learning_rate=1,
    n_estimators=200,
).fit(X_train, y_train)
y_pred = gbm.predict(X_test)

In [None]:
# Accuracy of the gradient boosting model on both splits; y_pred is reused for the test score.
gbm_train_pred = gbm.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, gbm_train_pred))

gbm_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", gbm_acc)

# XGBoost

In [None]:
from xgboost import XGBClassifier

# Early stopping needs a monitoring set. Take it from the training data so the
# test split stays untouched; monitoring on the test split would tune the
# number of trees against the very data used to report accuracy.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0, stratify=y_train
)

# early_stopping_rounds is an estimator parameter (xgboost >= 1.6). Passing it
# to fit() is deprecated and rejected by recent releases.
xgb = XGBClassifier(n_estimators=200, early_stopping_rounds=20)

xgb.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])
y_pred = xgb.predict(X_test)

In [None]:
# Accuracy of the XGBoost model on both splits; y_pred is reused for the test score.
xgb_train_pred = xgb.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, xgb_train_pred))

xgb_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", xgb_acc)

In [None]:
# Plot the feature importances of the fitted XGBoost model.
xgboost.plot_importance(xgb)

# LGBM

## 문제2. Lightgbm을 이용하여 유방암을 예측하는 학습 모델을 만들고 학습 데이터와 테스트 데이터의 정확도를 출력하시오.

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier; the seed is fixed so the reported scores are repeatable.
lgbm = LGBMClassifier(n_estimators=200, random_state=0)
lgbm.fit(X_train, y_train)

print("Train Accuracy:", accuracy_score(y_train, lgbm.predict(X_train)))

# lgbm_acc holds the test score and is read by the model comparison table.
lgbm_acc = accuracy_score(y_test, lgbm.predict(X_test))
print("Test Accuracy:", lgbm_acc)

# SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with the library's default hyper-parameters.
svc = SVC().fit(X_train, y_train)
y_pred = svc.predict(X_test)

In [None]:
# Accuracy of the default SVC on both splits; y_pred is reused for the test score.
svc_train_pred = svc.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, svc_train_pred))

svc_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", svc_acc)

# 성능 분석

In [None]:
# Test accuracy per model, collected into one table and shown best-first.
score_by_model = {
    'Decision Tree': dtc_acc,
    'Random Forest': rf_acc,
    'AdaBoost': ada_acc,
    'GBM': gbm_acc,
    'XGboost': xgb_acc,
    'LightGBM': lgbm_acc,
    'SVM': svc_acc,
}
models = pd.DataFrame(list(score_by_model.items()), columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)

# Grid Search CV

In [None]:
# Estimator and candidate hyper-parameter values for the SVM grid search.
model = SVC()
param_grid = dict(
    gamma=[0.0001, 0.001, 0.01, 0.1],
    C=[0.01, 0.05, 0.5, 0.1, 1, 10, 15, 20],
)

In [None]:
# Exhaustive search over param_grid with 10-fold cross-validation on the training split.
gsc = GridSearchCV(estimator=model, param_grid=param_grid, cv=10)
gsc.fit(X_train, y_train)

In [None]:
# Report the outcome of the SVM grid search: best cross-validated score,
# the corresponding estimator, and its hyper-parameters.
print("\n Best Score is ")
print(gsc.best_score_)

print("\n Best Estimator is ")
print(gsc.best_estimator_)

print("\n Best Parameters are")
print(gsc.best_params_)

In [None]:
# Refit an SVC with the hyper-parameters the grid search selected, rather than
# hard-coded values that can drift from the search result.
svc_b = SVC(**gsc.best_params_)
svc_b.fit(X_train, y_train)

print("Train Accuracy:", accuracy_score(y_train, svc_b.predict(X_train)))
# Overwrites the score recorded earlier for the default SVC.
svc_acc = accuracy_score(y_test, svc_b.predict(X_test))
print("Test Accuracy:", svc_acc)

# Pipeline

In [None]:
# Three-step pipeline: standardise, project with PCA, then classify with XGBoost.
# The step names are the prefixes used when addressing step parameters.
model = xgboost.XGBClassifier()

pipeline = Pipeline(
    steps=[
        ('standard_scaler', StandardScaler()),
        ('pca', PCA()),
        ('model', model),
    ]
)

In [None]:
# Candidate settings for the pipeline steps, addressed as <step>__<parameter>.
param_grid = dict(
    pca__n_components=[5, 10, 15, 20, 25, 30],
    model__max_depth=[2, 3, 5, 7, 10],
    model__n_estimators=[10, 100, 500],
)

# 5-fold grid search scored by ROC AUC, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

In [None]:
%%time
# Run the pipeline grid search on the training split; the cell magic reports how long it takes.
grid.fit(X_train, y_train)

In [None]:
# Cross-validation mean and standard deviation of the score for the best
# parameter combination found by the pipeline grid search.
best = grid.best_index_
mean_score = grid.cv_results_["mean_test_score"][best]
std_score = grid.cv_results_["std_test_score"][best]

print(f"Best parameters: {grid.best_params_}")
print(f"Mean CV score: {mean_score: .6f}")
print(f"Standard deviation of CV score: {std_score: .6f}")