# Classification: Wisconsin Breast Cancer
- Dataset from UCI repository
- https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)

### Attribute Information:

1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Column labels for the raw data file, in file order; they are handed to
# read_csv through `names` because the labels are supplied here rather than
# read from the file.
names = [
    'Code',
    'ClumpThickness',
    'CellSize',
    'CellShape',
    'Adhesion',
    'SingleCellSize',
    'BareNuclei',
    'Chromatin',
    'Nucleoli',
    'Mitoses',
    'Class',
]
# Location of the Wisconsin breast-cancer data on the UCI server.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# Download and parse the file into a DataFrame.
cancer = pd.read_csv(filepath_or_buffer=url, names=names)

In [None]:
# Count how many samples fall under each value of the Class column
# (2 = benign, 4 = malignant) to see how balanced the data set is.
rows_by_class = cancer.groupby('Class')
class_counts = rows_by_class.size()
print(class_counts)

In [None]:
# Show the dtype pandas assigned to every column of the loaded frame.
column_dtypes = cancer.dtypes
print(column_dtypes)

In [None]:
# Convert the BareNuclei column to a numeric dtype.
# Anything that is not a number (the file uses '?' for missing values) becomes
# NaN. Unlike mapping string keys to integers, pd.to_numeric gives the same
# result when this cell is re-run on the already-converted column, instead of
# turning every value into NaN.
cancer['BareNuclei'] = pd.to_numeric(cancer['BareNuclei'], errors='coerce')
print(cancer['BareNuclei'].head(20))

In [None]:
# Replace the NaN values in BareNuclei with the column mean.
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on `cancer.BareNuclei` operates on an intermediate
# Series, which triggers a chained-assignment warning in recent pandas and
# leaves `cancer` unchanged when Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split further down.
bare_nuclei_mean = cancer['BareNuclei'].mean()
cancer['BareNuclei'] = cancer['BareNuclei'].fillna(bare_nuclei_mean)
cancer.head(150)

In [None]:
# Pairwise Pearson correlation between the columns of `cancer`; as the last
# expression of the cell, the resulting matrix is what the notebook displays.
cancer.corr(method = 'pearson')

In [None]:
# Keep only the 'Class' column of the correlation matrix: how strongly each
# column correlates with the benign/malignant label.
cancer.corr()['Class']

### 지금까지 배운 모델을 사용해 유방암 여부를 예측하는 좋은 성능의 모델을 만들어보세요.

In [None]:
# Binary target: recode Class (2 = benign, 4 = malignant) as 0 / 1 and store
# it in a new 'Result' column.
cancer['Result'] = cancer['Class'].map({2: 0, 4: 1})
Y = cancer['Result']

# Predictor columns used by every model below (Code and Mitoses are not
# included).
feature_cols = [
    'ClumpThickness',
    'CellSize',
    'CellShape',
    'Adhesion',
    'SingleCellSize',
    'BareNuclei',
    'Chromatin',
    'Nucleoli',
]
X = cancer[feature_cols]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. No test_size is given, so the
# library default proportion applies; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, Y, random_state=123)
X_train, X_test, Y_train, Y_test = split_parts
# Displayed by the notebook: (number of held-out rows, number of features).
X_test.shape

In [None]:
# k-nearest-neighbours classifier.
from sklearn.neighbors import KNeighborsClassifier

# Scores recorded for each k that was tried
# (accuracy / sensitivity / specificity):
#   k = 1 : 0.97 / 0.94 / 0.98
#   k = 3 : 0.97 / 0.95 / 0.98
#   k = 5 : 0.98 / 0.98 / 0.98
#   k = 7 : 0.99 / 1.0  / 0.98   <-- selected
#   k = 9 : 0.98 / 0.98 / 0.97
# NOTE(review): if these scores were measured on the test split, k was tuned
# on the test data -- confirm.
knn = KNeighborsClassifier(n_neighbors=7)

In [None]:
# Train the k-NN model on the training split.
knn.fit(X=X_train, y=Y_train)

In [None]:
from sklearn import metrics

# Predict the held-out rows with the fitted k-NN model.
Y_pred = knn.predict(X_test)
# Score the predictions: overall accuracy plus the confusion matrix.
accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred)
cm = metrics.confusion_matrix(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Report the k-NN accuracy and the raw confusion matrix first.
print(accuracy)
print(cm)

# The matrix is laid out as rows = actual class, columns = predicted class,
# so with targets 0 (benign) / 1 (malignant) its four cells are:
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]
for caption, count in (
    ('True Positives:', TP),
    ('True Negatives:', TN),
    ('False Positives:', FP),
    ('False Negatives:', FN),
):
    print(caption, count)

# Sensitivity: share of actual positives that were predicted positive.
sensitivity = TP / float(TP + FN)
print('Sensitivity:', sensitivity)
# Specificity: share of actual negatives that were predicted negative.
specificity = TN / float(TN + FP)
print('Specificity:', specificity)

In [None]:
# Naive Bayes (Gaussian): fit on the training split, then predict the
# held-out rows.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X=X_train, y=Y_train)
Y_pred_2 = gnb.predict(X=X_test)

In [None]:
# Score the Naive Bayes predictions.
cm2 = metrics.confusion_matrix(y_true=Y_test, y_pred=Y_pred_2)
accuracy2 = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred_2)
print(accuracy2)
print(cm2)

# Rows of the matrix are the actual class, columns the predicted class.
TN, FP = cm2[0, 0], cm2[0, 1]
FN, TP = cm2[1, 0], cm2[1, 1]
print('True Positives:', TP)
print('True Negatives:', TN)
print('False Positives:', FP)
print('False Negatives:', FN)

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP).
actual_positives = float(TP + FN)
actual_negatives = float(TN + FP)
print('Sensitivity:', TP / actual_positives)
print('Specificity:', TN / actual_negatives)

In [None]:
# Logistic regression. C is the inverse regularisation strength, so the very
# large value used here leaves the model close to unregularised.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1e9)
logreg.fit(X=X_train, y=Y_train)
# Pair every feature name with its fitted coefficient.
coefficient_by_feature = list(zip(feature_cols, logreg.coef_[0]))
print(coefficient_by_feature)

In [None]:
# Hard 0/1 class labels for the held-out rows (predict() returns labels,
# not probabilities).
Y_pred_class = logreg.predict(X=X_test)

In [None]:
# Accuracy of the logistic-regression class predictions on the test split.
from sklearn import metrics

logreg_accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred_class)
print(logreg_accuracy)

In [None]:
# Show the logistic-regression confusion matrix
# (rows = actual class, columns = predicted class).
logreg_confusion = metrics.confusion_matrix(y_true=Y_test, y_pred=Y_pred_class)
print(logreg_confusion)

In [None]:
# Keep the logistic-regression confusion matrix and pull out its four cells
# (rows = actual class, columns = predicted class).
cm3 = metrics.confusion_matrix(y_true=Y_test, y_pred=Y_pred_class)
TN, FP = cm3[0, 0], cm3[0, 1]
FN, TP = cm3[1, 0], cm3[1, 1]
for caption, count in (
    ('True Positives:', TP),
    ('True Negatives:', TN),
    ('False Positives:', FP),
    ('False Negatives:', FN),
):
    print(caption, count)

# Sensitivity: fraction of actual positives that were detected.
sensitivity = TP / float(TP + FN)
print('Sensitivity:', sensitivity)
# Specificity: fraction of actual negatives that were correctly rejected.
specificity = TN / float(TN + FP)
print('Specificity:', specificity)

In [None]:
# predict_proba returns one column per class; keep the second column, i.e.
# the predicted probability of the class encoded as 1 (malignant).
class_probabilities = logreg.predict_proba(X_test)
Y_pred_prob = class_probabilities[:, 1]

In [None]:
# histogram of predicted probabilities
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(Y_pred_prob)
plt.xlim(0, 1)
plt.xlabel('Predicted probability of Cancer')
plt.ylabel('Frequency')

In [None]:
# Adjust the decision threshold to raise sensitivity (?): a row is labelled 1
# (malignant) as soon as its predicted probability exceeds the cutoff.
malignant_cutoff = 0.1
Y_pred_class_new = (Y_pred_prob > malignant_cutoff).astype(int)

In [None]:
# Re-score the logistic regression using the thresholded predictions.
cm4 = metrics.confusion_matrix(y_true=Y_test, y_pred=Y_pred_class_new)
thresholded_accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred_class_new)
print('accuracy : ', thresholded_accuracy)
print(cm4)

In [None]:
# Cells of the thresholded confusion matrix
# (rows = actual class, columns = predicted class).
TN, FP = cm4[0, 0], cm4[0, 1]
FN, TP = cm4[1, 0], cm4[1, 1]

# Sensitivity: fraction of actual positives that were detected.
sensitivity = TP / float(TP + FN)
print('Sensitivity:', sensitivity)
# Specificity: fraction of actual negatives that were correctly rejected.
specificity = TN / float(TN + FP)
print('Specificity:', specificity)

In [None]:
from sklearn.metrics import roc_curve, auc
# StratifiedKFold is imported from sklearn.model_selection, the module this
# notebook already takes train_test_split from; the old
# sklearn.cross_validation module no longer exists and importing it fails.
from sklearn.model_selection import StratifiedKFold

In [None]:
# L1-regularised logistic regression used to produce scores for the ROC curve.
# The solver is set explicitly because the L1 penalty is not supported by
# scikit-learn's current default solver (lbfgs); liblinear handles it.
classifier = LogisticRegression(penalty='l1', C=0.2, solver='liblinear')
# decision_function returns a signed score per test row (not a 0/1 label),
# which is what roc_curve needs in order to sweep thresholds.
Y_score = classifier.fit(X_train, Y_train).decision_function(X_test)

In [None]:
# False/true positive rates across thresholds; the third value returned by
# roc_curve (the thresholds themselves) is not used.
fpr, tpr, _ = roc_curve(y_true=Y_test, y_score=Y_score)
# Area under the ROC curve.
roc_auc = auc(x=fpr, y=tpr)
print(roc_auc)

In [None]:
# Draw the ROC curve for the L1 logistic-regression scores.
plt.figure()
plt.title('Receiver operating characteristic example')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Dashed diagonal as the reference line, then the model's curve.
plt.plot([0,1], [0,1], 'k--')
plt.plot(fpr, tpr, label = 'ROC curve')
# A little headroom above 1.0 keeps the top of the curve visible.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()