In [206]:
import pandas as pd
import numpy as np
from decimal import Decimal
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [221]:
cancer_df = pd.read_csv('wdbc.data.csv').dropna()
cancer_df.head()

Unnamed: 0,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,...,1.22,1.23,1.24,1.25,1.26,1.27,1.28,1.29,1.30,1.31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [238]:
X = cancer_df.drop(['1', '1.1'], 1)
y = cancer_df['1.1']
#y.columns = ['target']
#y = y.where(y == "M", 0)
#y = y.where(y != "M", 1)

In [223]:
columnlist = []
for x in range(0,len(X.columns),1):
    columnlist.append("feature{}".format(x))

X.columns = columnlist
X.head()

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature20,feature21,feature22,feature23,feature24,feature25,feature26,feature27,feature28,feature29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [224]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Classification with no data engenering using naive bayes model

In [225]:
#naive bayes model for multiclassification
bnb = BernoulliNB()

bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 63.16%


In [226]:
conMat = confusion_matrix(y_test, y_pred)
conMat

array([[72,  0],
       [42,  0]], dtype=int64)

In [227]:
true_positive = conMat[0][0]/(conMat[0][0]+conMat[0][1])
true_positive

1.0

# k-fold for each confusion_matrix
# cross val for overfitting
# (f1 for presion + recall) lookover

# using SVM

In [228]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 95.61%


In [229]:
conMat = confusion_matrix(y_test, y_pred)
conMat

array([[72,  0],
       [ 5, 37]], dtype=int64)

In [230]:
true_positive = conMat[0][0]/(conMat[0][0]+conMat[0][1])
true_positive

1.0

In [231]:
true_negative = 100*round(Decimal(conMat[1][1]/(conMat[1][0]+conMat[1][1])), 2)
true_negative

Decimal('88.00')

# Do the two outcome have equal number of data points?

In [232]:
benign = y[y == 'B'].count()
malignant = y[y == 'M'].count()
print("Number of benign case: {}".format(benign))
print("Number of malignant case: {}".format(malignant))
print('Proportion:', round(benign / malignant, 2), ': 1')
#they seem to propotionate enough

Number of benign case: 357
Number of malignant case: 212
Proportion: 1.68 : 1


# Apply PCA to the data

In [239]:
###### High multi-corrilation, applying PCA
# Standardizing the features
X = pd.DataFrame(StandardScaler().fit_transform(X))

pca = PCA(n_components=0.9, svd_solver='full')
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents)

In [240]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Naive Bayes agian

In [241]:
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)

conMat = confusion_matrix(y_test, y_pred)
print(conMat)
true_positive = 100*round(Decimal(conMat[0][0]/(conMat[0][0]+conMat[0][1])), 2)
print('True positive: {}'.format(true_positive))
true_negative = 100*round(Decimal(conMat[1][1]/(conMat[1][0]+conMat[1][1])), 2)
print('True negative: {}'.format(true_negative))

[[71  1]
 [ 6 36]]
True positive: 99.00
True negative: 86.00


In [242]:
pca.explained_variance_ratio_

array([ 0.44272026,  0.18971182,  0.09393163,  0.06602135,  0.05495768,
        0.04024522,  0.02250734])

# SVM again

In [245]:
svm = SVC(kernel = 'linear')
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

conMat = confusion_matrix(y_test, y_pred)
print(conMat)
true_positive = 100*round(Decimal(conMat[0][0]/(conMat[0][0]+conMat[0][1])), 2)
print('True positive: {}'.format(true_positive))
true_negative = 100*round(Decimal(conMat[1][1]/(conMat[1][0]+conMat[1][1])), 2)
print('True negative: {}'.format(true_negative))

[[71  1]
 [ 3 39]]
True positive: 99.00
True negative: 93.00


# use different models (pick the best model)
# apply the models to differet datasets (how robust is your models is)