# Data pre-processing

In [None]:
# Author: Khoi Hoang
# Contact: hoanganhkhoil@gmail.com
# Project: Cancer Detector - applies machine learning to medical records provided by a Wisconsin hospital
# and predicts whether a case is normal or cancerous.

import numpy as np
import pandas as pd
from sklearn import preprocessing, svm
# `sklearn.cross_validation` was removed in scikit-learn 0.20; its splitter
# lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split

# Value written in place of the '?' placeholders that mark unknown inputs.
MISSING_SENTINEL = -999999
# Fixed seed so the train/test split, and therefore the reported accuracy,
# is the same on every Restart & Run All.
SEED = 42

# Load the dataset
raw_df = pd.read_csv('breast-cancer-wisconsin.data.txt')

# Clean the frame without mutating the raw one, so re-running this cell is safe:
#   1. replace unknown inputs with an extreme value,
#   2. drop the id column,
#   3. coerce every column to a numeric dtype (a column that contained '?' is
#      otherwise left as a mix of strings and ints); any other non-numeric
#      token raises here instead of failing later inside the classifier.
df = (
    raw_df
    .replace('?', MISSING_SENTINEL)
    .drop(columns=['id'])  # keyword form: positional `axis` is rejected by pandas >= 2.0
    .apply(pd.to_numeric)
)

# Separate the feature matrix X from the label vector y
X = np.array(df.drop(columns=['class']), dtype=np.float64)
y = np.array(df['class'])

# Split X and y into a training set (80%) and a testing set (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)


# Training

In [None]:
# Build a default-configured SVM classifier and fit it on the training split
# (`fit` returns the estimator itself, so the two steps can be chained).
clf = svm.SVC().fit(X_train, y_train)

# Score the fitted classifier on the held-out testing split
accuracy = clf.score(X_test, y_test)

# Run predictions for rows 10..19 of the full feature matrix, converted to
# float64 first because X may hold non-float entries.
X_sample = X[10:20].astype(np.float64)
predictions = clf.predict(X_sample)

# Testing

In [3]:
def _diagnosis(label):
    """Return the text printed for one class label: 2 is normal, anything else is cancer."""
    return "Normal" if label == 2 else "Cancer detected"


# Overall accuracy on the testing split, shown as a percentage
print("Accuracy: ", round(accuracy * 100, 2), "%\n")

print("Test some random 10 cases.\n")

# Ground-truth labels for rows 10..19
print("Actual output:\n")
for actual in y[10:20]:
    print(_diagnosis(actual))

print("\n")

# Classifier output for the same rows
print("Predicted output: \n")
for predicted in predictions:
    print(_diagnosis(predicted))

Accuracy:  97.86 %

Test some random 10 cases.

Actual output:

Normal
Normal
Cancer detected
Normal
Cancer detected
Cancer detected
Normal
Normal
Cancer detected
Normal


Predicted output: 

Normal
Normal
Cancer detected
Normal
Cancer detected
Cancer detected
Normal
Normal
Cancer detected
Normal
