In [10]:
# Breast Cancer Detection with SVM and KNN

# Download Conda: install Python & Jupyter

# Using command prompt:

# cd tutorial

# conda install sklearn

# jupyter notebook

import sys  #Check system version.
import numpy
import matplotlib
import pandas
import sklearn

ModuleNotFoundError: No module named 'numpy'

In [None]:
print('Python: ()'.format.sys.version)
print('Numpy: ()'.format.numpy.__version__)
print('matplotlib: ()'.format.matplotlib.__version__)
print('pandas: ()'.format.pandas.__version__)
print('sklearn: ()'.format.sklearn.__version__)

# Python: 2.7.13
# Numpy: 1.14.0
# Matplotlib: 2.1.0
# pandas: 0.21.0
# sklearn: 0.19.1

In [8]:
# Import Packages/Libraries:

import numpy as np
from sklearn import preprocessing, cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.metrics import classification_report, accuracy_score
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import pandas as pd

# DeprecationWarning.

# Model_selection imports cross_validation

# Loading the dataset

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Names

names = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape', 'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

# DataFrame: Read csv file.

df = pd.read_csv(url, names = names)

ModuleNotFoundError: No module named 'numpy'

In [11]:
# Want to do preprocessing on data/ do visualizations.
# Important to visualize data to understand which machine learning algorithm to choose.

# Preprocess the data: We have a lot of data with some missing data.

df.replace('?', -99999, inplace=True)
print(df.axes)

df.drop(['id'], 1, inplace=True)  # Drop id class. 
# Don't want to do machine learning on the id column, because it doesn't tell us anything unique about the information, relevant to classification.


# OUT: RangeIndex(start=0, stop=699, step=1), Index([...])

# 699 different data points. 
# Each case has 10 different columns: u'id', u'clump_thickness', ...

# Print the shape of the dataset
print(df.shape)

# OUT: (699, 11) - includes id. Want to drop!!!

# Do dataset_visualizations.
print(df.loc[0])



#################################################

# clump_thickness         5
# uniform_cell_size       1
# uniform_cell_shape      1
# marginal adhesion       1
# signle_epithelial_size  2
# bare_nuclei             1
# bland_chromatin         3
# normal_nucleoli         1
# mitoses                 1
# class                   2

# Name: 0, dtype: object

##################################################

# print(df.loc[698]) - Malignant

# print(df.loc[6]) - Benign

# Class: 2 - Benign, 4 Malignant

print(df.describe())

# Gives mean and standard deviation.

# Mean is closer to 2.

# min and max between 1 and 10.

# 25% - 25 or less of data averages at 1.000000 for marginal_adhesion

# Plot histograms for each variable.

df.hist(figsize=(10,10))

plt.show()

# Nice output showing a histogram of each feature.

# most of these features have data at 1.

# Clump_thickness is easily distributed.

# Create scatter plot matrix.

scatter_matrix(df, figsize = (18,18))
plt.show()

# Scatter matrix: tells us if a linear classifier is a good classifier, or do we need to invest in more complicated linear fit.

# Shows relationship between one variable and every other variable.

# NOTE: Uniform_Cell_Shape and Uniform_Cell_Size both have a strong linear relationship.

# No one else has a good linear relationship.

# No easy way to classify.

NameError: name 'df' is not defined

In [12]:
# First step:

# Split into X and Y datasets for training.

X=np.array(df.drop(['class'],1))
y=np.array(df['class'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size = 0.2)

# Specify testing options.

seed = 8  # Makes it reproducible. W/ K means clustering algorithm. Starting with a random seed it will change the results each time. Defining a seed/staying consistent: will be able to reproduce each other's results.
scoring = 'accuracy'

# Define the models to train.

models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors = 5))) 
models.append(('SVM', SVC())


# Updates the list by adding an object to the list. 
# append(): It is basically used in Python to add one element. # extend(): Where extend(), is used to merge two lists or insert multiple elements in one list.

### Evaluate each model in turn.

results = []
names = []

for name, model in models:
  kfold = model_selection.KFold(n_splits = 10, random_state = seed)
  cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring = scoring)
  results.append(cv_result)
  names.append(name)
  msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
  print(msg)

# Randomly sorts the data.

# KNN: 0.967825 (0.03552)
# SVM: 0.948117 (0.024558)

# KNN - tries to cluster all the data points into groups.
# SVM - looking for optimal separating hyperplane to separate these data points into malignant cells and benign cells.

# Want to make predictions on validation dataset.

for name, model in models:
  model.fit(X_train, y_train) # Fit model with training data.
  predictions = model.predict(X_test) # Make predictions with test data.
  print(name)
  print(accuracy_score(y_test, predictions))
  print(classification_report(y_test, predictions))

# Check accuracy score & Classification Report using both KNN and SVM Models.

# On KNN, accuracy at 95%.

# On SVM, 0.9357142857...% accuracy.

# Precision: A measure of correctly predicted false positives. Ratio of correctly predicted positive observations over the total predicted positive observations.
# High precision: not that many false positives.
# At SVM, much lower score on the precision.

# In case of cancer, 
### WE WANT TO MINIMIZE THE NUMBER OF FALSE POSITIVES. ###

# Don't want to tell someone they have cancer when not actually the case.

# Recall is a matter of false negatives.

# No false negatives in SVM dataset.

# clf = SVC()

# clf.fit(X_train, y_train)

# accuracy = clf.score(X_test, y_test)

print(accuracy)

SyntaxError: invalid syntax (<ipython-input-12-dea7466aac35>, line 27)

In [None]:
# Build our own example of a breast cancer cell:

example = np.array([[4,2,1,1,1,2,3,2,10]]) # Use example of a malignant cancer cell.
example = example.reshape(len(example), -1)
prediction = clf.predict(example)
print(prediction)

# 0.9642857142857143 - CLASS 4 : Malignant

### IMPLIES: 96.42857142857143 % positive that the datapoints are malignant.

example = np.array([[4,2,1,1,1,2,3,2,4]]) # Use example of a benign cancer cell.
example = example.reshape(len(example), -1)
prediction = clf.predict(example)
print(prediction)

# 0.9642857...%
[2]

### QUICK REVIEW ###

# Imported data from UCI Repository.

# Preprocessed data by removing id column.

# .describe() for mean, max, min, quartiles...

# Used histogram to understand distribution of different features.

# Create scatter plot matrix to show relationships between some of our features. 
# Linear relationship between shape and size. As size increases, shape increases. If had highly linear relationships, would use linear methods effectively.

    #scatter_matrix(df, figsize = (10,10))
    #plt.show()

# Use training set, testing set, and validation set.

# Built KNN Classifier and Support Vector Classifier:
# KNN - groups data into clusters.
# Support vector - separates data by hyperplanes.

# Compare results using classification report with items such as accuracy, overall accuracy, precision and recall, as well as support.

# Finally, built our own cell and explored what it would take to actually get a malignant and benign classification. 
# Test last point on index: from 10 to 4.