In [None]:
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.metrics import classification_report, accuracy_score
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the Wisconsin breast-cancer dataset from the UCI repository.
# Directory listing: http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels handed to read_csv via names=, one label per column.
names = [
    'id',
    'clump_thickness',
    'uniform_cell_size',
    'uniform_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]

# Read the remote CSV into a DataFrame labelled with the names above.
df = pd.read_csv(url, names=names)

In [None]:
# Preprocess the data
# '?' marks a missing value in the raw file. It is replaced by the sentinel
# -9999 so the affected rows are kept instead of being dropped.
# NOTE(review): columns that contained '?' were presumably parsed as strings and
# therefore stay object dtype after this replacement -- confirm with df.dtypes.
df = df.replace('?', -9999)
print(df.axes)
# class: 4 => malignant (cancerous), 2 => benign

# Drop the 'id' column.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (the old `df.drop(['id'], 1)`) was removed in pandas 2.0.
# errors='ignore' keeps this cell re-runnable once 'id' is already gone.
df = df.drop(columns=['id'], errors='ignore')

# shape of dataset: (rows, columns)
print(df.shape)

In [None]:
# Dataset overview: summary statistics produced by DataFrame.describe()
summary_stats = df.describe()
print(summary_stats)



In [None]:
# Draw one histogram per feature to see how its values are distributed
hist_axes = df.hist(figsize=(10, 10))
plt.show()

In [None]:
# Scatter plot matrix of the DataFrame columns.
# It helps judge whether a linear classifier is likely to work well on this
# dataset, and shows the relationships between features.
scatter_matrix(frame=df, figsize=(10, 10))
plt.show()


In [None]:

# Create the X (features) and y (labels) arrays for training
print(df.loc[0])
# `columns=` is used because the positional axis argument of DataFrame.drop
# (the old `df.drop(['class'], 1)`) was removed in pandas 2.0.
X = np.array(df.drop(columns=['class']))
print(X[0])
y = np.array(df['class'])
print(y[0])
# X.shape is (n_rows, n_columns); y.shape is (n_rows,), i.e. a 1-D vector of labels
print(X.shape, y.shape)
# Shuffled 80/20 split into training and test sets.
# random_state is fixed so that Restart & Run All reproduces the same split;
# 8 is the same value the later settings cell assigns to `seed` (not defined yet here).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=8
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)



In [None]:
# Settings used by the cross-validation run
scoring = 'accuracy'  # metric name passed as scoring= to cross_val_score
seed = 8  # fixed (not randomly chosen) seed, passed as random_state for reproducible results

In [None]:
# Define the models to compare
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=5)),  # odd number of neighbours to avoid tied votes
    ('SVM', SVC()),
]

# Evaluate each model with 10-fold cross-validation on the training split
results = []
names = []  # NOTE: rebinds `names`, which held the CSV column labels in the loading cell

# The splitter is loop-invariant, so it is built once and shared by every model.
# shuffle=True is required for random_state to have any effect; scikit-learn >= 0.24
# raises ValueError when random_state is given while shuffle is left at False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in models:
    # Fits the model on each of the 10 train folds and returns one score per fold
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean and standard deviation of the per-fold scores
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)  # cross-validated score computed from the training split only; X_test is untouched
print(results)

In [None]:
# Fit every model on the training split and score it on the held-out test split
for name, model in models:
    # fit() returns the fitted estimator, so training and prediction can be chained
    predictions = model.fit(X_train, y_train).predict(X_test)
    # One print call with newline separators emits the same three lines of output
    print(
        name,
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        sep='\n',
    )

In [None]:
# Train a standalone SVM classifier and report its accuracy on the test split
clf = SVC().fit(X_train, y_train)  # fit() returns the fitted estimator
accuracy = clf.score(X_test, y_test)
print(accuracy)

# A single hand-written sample holding nine feature values
example = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 5]])
print(example.shape)
# Force the layout (n_samples, n_features). The array is already 2-D with one
# row, so this reshape leaves its shape unchanged.
n_samples = example.shape[0]
example = example.reshape(n_samples, -1)
print(example.shape)

# Predict the class label for the sample
prediction = clf.predict(example)
print(prediction)