In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

# Read the CSV into a dataframe. header=None is passed, so the column names
# are supplied explicitly instead of being taken from the first row.
header_names = [
    'sample_code_number', 'clump_thickness', 'uniformity_cell_size',
    'uniformity_cell_shape', 'marginal_adhesion', 'cell_size', 'bare_nuclei',
    'bland_chromatin', 'normal_nucleoli', 'mitosis', 'class',
]
data_df = pd.read_csv('breast-cancer-wisconsin - Copy.csv', header=None, names=header_names)

# bare_nuclei is compared against the strings '1'..'10'; any other entry is
# treated as an unexpected (missing) value.
# NOTE(review): this presumes the column is read as strings - confirm.
# TODO: a regular expression could also be used to decide which entries are integers.
expected_bare_nuclei_values = [str(value) for value in range(1, 11)]

# Build the boolean mask once and reuse it for both the report and the filter.
indices_with_unexpected_values_in_bare_nuclei = ~data_df.bare_nuclei.isin(expected_bare_nuclei_values)
rows_with_unexpected_values_in_bare_nuclei = data_df[indices_with_unexpected_values_in_bare_nuclei]

print(rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']])

# Class: (2 for benign, 4 for malignant)
# Count the unexpected entries per class and compare with each class's total size.
missing_by_tumor_class = rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']].groupby('class').count()
population_by_tumor_class = data_df['class'].value_counts()

percentage_missing_in_benign = 100 * missing_by_tumor_class.loc[2, 'bare_nuclei'] / population_by_tumor_class[2]
percentage_missing_in_malignant = 100 * missing_by_tumor_class.loc[4, 'bare_nuclei'] / population_by_tumor_class[4]

print("Percentage of missing bare_nuclei in benign tumors: {0:2.2f}%".format( percentage_missing_in_benign))
print("Percentage of missing bare_nuclei in malignant tumors: {0:2.2f}%".format( percentage_missing_in_malignant))

# Drop the rows with unexpected bare_nuclei values and convert the column to
# integers. DataFrame.astype returns a new frame, so nothing is assigned into
# a slice of the original (which would raise SettingWithCopyWarning).
data_df = data_df[~indices_with_unexpected_values_in_bare_nuclei].astype({'bare_nuclei': 'int64'})



    bare_nuclei  class
23            ?      4
40            ?      2
139           ?      2
145           ?      2
158           ?      2
164           ?      2
235           ?      2
249           ?      2
275           ?      2
292           ?      4
294           ?      2
297           ?      2
315           ?      2
321           ?      2
411           ?      2
617           ?      2
Percentage of missing bare_nuclei in benign tumors: 3.06%
Percentage of missing bare_nuclei in malignant tumors: 0.83%


In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [8]:
# Feature matrix: every column except the target and the sample identifier.
# NOTE(review): sample_code_number looks like a record ID rather than a
# measurement, so it is excluded from the features - confirm against the
# dataset description.
X = data_df.drop(['class', 'sample_code_number'], axis=1)

# Target vector: the tumor class label.
y = data_df['class']

In [10]:


# Split the data into a training and test set: 25% is held out for testing,
# and random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Backward-compatible alias: the test labels were previously named `ytest`,
# and other cells may still refer to that name.
ytest = y_test

In [None]:



# Linear-kernel support vector classifier tuned with a 5-fold cross-validated
# grid search over the regularisation strength C.
clf = SVC(kernel='linear')

# Only C is searched: gamma is a coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels and has no effect on a linear kernel, so including it
# only repeated identical fits.
parameters = {'C': [0.01, 1, 100]}
grid = GridSearchCV(estimator=clf, param_grid=parameters, cv=5)

grid.fit(X_train, y_train)

# Report the selected hyper-parameters, the mean cross-validated score of the
# best candidate, the scorer used, and the refitted best estimator.
print(grid.best_params_)
print(grid.best_score_)
print(grid.scorer_)
print(grid.best_estimator_)

# Score of the refitted best estimator on the held-out test set.
print(grid.score(X_test, ytest))