## Import the relevant packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

In [None]:
# Load the raw CSV into a dataframe. header=None tells pandas that the file
# has no header row, so the column names are supplied explicitly.

header_names = [
    'sample_code_number',
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitosis',
    'class',
]
data_df = pd.read_csv(
    'breast-cancer-wisconsin - Copy.csv',
    header=None,
    names=header_names,
)


In [None]:
# First look at the data: column dtypes and non-null counts, followed by a
# preview of the first five rows.

data_df.info()
data_df.head(n=5)

We notice above that bare_nuclei is stored as an object instead of the expected integer type. Let's determine which unexpected values are preventing the column from being stored as integers.

In [None]:
# Flag every bare_nuclei entry that is not one of the expected string values '1'..'10'.
# TODO: as covered in the lecture, a regular expression could be used instead
# of an explicit list to decide which entries are integers.
expected_bare_nuclei_values = [str(value) for value in range(1, 11)]
indices_with_unexpected_values_in_bare_nuclei = ~data_df.bare_nuclei.isin(expected_bare_nuclei_values)

# Reuse the mask computed above instead of evaluating the same isin() expression twice.
rows_with_unexpected_values_in_bare_nuclei = data_df[indices_with_unexpected_values_in_bare_nuclei]
print(rows_with_unexpected_values_in_bare_nuclei.bare_nuclei)

# Count only the entries that really are '?', so the message stays accurate
# even if some other unexpected value is present in the column.
number_of_question_marks = (rows_with_unexpected_values_in_bare_nuclei.bare_nuclei == '?').sum()
print("{} values are stored as '?'".format(number_of_question_marks))

We observe that 16 records of bare_nuclei are stored as '?', i.e. their values are missing, which we did not expect. These missing values make up only a small percentage of both the benign and the malignant tumors (the percentages are computed in the next cell). Therefore, we can drop these records without significant impact on the solution of our classification problem.

In [None]:
# Class: (2 for benign, 4 for malignant)
print(rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']])

# Number of records with an unexpected bare_nuclei value, per tumor class.
missing_by_tumor_class = rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']].groupby('class').count()
print(missing_by_tumor_class)

# Total number of records per tumor class. This name was used below without
# ever being defined, so the cell raised NameError on a fresh kernel.
# NOTE: this is computed from data_df as it stands when the cell runs, so run
# it before the rows with missing values are dropped from data_df.
population_by_tumor_class = data_df['class'].value_counts()

percentage_missing_in_benign = 100*missing_by_tumor_class.loc[2, 'bare_nuclei']/population_by_tumor_class.loc[2]
percentage_missing_in_malignant = 100*missing_by_tumor_class.loc[4, 'bare_nuclei']/population_by_tumor_class.loc[4]

print("Percentage of missing bare_nuclei in benign tumors: {0:2.2f}%".format( percentage_missing_in_benign))
print("Percentage of missing bare_nuclei in malignant tumors: {0:2.2f}%".format( percentage_missing_in_malignant))


Drop the rows with missing data and change the type of bare_nuclei to int. Changing the tumor class to a category type is also planned, but it is not performed in the cell below.

In [None]:
# Keep only the rows whose bare_nuclei value was recognised. The explicit
# .copy() makes the result an independent dataframe, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
data_df = data_df[~indices_with_unexpected_values_in_bare_nuclei].copy()

# With the unexpected entries removed, the remaining strings can be converted to integers.
data_df['bare_nuclei'] = data_df['bare_nuclei'].astype('int64')
data_df.info()

Look for outliers

In [None]:
# Summary statistics, followed by one box plot per listed column
# (every column except sample_code_number).
print(data_df.describe())
data_df.boxplot(
    column=[
        'clump_thickness',
        'uniformity_cell_size',
        'uniformity_cell_shape',
        'marginal_adhesion',
        'cell_size',
        'bare_nuclei',
        'bland_chromatin',
        'normal_nucleoli',
        'mitosis',
        'class',
    ]
)
# Rotate the tick labels so the long column names do not overlap.
plt.xticks(rotation=60)

It can be seen that most of the attributes have reasonable spread. Interestingly, the median value is closer to the lower end of the range for all the attributes. 

Only the mitosis attribute appears to have a large number of outliers. In fact, its interquartile range (IQR) is limited to the value 1. After looking into the original data description, this attribute is determined by classifying how infrequently cell mitosis is occurring on a scale of 1 to 10. At this point there is no reason to think that these outliers are invalid.

In [None]:
# Histogram of the mitosis values
data_df['mitosis'].plot.hist()

In [None]:

# One bar-chart panel per column; the figure is written to disk before being shown.
data_df.plot.bar(subplots=True, figsize=(16, 16))
plt.savefig('data_plot.jpg')
plt.show()


In [None]:
# Same per-column bar-chart grid as the previous cell, shown without saving to disk.
bar_plot_options = dict(kind='bar', subplots=True, figsize=(16, 16))
data_df.plot(**bar_plot_options)
plt.show()