## Import the relevant packages

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

In [7]:
# Read the CSV into a dataframe. The file has no header row, so the column
# names are supplied explicitly.

header_names = ['sample_code_number', 'clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape', 'marginal_adhesion', 'cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitosis', 'class']
data_df = pd.read_csv('breast-cancer-wisconsin - Copy.csv', header = None, names = header_names)

# Any bare_nuclei entry that is not one of the grades 1-10 is treated as
# missing (the recorded output shows these entries are '?').
# The column is cast to str first so the check also works if pandas parsed it
# as integers (i.e. when the file contains no '?' placeholders).
# Looking back at lecture, a regular expression could also be used to decide
# which entries are integers. Give this a try later.
valid_bare_nuclei_values = [str(grade) for grade in range(1, 11)]
indices_with_unexpected_values_in_bare_nuclei = ~data_df['bare_nuclei'].astype(str).isin(valid_bare_nuclei_values)

# Reuse the mask instead of recomputing it.
rows_with_unexpected_values_in_bare_nuclei = data_df[indices_with_unexpected_values_in_bare_nuclei]

print(rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']])

# Class: (2 for benign, 4 for malignant)
population_by_tumor_class = data_df['class'].value_counts()

# Reindex against every class present in the data so that a class without any
# missing rows is reported as 0 instead of raising a KeyError below.
missing_by_tumor_class = (
    rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']]
    .groupby('class')
    .count()
    .reindex(population_by_tumor_class.index, fill_value=0)
)

percentage_missing_in_benign = 100*missing_by_tumor_class.loc[2,'bare_nuclei']/population_by_tumor_class.loc[2]
percentage_missing_in_malignant = 100*missing_by_tumor_class.loc[4,'bare_nuclei']/population_by_tumor_class.loc[4]

print("Percentage of missing bare_nuclei in benign tumors: {0:2.2f}%".format( percentage_missing_in_benign))
print("Percentage of missing bare_nuclei in malignant tumors: {0:2.2f}%".format( percentage_missing_in_malignant))

# Drop the rows with missing bare_nuclei. Taking an explicit copy makes the
# result an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
data_df = data_df[~indices_with_unexpected_values_in_bare_nuclei].copy()

data_df['bare_nuclei'] = data_df['bare_nuclei'].astype('int64')

    bare_nuclei  class
23            ?      4
40            ?      2
139           ?      2
145           ?      2
158           ?      2
164           ?      2
235           ?      2
249           ?      2
275           ?      2
292           ?      4
294           ?      2
297           ?      2
315           ?      2
321           ?      2
411           ?      2
617           ?      2
Percentage of missing bare_nuclei in benign tumors: 3.06%
Percentage of missing bare_nuclei in malignant tumors: 0.83%


Drop the missing data and change the type of bare_nuclei to int. 

In [8]:
# The mask was built on the full, unfiltered frame. Align it to the current
# index first so this cell is safe to run (or re-run) after rows have already
# been dropped.
rows_to_keep = ~indices_with_unexpected_values_in_bare_nuclei.reindex(data_df.index, fill_value=False)

# .copy() yields an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning.
data_df = data_df.loc[rows_to_keep].copy()

data_df['bare_nuclei'] = data_df['bare_nuclei'].astype('int64')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
sample_code_number       683 non-null int64
clump_thickness          683 non-null int64
uniformity_cell_size     683 non-null int64
uniformity_cell_shape    683 non-null int64
marginal_adhesion        683 non-null int64
cell_size                683 non-null int64
bare_nuclei              683 non-null int64
bland_chromatin          683 non-null int64
normal_nucleoli          683 non-null int64
mitosis                  683 non-null int64
class                    683 non-null int64
dtypes: int64(11)
memory usage: 64.0 KB


  """Entry point for launching an IPython kernel.


In [None]:

# One bar-chart panel per column of data_df, with one bar per row.
# Saving to 'data_plot.jpg' stays disabled: the recorded output shows
# plt.savefig failing with "Format 'jpg' is not supported".
data_df.plot.bar(subplots=True, figsize=(16, 16))
plt.show()


ValueError: Format 'jpg' is not supported (supported formats: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz)

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for every column.
attributes = ['sample_code_number', 'clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape', 'marginal_adhesion', 'cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitosis', 'class']
scatter_axes = pd.plotting.scatter_matrix(data_df[attributes], figsize=(32,32), range_padding=0.1)

# Save as PNG rather than JPG: the recorded ValueError shows that 'jpg' is not
# a supported output format in this environment, while 'png' is in the
# supported list.
plt.savefig('scatter_matrix.png')

# TODO: rotate the axis labels. plt.xlabels does not exist; the rotation has to
# be set on each Axes in scatter_axes instead.
plt.show()

In [None]:
# Pairwise Pearson correlation (the pandas default) between all columns.
# The bare name on the last line renders the matrix as the cell output.
correlation_matrix = data_df.corr(method='pearson')
correlation_matrix

In [None]:
# Diverging colormap between hues 220 and 10; center=0 anchors the middle of
# the colormap at zero correlation.
diverging_cmap = sns.diverging_palette(h_neg=220, h_pos=10, as_cmap=True)
sns.heatmap(data=correlation_matrix, center=0, cmap=diverging_cmap)

In [None]:
# Correlation of every column with the tumor class, drawn as a bar chart.
class_correlations = correlation_matrix['class']
class_correlations.plot.bar()
plt.xticks(rotation=60)  # tilt the tick labels by 60 degrees
plt.show()

In [None]:
# uniformity_cell_size split by tumor class: class 2 rows go to benign_UCS,
# class 4 rows go to malignant_UCS.
benign_UCS = data_df.loc[data_df['class'] == 2, 'uniformity_cell_size']
malignant_UCS = data_df.loc[data_df['class'] == 4, 'uniformity_cell_size']


In [None]:
# Display the uniformity_cell_size values of the class-2 rows.
benign_UCS

In [None]:
# Display the uniformity_cell_size values of the class-4 rows.
malignant_UCS

In [None]:
def bootstrap_replicate_1d(data, func):
    """Compute one bootstrap replicate of a statistic.

    Resamples `data` with replacement (same length as `data`) using NumPy's
    global random state and applies `func` to the resample.

    Parameters:
        data: one-dimensional array-like of observations.
        func: callable that reduces an array to a single statistic.

    Returns:
        Whatever `func` returns for the resampled data.
    """
    resample = np.random.choice(data, size=len(data))
    return func(resample)

def draw_bs_reps(data, func, size=1):
    """Draw `size` bootstrap replicates of `func` applied to `data`.

    Parameters:
        data: one-dimensional array-like of observations.
        func: callable that reduces an array to a single statistic.
        size: number of replicates to generate (default 1).

    Returns:
        A float NumPy array of length `size`, one replicate per element.
    """
    # Replicates are produced lazily, one resample at a time, and collected
    # straight into a float array.
    replicate_stream = (bootstrap_replicate_1d(data, func) for _ in range(size))
    return np.fromiter(replicate_stream, dtype=float)


In [None]:
# Bootstrap hypothesis test.
# Null hypothesis: benign and malignant tumors have the same mean
# uniformity_cell_size.

# Seed NumPy's global generator so the p-value is reproducible on re-run.
np.random.seed(42)

# Pooled mean over both classes, and the observed absolute difference of means.
mean_UCS = np.mean(np.concatenate((benign_UCS, malignant_UCS)))
empirical_difference_benign_malignant = np.abs(np.mean(malignant_UCS) - np.mean(benign_UCS))

# Shift each group so that both have the pooled mean, which simulates the
# null hypothesis while keeping each group's own spread.
shifted_benign_UCS = benign_UCS - np.mean(benign_UCS) + mean_UCS
shifted_malignant_UCS = malignant_UCS - np.mean(malignant_UCS) + mean_UCS

# draw bootstrap replicates of the mean for each shifted group
bs_replicates_benign = draw_bs_reps(shifted_benign_UCS, np.mean, size=10000)
bs_replicates_malignant = draw_bs_reps(shifted_malignant_UCS, np.mean, size=10000)
diff_of_means_bs_replicates = bs_replicates_malignant - bs_replicates_benign

# compute p-value: fraction of replicates whose absolute difference is at
# least as large as the observed one (two-sided)
p_value = np.mean(np.abs(diff_of_means_bs_replicates) >= empirical_difference_benign_malignant)
print(p_value)



2.2.1 Features
The system that they used is called Xcyt, which was written by one of the coauthors
in his Ph.D. dissertation [Street, 1994]. A fine needle aspirate is taken directly
from a lump in a patient’s breast. The extracted fluid is then stained to emphasize the
nuclei of the cells in the fluid. Then, a digital image of the fluid is taken.
In the previous two papers that used mammograms for diagnosis, the authors
simply computed two features from the digital images. In this paper, the authors
computed 30 features from each image, derived from ten base measurements: “area, radius, perimeter,
symmetry, number and size of concavities, fractal dimension (of the boundary),
compactness, smoothness (local variation of radial segments), and texture (variance of
gray levels inside the boundary)” [Mangasarian]. For each of these ten features, the
authors calculated the mean value, extreme value, and standard error, totaling 30 features.
These 30 features will serve as input to the diagnosis tool.
