## Import the relevant packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

In [None]:
# Read the CSV into a dataframe.
# header=None is passed, so no row of the file is used as a header;
# the column names below are supplied explicitly instead.
header_names = [
    'sample_code_number',
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitosis',
    'class',
]
data_df = pd.read_csv(
    '../data/breast-cancer-wisconsin - Copy.csv',
    header=None,
    names=header_names,
)



## Data wrangling

In [None]:
# Flag the rows whose bare_nuclei entry is not one of the expected integer codes 1-10,
# and report how they are distributed over the two tumor classes.
# TODO: a regular expression could also be used to determine which entries are integers.
expected_bare_nuclei_values = [str(value) for value in range(1, 11)]

# Compare on the string form so the mask stays correct whether the column still holds
# raw strings or has already been converted to integers (e.g. when this cell is re-run).
indices_with_unexpected_values_in_bare_nuclei = ~data_df.bare_nuclei.astype(str).isin(expected_bare_nuclei_values)

# Reuse the mask instead of recomputing it.
rows_with_unexpected_values_in_bare_nuclei = data_df[indices_with_unexpected_values_in_bare_nuclei]

print(rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']])

population_by_tumor_class = data_df['class'].value_counts()

# Reindex on the classes present in the data so that a class with no unexpected
# entries is reported as 0 instead of raising a KeyError in the lookups below.
missing_by_tumor_class = (
    rows_with_unexpected_values_in_bare_nuclei[['bare_nuclei', 'class']]
    .groupby('class')
    .count()
    .reindex(population_by_tumor_class.index, fill_value=0)
)

# Class: (2 for benign, 4 for malignant)
percentage_missing_in_benign = 100*missing_by_tumor_class.loc[2,'bare_nuclei']/population_by_tumor_class[2]
percentage_missing_in_malignant = 100*missing_by_tumor_class.loc[4,'bare_nuclei']/population_by_tumor_class[4]

print("Percentage of missing bare_nuclei in benign tumors: {0:2.2f}%".format( percentage_missing_in_benign))
print("Percentage of missing bare_nuclei in malignant tumors: {0:2.2f}%".format( percentage_missing_in_malignant))

# The flagged rows are intentionally not dropped in this cell: the drop and the
# int64 conversion are performed once, in the following cell.

Since the rows with missing `bare_nuclei` values are only a small portion of the dataset, it is safe to drop them and convert `bare_nuclei` to an integer type.

In [None]:
# Drop the rows flagged as having an unexpected bare_nuclei entry.
# The mask is aligned to the frame's current index first, so the selection is
# also valid when the flagged rows have already been removed from data_df.
rows_to_keep = ~indices_with_unexpected_values_in_bare_nuclei.reindex(data_df.index, fill_value=True)

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
data_df = data_df.loc[rows_to_keep].copy()

data_df['bare_nuclei'] = data_df['bare_nuclei'].astype('int64')
data_df.info()

In [None]:
# Scatter-plot matrix over the listed columns; the figure is written to
# scatter_matrix.png before it is shown.
attributes = [
    'sample_code_number',
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitosis',
    'class',
]
pd.plotting.scatter_matrix(
    data_df.loc[:, attributes],
    figsize=(20, 20),
    range_padding=0.1,
)

plt.savefig('scatter_matrix.png')
plt.show()

In [None]:
# Pairwise correlations between the columns of the cleaned data;
# the bare expression on the last line displays the matrix as the cell output.
correlation_matrix = data_df.corr()
correlation_matrix

In [None]:
# Heatmap of the correlation matrix, drawn with a diverging colormap centered at 0.
diverging_cmap = sns.diverging_palette(
    220,
    10,
    as_cmap=True,
)

sns.heatmap(
    correlation_matrix,
    center=0,
    cmap=diverging_cmap,
)

In [None]:
# Bar chart of the 'class' column of the correlation matrix,
# with the tick labels rotated so they do not overlap.
class_correlations = correlation_matrix['class']
class_correlations.plot.bar()
plt.xticks(rotation=60)
plt.show()

## Observations

We can see an extremely strong correlation between 'uniformity_cell_size' and 'class'. Similarly, 'uniformity_cell_shape' and 'bare_nuclei' are very strongly correlated with 'class'.

However, 'uniformity_cell_size' and 'uniformity_cell_shape' are very strongly correlated as well. We will have to be careful when using machine learning algorithms on these highly correlated features.

## Hypothesis testing

We will test whether there is a significant difference in the mean 'uniformity_cell_size' between benign and malignant samples.

Let's define

$\mu_{UCS}^{benign}$: mean 'uniformity_cell_size' for benign samples

$\mu_{UCS}^{malignant}$: mean 'uniformity_cell_size' for malignant samples

and test the following hypotheses

$H_0$: $\mu_{UCS}^{benign} = \mu_{UCS}^{malignant}$

$H_a$: $\mu_{UCS}^{benign} \neq \mu_{UCS}^{malignant}$

In [None]:
# Split uniformity_cell_size into the two populations by class label
# (2 = benign, 4 = malignant).
is_benign = data_df['class'] == 2
is_malignant = data_df['class'] == 4

benign_UCS = data_df.loc[is_benign, 'uniformity_cell_size']
malignant_UCS = data_df.loc[is_malignant, 'uniformity_cell_size']


In [None]:
# Display the uniformity_cell_size values of the benign samples for inspection.
benign_UCS

In [None]:
# Display the uniformity_cell_size values of the malignant samples for inspection.
malignant_UCS

In [None]:
# Define bootstrapping functions

def bootstrap_replicate_1d(data, func, rng=None):
    """Compute a single bootstrap replicate of a statistic.

    A sample of the same length as `data` is drawn from `data` (with the
    sampler's default settings) and `func` is applied to that sample.

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample.
    func : callable
        Statistic to evaluate on the resampled data (e.g. ``np.mean``).
    rng : numpy.random.Generator, optional
        Random generator used for the resampling. When omitted, the global
        ``np.random`` state is used, exactly as before this parameter existed.

    Returns
    -------
    The value returned by ``func`` for the resampled data.
    """
    choose = np.random.choice if rng is None else rng.choice
    return func(choose(data, size=len(data)))

def draw_bs_reps(data, func, size=1, rng=None):
    """Draw bootstrap replicates.

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample.
    func : callable
        Statistic to evaluate on each resample; must return a scalar, since
        the results are stored in a float array.
    size : int, optional
        Number of replicates to draw (default 1).
    rng : numpy.random.Generator, optional
        Random generator forwarded to `bootstrap_replicate_1d`; pass a seeded
        generator to make the replicates reproducible.

    Returns
    -------
    numpy.ndarray
        Array of length `size` holding one replicate per entry.
    """
    # Convert the input once here so the conversion is not repeated per replicate.
    values = np.asarray(data)

    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(values, func, rng=rng)

    return bs_replicates


In [None]:
# Mean uniformity_cell_size over all samples and within each class.
mean_UCS = data_df['uniformity_cell_size'].mean()
mean_UCS_benign = benign_UCS.mean()
mean_UCS_malignant = malignant_UCS.mean()

# Printed in the order: overall, benign, malignant.
for mean_value in (mean_UCS, mean_UCS_benign, mean_UCS_malignant):
    print(mean_value)

In [None]:
# Under the null hypothesis both classes share one mean, so each population is
# re-centered: its own mean is subtracted and the overall mean is added.
shifted_benign_UCS = benign_UCS.sub(mean_UCS_benign).add(mean_UCS)
shifted_malignant_UCS = malignant_UCS.sub(mean_UCS_malignant).add(mean_UCS)

In [None]:
# Draw bootstrap replicates of the mean from each shifted population.
# NOTE: draw_bs_reps loops once per replicate, so this cell is slow with this
# many replicates; lower n_replicates for a quick run.
n_replicates = 1000000
benign_UCS_replicates = draw_bs_reps(shifted_benign_UCS, np.mean, size=n_replicates)
malignant_UCS_replicates = draw_bs_reps(shifted_malignant_UCS, np.mean, size=n_replicates)

diff_of_means_bs_replicates = benign_UCS_replicates - malignant_UCS_replicates

# Two-sided p-value: the fraction of replicates whose absolute difference of means
# is at least as large as the observed absolute difference. The observed difference
# is taken in absolute value as well, so the comparison does not depend on which
# class happens to have the larger mean.
observed_abs_diff_of_means = np.abs(mean_UCS_malignant - mean_UCS_benign)
p_value = np.mean(np.abs(diff_of_means_bs_replicates) >= observed_abs_diff_of_means)
print(p_value)

### Conclusion

Since p_value < 1e-6, we can reject the null hypothesis.

There is strong statistical evidence that the mean uniformity of cell size is different for benign versus malignant samples. 

There is also a large practical difference (6.5-1.3 = 5.2 on a scale of 10) in the uniformity of cell size between benign and malignant samples.

### Feature Selection
Features will be selected using the SelectKBest function, fitted on the training data only.

In [None]:
# Separate the predictors from the target column.
# NOTE(review): X still contains 'sample_code_number', which looks like a sample
# identifier rather than a measurement - confirm whether it should be a feature.
X = data_df.drop(['class'], axis=1)
y = data_df['class']

# Split the data into a training and test set (25% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,  random_state=42)

# The test target was originally named `ytest`; keep that name as an alias so
# any code that still refers to it continues to work.
ytest = y_test

In [None]:
# Score every feature against the class label with the chi-squared test,
# fitting on the training split only. k="all" keeps all features.
best_features = SelectKBest(score_func=chi2, k="all")
best_features.fit(X_train, y_train)

# Build the summary table directly with the feature names as its index, so the
# stored frame (not only the displayed copy) is labelled by feature.
df_best_features = pd.DataFrame(
    {'Score': best_features.scores_, 'p-value': best_features.pvalues_},
    index=X_train.columns,
)
df_best_features

### Conclusion

All the features show a statistically significant association with the tissue classification. Therefore, all the features will be used in predictive modeling.