# Import libraries and set up default plot params

#### Note: this cell uses your current working directory (`cwd`) as the path from which the data are loaded and to which all figures are saved.
#### If you want to load from/save to a different path, edit the `path`.

In [None]:
# Import libraries
import sys
import os

path = os.getcwd()

import matplotlib
import matplotlib.pyplot as plt

import numpy as np

import pandas as pd

import scipy.stats
from scipy.stats import norm, ks_2samp

# Use a 16 pt font for the tick labels on both axes of every figure
matplotlib.rcParams.update({
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

# Read in the data

In [None]:
# Read in the csv file from `path`; os.path.join inserts the separator
# appropriate for the current platform instead of a hard-coded '/'
df = pd.read_csv(os.path.join(path, 'haberman.data.csv'))

## Understand the data

#### The data frame has the following columns:
- AGE   == Age of patient at time of operation  
- YEAR  == Patient's year of operation (year - 1900)  
- NODES == Number of positive axillary nodes detected  
- STATE == Survival status,  
    - 1 == Patient survived 5 years or longer  
    - 2 == Patient died within 5 years

In [None]:
# Show the first 5 rows of the data frame
df.head(n=5)


#### Check the summary statistics (count, mean, stdev, min, quartiles, max) for each column

In [None]:
# Summary statistics for every column; the quartiles are spelled out
# explicitly (they are the values describe() reports by default)
df.describe(percentiles=[0.25, 0.5, 0.75])


#### Find the number of patients in each state, where state 1 means that the patient survived 5 years or longer, and state 2 tells us that the patient died within 5 years

In [None]:
# Number of patients recorded with each survival status
df.loc[:, 'STATE'].value_counts()


## Divide the data into groups of interest

#### Divide the data into two subsets, based on the patient's state

In [None]:
# Boolean masks selecting each survival status
survived_mask = df['STATE'].eq(1)
died_mask = df['STATE'].eq(2)

# One subset per status: STATE == 1 -> survived, STATE == 2 -> died
df_survival_yes = df.loc[survived_mask]
df_survival_no = df.loc[died_mask]


#### Check the summary statistics (count, mean, stdev, min, quartiles, max) for each subset

In [None]:
# Summary statistics for the patients with STATE == 1
summary_yes = df_survival_yes.describe()
summary_yes


In [None]:
# Summary statistics for the patients with STATE == 2
summary_no = df_survival_no.describe()
summary_no


# Data Analysis

#### Compare the distributions of each parameter (age, year, nodes) for each group

In [None]:
fig = plt.figure(figsize=(8,8))

# (data subset, colour, legend-label template) for each patient group, in draw order
groups = [
    (df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]

# Plot the normalised age histograms; both groups share the same 5-year bins
age_bins = np.arange(30,90,5)
for subset, colour, label in groups:
    plt.hist(subset['AGE'], bins=age_bins,
             lw=2, histtype='stepfilled', alpha=0.3, density=True,
             color=colour,
             label=label.format(len(subset)))

# Find the best fit Gaussian for each distribution and overlay its pdf
xmin, xmax = [30,83]
x = np.linspace(xmin, xmax, 100)
for subset, colour, _ in groups:
    mean, std = norm.fit(subset['AGE'])
    plt.plot(x, norm.pdf(x, mean, std), lw=3, color=colour)

# Plot formatting
plt.xlabel('Age of patient at time of surgery',fontsize=20)
plt.ylabel('Relative fraction',fontsize=20)
plt.legend(loc=2, fontsize=14)

plt.xlim(30,83)
plt.ylim(0.000,0.055)

# Save figure
plt.tight_layout()

# savefig() has no `fig` keyword (newer Matplotlib versions raise a TypeError
# for unknown keywords), so save through the Figure object itself
fig.savefig(os.path.join(path, 'haberman_age_survival.pdf'))


In [None]:
fig = plt.figure(figsize=(8,8))

# (data subset, colour, legend-label template) for each patient group, in draw order
groups = [
    (df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]

# Plot the normalised histograms of operation year; one bin per year
year_bins = np.arange(58,70,1)
for subset, colour, label in groups:
    plt.hist(subset['YEAR'], bins=year_bins,
             lw=2, histtype='stepfilled', alpha=0.3, density=True,
             color=colour,
             label=label.format(len(subset)))

# Plot formatting
plt.xlabel('Year of operation',fontsize=20)
plt.ylabel('Relative fraction',fontsize=20)
plt.legend(loc=2, fontsize=14)

plt.ylim(0.00,0.20)

# Save figure
plt.tight_layout()

# savefig() has no `fig` keyword (newer Matplotlib versions raise a TypeError
# for unknown keywords), so save through the Figure object itself
fig.savefig(os.path.join(path, 'haberman_year_survival.pdf'))


In [None]:
fig = plt.figure(figsize=(8,8))

# (data subset, colour, legend-label template) for each patient group, in draw order
groups = [
    (df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]

# Plot the normalised histograms of node counts; both groups share the same bins
node_bins = np.arange(0,55,2)
for subset, colour, label in groups:
    plt.hist(subset['NODES'], bins=node_bins,
             lw=2, histtype='stepfilled', alpha=0.3, density=True,
             color=colour,
             label=label.format(len(subset)))

# Find the best fit Gaussians for each distribution
xmin, xmax = [-55,55]
x = np.linspace(xmin, xmax, 100)

# Want a Gaussian with mu = 0 nodes: fit to the node counts together with
# their negatives, so the sample is symmetric about zero.
# NOTE(review): a zero-mean Gaussian integrates to 0.5 over x >= 0, whereas a
# density histogram integrates to 1 over its bins, so the curves and the
# histograms are not on the same normalisation -- confirm this is intended.
for subset, colour, _ in groups:
    nodes = np.asarray(subset['NODES'])
    mirrored_nodes = np.concatenate([nodes, -nodes])
    mean, std = norm.fit(mirrored_nodes)
    plt.plot(x, norm.pdf(x, mean, std), lw=3, color=colour)

# Plot formatting
plt.xlabel('Number of positive axillary nodes detected',fontsize=20)
plt.ylabel('Relative fraction',fontsize=20)
plt.legend(loc=2, fontsize=14)

# Only the non-negative half of the mirrored fit is shown
plt.xlim(0,55)
plt.ylim(0.00,0.40)

# Save figure
plt.tight_layout()

fig.savefig(os.path.join(path, 'haberman_nodes_survival.pdf'))


#### Combine all of those distributions into a single figure

In [None]:
# (data subset, colour, legend-label template) for each patient group, in draw order
groups = [
    (df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]


def _plot_group_histograms(ax, column, bins):
    """Draw the normalised histogram of `column` for both patient groups on `ax`."""
    for subset, colour, label in groups:
        ax.hist(subset[column], bins=bins,
                lw=2, histtype='stepfilled', alpha=0.3, density=True,
                color=colour,
                label=label.format(len(subset)))


def _plot_gaussian_fit(ax, sample, x, colour):
    """Fit a normal distribution to `sample` and draw its pdf over `x` on `ax`."""
    mean, std = norm.fit(sample)
    ax.plot(x, norm.pdf(x, mean, std), lw=3, color=colour)


fig, [ax1, ax2, ax3] = plt.subplots(1,3, figsize=(23,8))

# --- Panel 1: age at time of surgery, with a Gaussian fit per group ---
_plot_group_histograms(ax1, 'AGE', np.arange(30,90,5))

xmin, xmax = [30,83]
x = np.linspace(xmin, xmax, 100)
for subset, colour, _ in groups:
    _plot_gaussian_fit(ax1, subset['AGE'], x, colour)

ax1.set_xlabel('Age of patient at time of surgery',fontsize=20)
ax1.set_ylabel('Relative fraction',fontsize=20)

ax1.set_xlim(30,84)

# --- Panel 2: number of positive nodes, with a zero-mean Gaussian per group ---
_plot_group_histograms(ax2, 'NODES', np.arange(0,55,2))

xmin, xmax = [-55,55]
x = np.linspace(xmin, xmax, 100)
for subset, colour, _ in groups:
    # Mirror the node counts about zero so the fitted Gaussian has mu = 0
    nodes = np.asarray(subset['NODES'])
    _plot_gaussian_fit(ax2, np.concatenate([nodes, -nodes]), x, colour)

ax2.set_xlabel('Number of positive axillary nodes detected',fontsize=20)
ax2.set_ylabel('Relative fraction',fontsize=20)

# Only the non-negative half of the mirrored fit is shown
ax2.set_xlim(0,54)

# --- Panel 3: year of surgery (histograms only); carries the shared legend ---
_plot_group_histograms(ax3, 'YEAR', np.arange(58,70,1))

ax3.set_xlabel('Year of surgery',fontsize=20)
ax3.set_ylabel('Relative fraction',fontsize=20)

ax3.legend(loc=1, fontsize=14)

ax3.set_xlim(58,69)
ax3.set_ylim(0.00,0.20)

plt.tight_layout()

# savefig() has no `fig` keyword (newer Matplotlib versions raise a TypeError
# for unknown keywords), so save through the Figure object itself
fig.savefig(os.path.join(path, 'haberman_survival.pdf'))

#### And check whether any of the distributions are statistically different (between each subset of patients) using two-sample KS tests

In [None]:
# Two-sample KS test comparing the age distributions of the two groups
ks, p = ks_2samp(df_survival_yes['AGE'], df_survival_no['AGE'])
print('KS two-samples test on age: ', ks, p)

# Report the outcome at the 5% significance level.
# NOTE(review): 'Statstically' is misspelled in the messages; left as-is so the
# printed output is unchanged.
verdict = 'Statstically significant' if p < 0.05 else 'NOT statstically significant'
print(verdict)


In [None]:
# Two-sample KS test on the node counts. Unlike the other KS tests in this
# notebook, this one is one-sided (alternative='greater'), with the survivors
# as the first sample.
ks, p = ks_2samp(
    df_survival_yes['NODES'],
    df_survival_no['NODES'],
    alternative='greater',
)
print('KS two-samples test on number of nodes: ', ks, p)

# Report the outcome at the 5% significance level.
# NOTE(review): 'Statstically' is misspelled in the messages; left as-is so the
# printed output is unchanged.
verdict = 'Statstically significant' if p < 0.05 else 'NOT statstically significant'
print(verdict)


In [None]:
# Two-sample KS test comparing the operation-year distributions of the two groups
ks, p = ks_2samp(df_survival_yes['YEAR'], df_survival_no['YEAR'])
print('KS two-samples test on year of operation: ', ks, p)

# Report the outcome at the 5% significance level.
# NOTE(review): 'Statstically' is misspelled in the messages; left as-is so the
# printed output is unchanged.
verdict = 'Statstically significant' if p < 0.05 else 'NOT statstically significant'
print(verdict)
    

#### Make the cumulative distribution function (CDF) for the number of positive axillary nodes...

In [None]:
fig = plt.figure(figsize=(8,8))

# Settings shared by both cumulative histograms: one bin per node count,
# normalised and accumulated so the bar heights form an empirical CDF
cdf_style = dict(bins=np.arange(0,55,1),
                 lw=2, histtype='step', density=True, cumulative=True)

# Draw each CDF and keep the cumulative heights and bin edges for later lookup
counts_no, bin_edges_no, _ = plt.hist(
    df_survival_no['NODES'],
    color='darkorange',
    label='Patient died within 5 years of surgery, n={}'.format(len(df_survival_no)),
    **cdf_style)

counts_yes, bin_edges_yes, _ = plt.hist(
    df_survival_yes['NODES'],
    color='dodgerblue',
    label='Patient survived 5+ years after surgery, n={}'.format(len(df_survival_yes)),
    **cdf_style)

# Axis labels and legend.
# NOTE(review): 'Cummulative' is misspelled in the y label; left as-is so the
# saved figure is unchanged.
plt.xlabel('Number of positive axillary nodes detected',fontsize=20)
plt.ylabel('Cummulative fraction',fontsize=20)
plt.legend(loc=2, fontsize=14)

# Leave headroom above 1.0 so the legend does not sit on the curves
plt.ylim(0.00,1.20)

# Write the figure to disk
plt.tight_layout()

cdf_file = path + '/' + 'haberman_nodes_survival_CDF.pdf'
plt.savefig(cdf_file)


#### And use the CDF to determine the fraction of each group of patients that had fewer than 5 nodes

In [None]:
n_nodes = 5

# The cumulative histograms use unit-width bins starting at 0, so the entry at
# index n_nodes-1 is the cumulative fraction up to and including n_nodes-1 nodes
last_bin = n_nodes - 1
percent_yes = counts_yes[last_bin]*100.0
percent_no = counts_no[last_bin]*100.0

print('{:.2f}% of patients who survived 5 years or longer after the surgery had fewer than {} nodes.'.format(percent_yes, n_nodes))
print('But, {:.2f}% of patients who died within 5 years of the surgery ALSO had fewer than {} nodes.'.format(percent_no, n_nodes))


#### Check for any obvious trends between parameters

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12,8), sharex=True, sharey=True)

# One panel per patient group: (axes, data subset, colour, caption template)
panels = [
    (ax[0], df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (ax[1], df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]

# Scatter nodes against age in each panel; the caption doubles as label and title
for panel, subset, colour, template in panels:
    caption = template.format(len(subset))
    panel.scatter(subset['AGE'], subset['NODES'], color=colour,
                  label=caption)
    panel.set_xlabel('Age of patient at time of surgery',fontsize=20)
    panel.set_title(caption)

# The y axis is shared, so only the left panel carries its label
ax[0].set_ylabel('Number of positive axillary nodes detected',fontsize=20)

# Write the figure to disk
plt.tight_layout()

out_file = path + '/' + 'haberman_nodes_v_age_survival.pdf'
plt.savefig(out_file)


In [None]:
fig, ax = plt.subplots(1,2, figsize=(12,8), sharex=True, sharey=True)

# One panel per patient group: (axes, data subset, colour, caption template)
panels = [
    (ax[0], df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (ax[1], df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]

# Scatter nodes against operation year in each panel; the caption doubles as label and title
for panel, subset, colour, template in panels:
    caption = template.format(len(subset))
    panel.scatter(subset['YEAR'], subset['NODES'], color=colour,
                  label=caption)
    panel.set_xlabel('Year of surgery',fontsize=20)
    panel.set_title(caption)

# The y axis is shared, so only the left panel carries its label
ax[0].set_ylabel('Number of positive axillary nodes detected',fontsize=20)

# Write the figure to disk
plt.tight_layout()

out_file = path + '/' + 'haberman_nodes_v_year_survival.pdf'
plt.savefig(out_file)


In [None]:
fig, ax = plt.subplots(1,2, figsize=(12,8), sharex=True, sharey=True)

# One panel per patient group: (axes, data subset, colour, caption template)
panels = [
    (ax[0], df_survival_yes, 'dodgerblue', 'Patient survived 5+ years after surgery, n={}'),
    (ax[1], df_survival_no, 'darkorange', 'Patient died within 5 years of surgery, n={}'),
]

# Scatter operation year against age in each panel; the caption doubles as label and title
for panel, subset, colour, template in panels:
    caption = template.format(len(subset))
    panel.scatter(subset['AGE'], subset['YEAR'], color=colour,
                  label=caption)
    panel.set_xlabel('Age of patient at time of surgery',fontsize=20)
    panel.set_title(caption)

# The y axis is shared, so only the left panel carries its label
ax[0].set_ylabel('Year of surgery',fontsize=20)

# Write the figure to disk
plt.tight_layout()

out_file = path + '/' + 'haberman_year_v_age_survival.pdf'
plt.savefig(out_file)
