In [20]:
%matplotlib notebook

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [22]:
# Read both raw inputs in one pass: the per-timepoint trial measurements first,
# then the table that maps each mouse to the drug it received.
trial_data_df, mouse_data_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/clinicaltrial_data.csv", "data/mouse_drug_data.csv")
)

In [23]:
# Attach each mouse's drug assignment to its trial measurements by joining the
# two frames on the shared "Mouse ID" column (pandas' default inner join).
combined_clinical_data_df = trial_data_df.merge(mouse_data_df, on="Mouse ID")

In [25]:
def pivotDF(df, index, cols, vals):
    """Return a pivoted copy of ``df`` for the time series analysis.

    The existing index is first flattened back into ordinary columns so that
    its levels can be referenced by name, then the frame is reshaped so that:

    * ``index`` supplies the row labels (the timepoints in this notebook),
    * ``cols`` supplies the column labels (the individual drugs),
    * ``vals`` supplies the cell values (the mean / SEM / count figures).

    The frame passed in is left untouched; a new DataFrame is returned.
    """
    flattened = df.reset_index()
    return flattened.pivot(index=index, columns=cols, values=vals)

In [26]:
# Group once per (drug, timepoint) and keep only the tumor-volume column.
# The column is selected BEFORE aggregating so that mean()/sem() never see the
# frame's other columns: on pandas >= 2.0 those reductions raise a TypeError
# when a non-numeric column is present (presumably "Mouse ID" holds string IDs).
tumor_volume_groups = combined_clinical_data_df.groupby(["Drug", "Timepoint"])[["Tumor Volume (mm3)"]]

# mean tumor volume, pivoted to Timepoint rows x Drug columns for time series analysis
mean_tumor_volume_df = pivotDF(df=tumor_volume_groups.mean(),
                               index="Timepoint",
                               cols="Drug",
                               vals="Tumor Volume (mm3)")

# standard error of the mean (SEM) of tumor volume, pivoted the same way
sem_tumor_volume_df = pivotDF(df=tumor_volume_groups.sem(),
                              index="Timepoint",
                              cols="Drug",
                              vals="Tumor Volume (mm3)")

In [27]:
def chartTrialOutcomes(val_df, sem_df, title, xlabel, ylabel, file,
                       xlim_offset=(0, 0), ylim_offset=(0, 0),
                       legend_loc="best",
                       columns_to_chart=None, colors=None, markers=None):
    """Draw an error-bar line chart of selected columns and save it to ``file``.

    This function is called for each of the output charts. Using one function
    instead of charting each one individually keeps the default formatting of
    the major chart features in one place, while still allowing some options
    to be overridden per chart.

    Parameters
    ----------
    val_df : DataFrame
        Values to plot. Its index is used as the x axis and each charted
        column becomes one line.
    sem_df : DataFrame
        Error-bar sizes; must provide the same columns as ``val_df`` for every
        charted column.
    title, xlabel, ylabel : str
        Chart title and axis labels.
    file : str, path-like or file-like
        Destination passed to ``Figure.savefig``. When it is a path, its
        parent directory is created if it does not exist yet.
    xlim_offset : sequence of two numbers, optional
        Added to the (min, max) of the x values to form the x-axis limits.
    ylim_offset : sequence of two numbers, optional
        Added to the (min, max) of the automatically chosen y ticks to form
        the y-axis limits.
    legend_loc : str, optional
        Legend location; defaults to "best".
    columns_to_chart : sequence of str, optional
        Columns to draw. Defaults to Capomulin, Infubinol, Ketapril, Placebo.
    colors, markers : sequence of str, optional
        Line colors / markers, matched to ``columns_to_chart`` by position and
        reused cyclically if fewer are given than there are columns.

    Returns
    -------
    None
    """
    import os  # local import: only needed to prepare the output directory

    # Defaults are resolved here (not in the signature) so no mutable object
    # is shared between calls.
    if columns_to_chart is None:
        columns_to_chart = ["Capomulin", "Infubinol", "Ketapril", "Placebo"]
    if colors is None:
        colors = ["red", "blue", "green", "black"]
    if markers is None:
        markers = ["o", "^", "s", "D"]

    fig, ax = plt.subplots()
    x_axis = val_df.index

    # create a line plot with error bars for each of the named columns
    for i, col in enumerate(columns_to_chart):
        ax.errorbar(x=x_axis,
                    y=val_df[col],
                    yerr=sem_df[col],
                    c=colors[i % len(colors)],
                    marker=markers[i % len(markers)],
                    ls="--",
                    lw=0.25,
                    label=col)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # set x-axis limits from the data range plus the requested offsets
    xmin = min(x_axis) + xlim_offset[0]
    xmax = max(x_axis) + xlim_offset[1]
    ax.set_xlim(xmin, xmax)

    # set y-axis limits from the auto-generated ticks plus the requested offsets
    yticks = ax.get_yticks()
    ymin = min(yticks) + ylim_offset[0]
    ymax = max(yticks) + ylim_offset[1]
    ax.set_ylim(ymin, ymax)

    # faint horizontal guide line at every y tick, spanning the visible x range
    ax.hlines(y=yticks, xmin=xmin, xmax=xmax, alpha=0.05)

    # set legend - this defaults to "best"
    ax.legend(loc=legend_loc)

    # Make sure the target folder exists so saving works on a fresh checkout.
    if isinstance(file, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(file))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Save this specific figure rather than whatever pyplot considers current.
    fig.savefig(file)

In [28]:
# Chart mean tumor volume (with SEM error bars) across the treatment period.
chartTrialOutcomes(
    val_df=mean_tumor_volume_df,
    sem_df=sem_tumor_volume_df,
    title="Tumor Response to Treatment",
    xlabel="Time (Days)",
    ylabel="Tumor Volume (mm3)",
    file="charts/treatment.png",
    xlim_offset=[-2, 2],
    ylim_offset=[3, -1],
)

<IPython.core.display.Javascript object>

10


In [30]:
# Group once per (drug, timepoint) and keep only the metastatic-sites column.
# The column is selected BEFORE aggregating so that mean()/sem() never see the
# frame's other columns: on pandas >= 2.0 those reductions raise a TypeError
# when a non-numeric column is present (presumably "Mouse ID" holds string IDs).
metastatic_site_groups = combined_clinical_data_df.groupby(["Drug", "Timepoint"])[["Metastatic Sites"]]

# mean number of metastatic sites, pivoted to Timepoint rows x Drug columns
mean_metastatic_sites_df = pivotDF(df=metastatic_site_groups.mean(),
                                   index="Timepoint",
                                   cols="Drug",
                                   vals="Metastatic Sites")

# standard error of the mean (SEM) of metastatic sites, pivoted the same way
sem_metastatic_sites_df = pivotDF(df=metastatic_site_groups.sem(),
                                  index="Timepoint",
                                  cols="Drug",
                                  vals="Metastatic Sites")


In [31]:
# Chart the mean number of metastatic sites (with SEM error bars) over time.
chartTrialOutcomes(
    val_df=mean_metastatic_sites_df,
    sem_df=sem_metastatic_sites_df,
    title="Metastatic Spread during Treatment",
    xlabel="Treatment Duration (Days)",
    ylabel="Met. Sites",
    file="charts/spread.png",
    xlim_offset=[-2, 2],
    ylim_offset=[0.2, -0.1],
)



<IPython.core.display.Javascript object>

10


In [33]:
# Count the "Mouse ID" entries recorded for every drug at every timepoint, then
# pivot to Timepoint rows x Drug columns for time series analysis.
count_micesurvived_df = pivotDF(
    df=combined_clinical_data_df.groupby(["Drug", "Timepoint"])[["Mouse ID"]].count(),
    index="Timepoint",
    cols="Drug",
    vals="Mouse ID",
)

# Work on an independent (deep) copy so the raw counts stay available while the
# copy's values are replaced by per-drug survival rates.
rate_micesurvived_df = count_micesurvived_df.copy(deep=True)

def returnSurvivalRate(mice_at_start, count_at_timepoint):
    """Return the survival rate, in percent, at one timepoint of a trial.

    ``mice_at_start`` is the number of mice at the start of the trial and
    ``count_at_timepoint`` is the survival count read at the timepoint. The
    rate is one minus the fraction of mice lost, scaled to a percentage.
    """
    mice_lost = mice_at_start - count_at_timepoint
    fraction_lost = mice_lost / mice_at_start
    return (1 - fraction_lost) * 100

# Convert every drug column from raw counts to survival rates: the count at
# timepoint 0 is the starting population, and each value in the column is run
# through returnSurvivalRate() against it.
for drug in rate_micesurvived_df.columns:
    starting_count = rate_micesurvived_df.loc[0, drug]
    rate_micesurvived_df[drug] = rate_micesurvived_df[drug].map(
        lambda survivors: returnSurvivalRate(starting_count, survivors)
    )

In [35]:
# chartTrialOutcomes needs a frame of SEM values to size its error bars, but a
# survival rate derived from a simple count has none to show. Build a frame
# with the same row and column labels as rate_micesurvived_df in which every
# cell is 0, so the error bars collapse to nothing.
sem_ratesurvied_df = pd.DataFrame(
    0,
    index=rate_micesurvived_df.index,
    columns=rate_micesurvived_df.columns,
)
    

In [36]:
# Chart the survival rate per drug over time (error bars are all zero here).
chartTrialOutcomes(
    val_df=rate_micesurvived_df,
    sem_df=sem_ratesurvied_df,
    title="Survival during Treatment",
    xlabel="Time (Days)",
    ylabel="Survival Rate (%)",
    file="charts/survival.png",
    xlim_offset=[-2, 2],
    ylim_offset=[3, -2],
    legend_loc="lower left",
)

<IPython.core.display.Javascript object>

10
