In [None]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
from scipy.stats import sem

get_ipython().magic('matplotlib notebook')

# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')

In [1]:
#Merge the two dataframes so we're only working with one file



#Rename the tumor volume column because it's annoying
pymaceuticals_complete = drug_results_df.rename(columns={'Tumor Volume (mm3)': 'Volume'})  

#Drop all drugs except for Capomulin, Infubinol, Ketapril, and Placebo because these are the only ones we need to analyze

drug_results_df = drug_results_df[drug_results_df['Drug'].isin(['Capomulin', 'Infubinol', 'Ketapril', 'Placebo'])]
drug_results_df.head(10)

NameError: name 'drug_results_df' is not defined

In [None]:
# Store the Mean Tumor Volume Data Grouped by Drug and Timepoint 

#First get rid of the number of metastatic sites and mice for this analysis because we don't need them

drug_results_tumor_df = pd.DataFrame({'Drug' : drug_results_df['Drug'], 'Timepoint' : drug_results_df['Timepoint'], 
                                     'Volume' : drug_results_df['Volume']})

#Get the average tumor volume for each drug/timepoint group and then put results into a new dataframe
drug_results_grp = drug_results_tumor_df.groupby(['Drug', 'Timepoint'])
drug_results_grp.mean()

drug_tumor_sum_df = pd.DataFrame(drug_results_grp.mean().reset_index())
drug_tumor_sum_df.head(10)

#rename the Volume column Average Volume

drug_tumor_sum_df.columns = ['Drug', 'Timepoint', 'Average Volume']
drug_tumor_sum_df.head(10)

In [None]:
# Store the Standard Error of Tumor Volumes Grouped by Drug and Timepoint


sem_drug = drug_tumor_sum_df.groupby('Drug').sem()

drug_results_grp = drug_results_tumor_df.groupby(['Drug', 'Timepoint']).sem()
drug_results_grp

#Convert the grouped object into a dataframe and then display a preview
drug_tumor_sem_df = pd.DataFrame(drug_results_grp.reset_index())
drug_tumor_sem_df.head(20)

#rename the Volume column to SEM

drug_tumor_sem_df.columns = ['Drug', 'Timepoint', 'SEM Volume']
drug_tumor_sem_df.head(10)

In [None]:
# Minor Data Munging to Re-Format the Data Frames

#Pivot the AVG volume dataframe so that the drugs are the columns and the index is the timepoint
drug_tumor_avg_df = drug_tumor_sum_df.pivot(index='Timepoint', columns = 'Drug', values = 'Average Volume')
drug_tumor_avg_df

#Pivot the SEM volume dataframe so that the drugs are the columns and the index is the timepoint
drug_tumor_sem_df = drug_tumor_sem_df.pivot(index='Timepoint', columns = 'Drug', values = 'SEM Volume')
drug_tumor_sem_df

In [None]:
#Build lists of mean and sem values for each drug.  These are needed 
#so each drug can be graphed separately and we need to send
#lists into the plot method.

#Capomulin
capo_means = drug_tumor_avg_df.loc[:, ['Capomulin']]
capo_means = capo_means['Capomulin']

capo_sem = drug_tumor_sem_df.loc[:, ['Capomulin']]
capo_sem = capo_sem['Capomulin']

#Infubinol
infub_means = drug_tumor_avg_df.loc[:, ['Infubinol']]
infub_means = infub_means['Infubinol']

infub_sem = drug_tumor_sem_df.loc[:, ['Infubinol']]
infub_sem = infub_sem['Infubinol']

#Ketapril
keta_means = drug_tumor_avg_df.loc[:, ['Ketapril']]
keta_means = keta_means['Ketapril']

keta_sem = drug_tumor_sem_df.loc[:, ['Ketapril']]
keta_sem = keta_sem['Ketapril']

#Placebo
placebo_means = drug_tumor_avg_df.loc[:, ['Placebo']]
placebo_means = placebo_means['Placebo']

placebo_sem = drug_tumor_sem_df.loc[:, ['Placebo']]
placebo_sem = placebo_sem['Placebo']

In [None]:
# Generate the Plot (with Error Bars)

#Get min and max values for the avg tumor volume so we set limits on the plot
min_volume = drug_tumor_sum_df.min()['Average Volume']
max_volume = drug_tumor_sum_df.max()['Average Volume']

#add/subtract 5 from the min and max volumes to give some additional white space on the plot
min_volume = min_volume - 5
max_volume = max_volume + 5

#Use the hls color space from Seaborn to give the plot a prettier look
current_palette_4 = sns.color_palette("hls", 4)
sns.set_palette(current_palette_4)

#Assign limits, turn on the background grid and create a title and labels
plt.grid(True)
plt.ylim(min_volume, max_volume)
plt.ylabel("Tumor Volume (mm3)")
plt.xlabel("Treatment Duration (Days)")
plt.title("Tumor Response to Treatment")

#Plot the average tumor size over time for each drug (with error bars)

#The values for the x-axis will be the timepoints
time = list(drug_tumor_avg_df.index.values)

#Plot Capomulin
plt.errorbar(time, capo_means, capo_sem, marker = "*", ms = 10, label = "Capomulin")

#Plot Infubinol
plt.errorbar(time, infub_means, infub_sem, marker = "s", ms = 8, label = "Infubinol")

#Plot Ketapril
plt.errorbar(time, keta_means, keta_sem, marker = "o", ms= 8, label = "Ketapril")

#Plot Placebo
plt.errorbar(time, placebo_means, placebo_sem, marker = "x", ms= 8, label = "Placebo")

#Add a legend, save the plot as a png file and then show the plot
plt.legend(frameon=True)

plt.savefig('Tumor Response.png') 

plt.show()

In [None]:
#Calculate the mean of the metastatic sites grouped by Drug and Timepoint

#First get rid of the volume and mice for this analysis because we don't need them
drug_results_met_df = pd.DataFrame({'Drug' : drug_results_df['Drug'], 'Timepoint' : drug_results_df['Timepoint'], 
                                     'Metastatic Sites' : drug_results_df['Metastatic Sites']})

#drug_results_met_df.head(20)

# Store the Mean Met. Site Data Grouped by Drug and Timepoint 
drug_mets_grp = drug_results_met_df.groupby(['Drug', 'Timepoint'])
drug_mets_grp.mean()

#Convert grouped results into a new dataframe and reset the index
drug_met_avg_df = pd.DataFrame(drug_mets_grp.mean().reset_index())
#drug_met_avg_df.head(20)

#rename the mets column Average Met Sites and preview the dataframe
drug_met_avg_df.columns = ['Drug', 'Timepoint', 'Average Met Sites']
drug_met_avg_df.head(10)

#Get the min and max of the average met sites to use as limits in our plot
min_mets = drug_met_avg_df.min()['Average Met Sites']
max_mets = drug_met_avg_df.max()['Average Met Sites']

In [None]:
## Store the Standard Error associated with Met. Sites Grouped by Drug and Timepoint 

met_sem_grp = drug_results_met_df.groupby(['Drug', 'Timepoint']).sem()

#Convert the grouped object into a dataframe and then display a preview
drug_mets_sem_df = pd.DataFrame(met_sem_grp.reset_index())
drug_mets_sem_df.head(20)

#rename the Metastatic Sites column to SEM
drug_mets_sem_df.columns = ['Drug', 'Timepoint', 'SEM Met Sites']
drug_mets_sem_df.head(20)

In [None]:
# Minor Data Munging to Re-Format the Data Frames

# Preview that Reformatting worked

#Pivot the AVG mets dataframe so that the drugs are the columns and the index is the timepoint
drug_met_avg_df = drug_met_avg_df.pivot(index='Timepoint', columns = 'Drug', values = 'Average Met Sites')
drug_met_avg_df.head(10)

#Pivot the SEM volume dataframe so that the drugs are the columns and the index is the timepoint
drug_mets_sem_df = drug_mets_sem_df.pivot(index='Timepoint', columns = 'Drug', values = 'SEM Met Sites')
drug_mets_sem_df.head(10)

In [None]:
#Build lists of mean and sem values for each drug.  These are needed 
#so each drug can be graphed separately and we need to send
#lists into the plot method.

#Capomulin
capo_met_means = drug_met_avg_df.loc[:, ['Capomulin']]
capo_met_means = capo_met_means['Capomulin']

capo_met_sem = drug_mets_sem_df.loc[:, ['Capomulin']]
capo_met_sem = capo_met_sem['Capomulin']

#Infubinol
infub_met_means = drug_met_avg_df.loc[:, ['Infubinol']]
infub_met_means = infub_met_means['Infubinol']

infub_met_sem = drug_mets_sem_df.loc[:, ['Infubinol']]
infub_met_sem = infub_met_sem['Infubinol']

#Ketapril
keta_met_means = drug_met_avg_df.loc[:, ['Ketapril']]
keta_met_means = keta_met_means['Ketapril']

keta_met_sem = drug_mets_sem_df.loc[:, ['Ketapril']]
keta_met_sem = keta_met_sem['Ketapril']

#Placebo
placebo_met_means = drug_met_avg_df.loc[:, ['Placebo']]
placebo_met_means = placebo_met_means['Placebo']

placebo_met_sem = drug_mets_sem_df.loc[:, ['Placebo']]
placebo_met_sem = placebo_met_sem['Placebo']
placebo_met_sem

In [None]:
#Plot the metastatic spread over time for each drug and include error bars

#Use the husl color palette from Seaborn
current_palette_8 = sns.color_palette("husl", 8)
sns.set_palette(current_palette_8)

#Add a bit to the max mets so that the lines aren't going off the edge of the graph
max_mets = max_mets + 0.5

#Set limits, turn on background gridlines, create labels
plt.grid(True)
plt.ylim(min_mets, max_mets)
plt.ylabel("# Of Met Sites")
plt.xlabel("Treatment Duration (Days)")
plt.title("Metastatic Spread During Treatment")

#Set the timepoints up on the x-axis
time = list(drug_met_avg_df.index.values)

#Plot Capomulin
plt.errorbar(time, capo_met_means, capo_met_sem, marker = "*", ms = 10, label = "Capomulin")

#Plot Infubinol
plt.errorbar(time, infub_met_means, infub_met_sem, marker = "s", ms = 8, label = "Infubinol")

#Plot Ketapril
plt.errorbar(time, keta_met_means, keta_met_sem, marker = "o", ms = 8, label = "Ketapril")

#Plot Placebo
plt.errorbar(time, placebo_met_means, placebo_met_sem, marker = "x", ms = 8, label = "Placebo")

#Add a legend, save the plot as a png file and then show the plot
plt.legend(frameon=True)

plt.savefig('Metastatic Spread.png') 

plt.show()

In [None]:
# Store the Count of Mice Grouped by Drug and Timepoint

#First create a new dataframe that only includes the mice, drug and timepoint columns
drug_results_survival_df = pd.DataFrame({'Mice' : drug_results_df['Mouse ID'],
                                         'Drug' : drug_results_df['Drug'], 'Timepoint' : drug_results_df['Timepoint']})
drug_results_survival_df.head()

#group the dataframe by drug and timepoint
data_mice_count = drug_results_survival_df.groupby(['Drug', 'Timepoint'])

#convert the grouped object into a dataframe and reset the index
survival_count_df = pd.DataFrame(data_mice_count.count().reset_index())

#rename the Mice column to # of Surviving Mice
survival_count_df = survival_count_df.rename(columns={'Mice' : '# of Surviving Mice'})
survival_count_df.head()

In [None]:
#Create a new dataframe that shows the survival rate as a percentage
survival_percent_df = pd.DataFrame({"Drug" : survival_count_df['Drug'], "Timepoint": survival_count_df['Timepoint'],
                                    "Survival %" : (survival_count_df['# of Surviving Mice'] / 25) * 100})
survival_percent_df.head()

In [None]:
#Pivot the count of mice survival dataframe so that the drugs are the columns and the index is the timepoint

survival_percent_df = survival_percent_df.pivot(index='Timepoint', columns = 'Drug', values = 'Survival %')

#Preview the dataframe
survival_percent_df

In [None]:

# Generate the Plot (Accounting for percentages)

current_palette_8 = sns.color_palette("husl", 8)
sns.set_palette(current_palette_8)

#get the minimum survival rate for our y-axis limit
min_survival_percent = survival_percent_df.min()
min_survival_percent = min_survival_percent.min()

#lower the minimum so our lines don't go off the sides of the figure
min_rate = min_survival_percent - 12

#Assign the x-axis values
x_axis = list(survival_percent_df.index.values)

#Assign the survival rates for each drug to variables
capo_survival_percent = survival_percent_df.loc[:, ['Capomulin']]
infub_survival_percent = survival_percent_df.loc[:, ['Infubinol']]
keta_survival_percent = survival_percent_df.loc[:, ['Ketapril']]
placebo_survival_percent = survival_percent_df.loc[:, ['Placebo']]

#Create the plot
plt.grid(True)
plt.ylim(min_survival_percent, 105)
plt.ylabel("Survival Rate (%)")
plt.xlabel("Treatment Duration (Days)")
plt.title("Survival During Treatment")
plt.plot(x_axis, capo_survival_percent, marker = "*", ms = 10, label = "Capomulin")
plt.plot(x_axis, infub_survival_percent, marker = "s", ms = 8, label = "Infubinol")
plt.plot(x_axis, keta_survival_percent, marker = "o", ms = 8, label = "Ketapril")
plt.plot(x_axis, placebo_survival_percent, marker = "x", ms = 8, label = "Placebo")

#Add a legend and specify that we want a tight layout
plt.legend(frameon=True)
plt.tight_layout()

# Save the Figure
plt.savefig('Survival Rates.png') 

#Display the figure
plt.show()

In [None]:
# Calculate the percent tumor volume changes for the 4 drugs

#Use the tumor volume average dataframe that was created in a prior step.
#Use the pct_change method to calculate the percentage change from the
#beginning of the drug trial to the end.  From 0 to 45 is 9 timepoints.  

volume_pct_changes = drug_tumor_avg_df.pct_change(periods=9).dropna()
volume_pct_changes = volume_pct_changes.reset_index()
print(volume_pct_changes)

#Figure out what the index is on the new dataframe
indexv = volume_pct_changes.keys()
#print(indexv)

In [None]:
#Store all of the percentage volume changes in a tuple
capo_vol_change = round(volume_pct_changes['Capomulin'][0] * 100, 2)

infub_vol_change = round(volume_pct_changes['Infubinol'][0] * 100, 2)

keta_vol_change = round(volume_pct_changes['Ketapril'][0] * 100, 2)

placebo_vol_change = round(volume_pct_changes['Placebo'][0] * 100, 2)

#Create a list that contains all of the above tuples.  The list will be used in our barchart.
tumor_vol_changes = [capo_vol_change, infub_vol_change, keta_vol_change, placebo_vol_change]
tumor_vol_changes


In [None]:
#Store all of the drug names in an array and drop the Timepoint column.
#The drug names will be used on the x-axis of the bar chart.
drugs_array = volume_pct_changes.columns.values
drugs = np.delete(drugs_array, [0])

#Set up the x-axis so that it represents each drug included in our analysis
x_axis = np.arange(len(drugs))

#set variables that determine if the tumor volume change is above or below 0.
#We need to do this because values below 0 will be in a different bar chart.
threshold = 0
above_threshold = np.maximum(tumor_vol_changes, threshold)
below_threshold = np.minimum(tumor_vol_changes, threshold)

#BUILD THE PLOT
#If the tumors grew (avg growth percentage was above zero), that means we have
#a bad result for the drug.  Turn the bars red in this case.  If the tumors
#shrunk (avg growth percentage was negative), turn the bars green.

#Assign 3 to the zorder argument so that the bars are in front of the grid.
#Make the width of the bars 0.5 so that there is space between them.

fig, ax = plt.subplots()
ax.bar(x_axis, below_threshold, 0.5, color="g", zorder=3)
ax.bar(x_axis, above_threshold, 0.5, color="r", zorder=3)

#create title and x,y labels
ax.set_title("Tumor Change Over 45 Day Treatment")
ax.set_ylabel("% Tumor Volume Change")

#create an hline to mark the 0 threshold
ax.axhline(0, color='grey')

#put a grid on the graph.  Use zorder of 0 so that it stays behind the bars.
ax.grid(zorder=0)

#Put the percentages on each bar
labels = [str(round(capo_vol_change)) + "%", str(round(infub_vol_change)) + "%", str(round(keta_vol_change)) + "%", 
                  str(round(placebo_vol_change)) + "%"]

rects = ax.patches

for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label, ha='center', va='bottom')


#Put the drug names on the x-axis
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drugs)
        
# Save the Figure
plt.savefig('Tumor Change After 45 Days.png') 

#Display the figure
plt.show()