## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import sem
import csv
import numpy as np


# Locations of the two CSV inputs, relative to the notebook's working directory
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two frames on their shared "Mouse ID" column
# (merge's default join type is used, exactly as before)
combined_data = mouse_metadata.merge(study_results, on="Mouse ID")

# Show the first rows of the merged frame
combined_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [2]:
# Number of distinct values in the "Mouse ID" column of the merged data.
mouse = combined_data["Mouse ID"].unique().size
mouse

249

In [3]:
# Non-null entry count for every column of the merged data.
combined_data.count(axis="index")

Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# keep=False flags every row of a repeated (Mouse ID, Timepoint) pair, not only the repeats.
duplicate_rows = combined_data.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = combined_data.loc[duplicate_rows, "Mouse ID"].unique()

# Index the merged data by Mouse ID only.
# The earlier call, set_index("Mouse ID", "Timepoint"), handed "Timepoint" to set_index's
# second parameter `drop` rather than adding it as a second key: older pandas treated the
# string as a truthy `drop`, and pandas 2.x rejects the extra positional argument with a
# TypeError. Passing the single key keeps `setdata` exactly as older pandas built it
# (Mouse ID as the index, Timepoint still a regular column).
setdata = combined_data.set_index("Mouse ID")

In [5]:
# Distinct mouse identifiers and distinct drug regimen names found in the merged data.
unique_mouse = pd.unique(combined_data["Mouse ID"])
unique_drug = pd.unique(combined_data["Drug Regimen"])

In [6]:
# Split the Mouse-ID-indexed data into one DataFrame per drug regimen.
# The regimen column is pulled out once and compared against each regimen name.
regimen_of_row = setdata["Drug Regimen"]

rami_drug = setdata.loc[regimen_of_row == "Ramicane"]
cap_drug = setdata.loc[regimen_of_row == "Capomulin"]
inf_drug = setdata.loc[regimen_of_row == "Infubinol"]
pla_drug = setdata.loc[regimen_of_row == "Placebo"]
ceft_drug = setdata.loc[regimen_of_row == "Ceftamin"]
stel_drug = setdata.loc[regimen_of_row == "Stelasyn"]
zoni_drug = setdata.loc[regimen_of_row == "Zoniferol"]
ket_drug = setdata.loc[regimen_of_row == "Ketapril"]
prop_drug = setdata.loc[regimen_of_row == "Propriva"]
naft_drug = setdata.loc[regimen_of_row == "Naftisol"]

In [7]:
# Keep only the first row seen for each Mouse ID, giving one row per mouse.
# NOTE(review): this does not remove a mouse whose (Mouse ID, Timepoint) pairs are
# duplicated; it collapses every mouse to a single row. Confirm this is the intended
# "clean" frame before using it for per-timepoint analysis.
mouse_data = combined_data.drop_duplicates(subset="Mouse ID", keep="first")

In [8]:
# Non-null entry count per column of the de-duplicated frame (one row per Mouse ID).
mouse_data.count(axis="index")

Mouse ID              249
Drug Regimen          249
Sex                   249
Age_months            249
Weight (g)            249
Timepoint             249
Tumor Volume (mm3)    249
Metastatic Sites      249
dtype: int64

## Summary Statistics

In [9]:
# Ramicane: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
rtum_volume = rami_drug["Tumor Volume (mm3)"]
rmean, rmedian, rvar, rsd = (
    stat(rtum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
r_error = sem(rtum_volume)

In [10]:
# Capomulin: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
ctum_volume = cap_drug["Tumor Volume (mm3)"]
cmean_numpy, cmedian_numpy, cvar_numpy, csd_numpy = (
    stat(ctum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
c_error = sem(ctum_volume)

In [11]:
# Infubinol: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
itum_volume = inf_drug["Tumor Volume (mm3)"]
imean_numpy, imedian_numpy, ivar_numpy, isd_numpy = (
    stat(itum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
i_error = sem(itum_volume)

In [12]:
# Placebo: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
ptum_volume = pla_drug["Tumor Volume (mm3)"]
pmean_numpy, pmedian_numpy, pvar_numpy, psd_numpy = (
    stat(ptum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
p_error = sem(ptum_volume)

In [13]:
# Ceftamin: mean, median, variance, standard deviation and SEM of the tumor volume.
# These results use a "ce" prefix. The single-letter "c" names (ctum_volume, cmean_numpy,
# cmedian_numpy, cvar_numpy, csd_numpy, c_error) already hold the Capomulin statistics, and
# reusing them here silently overwrote those values. The two-letter prefix mirrors how
# Propriva uses "pr" because "p" is taken by Placebo.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
cetum_volume = ceft_drug["Tumor Volume (mm3)"]
cemean_numpy = np.mean(cetum_volume)
cemedian_numpy = np.median(cetum_volume)
cevar_numpy = np.var(cetum_volume)
cesd_numpy = np.std(cetum_volume)
ce_error = sem(cetum_volume)

In [14]:
# Stelasyn: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
stum_volume = stel_drug["Tumor Volume (mm3)"]
smean_numpy, smedian_numpy, svar_numpy, ssd_numpy = (
    stat(stum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
s_error = sem(stum_volume)

In [15]:
# Zoniferol: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
ztum_volume = zoni_drug["Tumor Volume (mm3)"]
zmean_numpy, zmedian_numpy, zvar_numpy, zsd_numpy = (
    stat(ztum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
z_error = sem(ztum_volume)

In [16]:
# Ketapril: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
ktum_volume = ket_drug["Tumor Volume (mm3)"]
kmean_numpy, kmedian_numpy, kvar_numpy, ksd_numpy = (
    stat(ktum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
k_error = sem(ktum_volume)

In [17]:
# Propriva: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
prtum_volume = prop_drug["Tumor Volume (mm3)"]
prmean_numpy, prmedian_numpy, prvar_numpy, prsd_numpy = (
    stat(prtum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
pr_error = sem(prtum_volume)

In [18]:
# Naftisol: mean, median, variance, standard deviation and SEM of the tumor volume.
# np.var / np.std are called with their default ddof=0; scipy's sem defaults to ddof=1.
ntum_volume = naft_drug["Tumor Volume (mm3)"]
nmean_numpy, nmedian_numpy, nvar_numpy, nsd_numpy = (
    stat(ntum_volume) for stat in (np.mean, np.median, np.var, np.std)
)
n_error = sem(ntum_volume)

In [19]:
# Per-regimen mean of every numeric column, computed with a single groupby
# ("Tumor Volume (mm3)" is the column the summary statistics are about).
# The numeric columns are selected explicitly because the frame also holds string
# columns (Mouse ID, Sex): older pandas dropped those silently, while pandas 2.x
# raises a TypeError when asked to average them.
numeric_columns = combined_data.select_dtypes(include="number").columns.tolist()
combined_data.groupby("Drug Regimen")[numeric_columns].mean()

Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,13.456522,19.965217,21.565217,40.675741,0.713043
Ceftamin,13.247191,27.398876,19.747191,52.591172,1.179775
Infubinol,16.230337,27.196629,18.174157,52.884795,0.960674
Ketapril,15.659574,27.861702,19.707447,55.235638,1.297872
Naftisol,12.0,27.166667,19.623656,54.331565,1.182796
Placebo,10.734807,27.928177,18.674033,54.033581,1.441989
Propriva,10.89441,27.043478,16.863354,52.322552,0.975155
Ramicane,10.684211,19.679825,21.425439,40.216745,0.548246
Stelasyn,12.78453,27.856354,19.226519,54.233149,0.872928
Zoniferol,12.598901,27.692308,19.368132,53.236507,1.230769


In [20]:
# Per-regimen median of every numeric column.
# The numeric columns are selected explicitly because the frame also holds string
# columns (Mouse ID, Sex): older pandas dropped those silently, while pandas 2.x
# raises a TypeError when asked to aggregate them.
numeric_columns = combined_data.select_dtypes(include="number").columns.tolist()
combined_data.groupby("Drug Regimen")[numeric_columns].median()

Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,16.5,20.5,20.0,41.557809,0.0
Ceftamin,12.0,28.0,20.0,51.776157,1.0
Infubinol,20.0,27.0,15.0,51.820584,1.0
Ketapril,18.0,28.0,20.0,53.698743,1.0
Naftisol,9.0,27.0,20.0,52.509285,1.0
Placebo,10.0,28.0,15.0,52.288934,1.0
Propriva,8.0,26.0,15.0,50.854632,1.0
Ramicane,9.0,19.0,20.0,40.673236,0.0
Stelasyn,14.0,28.0,20.0,52.431737,1.0
Zoniferol,12.5,28.0,15.0,51.818479,1.0


In [21]:
# Per-regimen variance of every numeric column (pandas' default ddof=1).
# The numeric columns are selected explicitly because the frame also holds string
# columns (Mouse ID, Sex): older pandas dropped those silently, while pandas 2.x
# raises a TypeError when asked to aggregate them.
numeric_columns = combined_data.select_dtypes(include="number").columns.tolist()
combined_data.groupby("Drug Regimen")[numeric_columns].var()

Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,59.620372,7.466034,206.928043,24.947764,0.72079
Ceftamin,65.147591,2.501016,204.031772,39.290177,1.402527
Infubinol,56.404272,4.769028,181.53447,43.128684,1.054942
Ketapril,36.236432,3.392536,196.839089,68.553577,1.942883
Naftisol,45.102703,2.247748,201.208951,66.173479,1.479919
Placebo,40.384837,3.378146,192.954266,61.168083,1.792449
Propriva,53.357531,2.791848,181.506211,42.35107,1.174379
Ramicane,35.362393,10.465318,203.796178,23.486704,0.477838
Stelasyn,63.036648,2.701473,191.620626,59.450562,0.944874
Zoniferol,33.479115,2.0153,206.918979,48.533355,1.559711


In [22]:
# Per-regimen standard deviation of every numeric column (pandas' default ddof=1).
# The numeric columns are selected explicitly because the frame also holds string
# columns (Mouse ID, Sex): older pandas dropped those silently, while pandas 2.x
# raises a TypeError when asked to aggregate them.
numeric_columns = combined_data.select_dtypes(include="number").columns.tolist()
combined_data.groupby("Drug Regimen")[numeric_columns].std()

Unnamed: 0_level_0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,7.721423,2.732404,14.384994,4.994774,0.848993
Ceftamin,8.071406,1.58146,14.283969,6.268188,1.184283
Infubinol,7.510278,2.18381,13.473473,6.567243,1.027104
Ketapril,6.01967,1.841884,14.029935,8.279709,1.393873
Naftisol,6.715855,1.499249,14.184814,8.134708,1.216519
Placebo,6.354907,1.837973,13.890798,7.821003,1.338824
Propriva,7.304624,1.670882,13.472424,6.50777,1.083688
Ramicane,5.946629,3.235014,14.27572,4.846308,0.691259
Stelasyn,7.939562,1.643616,13.84271,7.710419,0.972046
Zoniferol,5.786114,1.419612,14.384679,6.966589,1.248884


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
