In [1]:
# Render matplotlib figures as static images inline in the notebook
%matplotlib inline

In [2]:
# Import dependencies
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# View data sets to read.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# displayed list stable across machines and re-runs.
sorted(os.listdir("data"))

['clinicaltrial_data.csv', 'mouse_drug_data.csv']

In [4]:
# Read both CSV data sets into pandas DataFrames (paths are relative to the
# notebook's working directory)
clinical_data = pd.read_csv("data/clinicaltrial_data.csv")
mouse_data = pd.read_csv("data/mouse_drug_data.csv")

# Preview the first three rows of each frame; display() is needed for the
# first one because only a cell's last expression is rendered automatically
display(clinical_data.head(3))
mouse_data.head(3)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0


Unnamed: 0,Mouse ID,Drug
0,f234,Stelasyn
1,x402,Stelasyn
2,a492,Stelasyn


In [5]:
# Count the non-null values per column in both DataFrames; count() skips
# nulls, so a column whose count is lower than the others has missing values
display(clinical_data.count())
mouse_data.count()

Mouse ID              1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

Mouse ID    250
Drug        250
dtype: int64

In [6]:
# Attach each mouse's drug to its clinical measurements: a left join keyed on
# "Mouse ID" that keeps every row of the clinical data.
# NOTE(review): if a "Mouse ID" occurs more than once in mouse_data, this join
# repeats the matching clinical rows -- confirm the IDs there are unique.
all_data = clinical_data.merge(mouse_data, on="Mouse ID", how="left")

# Show the first five rows of the combined frame
all_data.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug
0,b128,0,45.0,0,Capomulin
1,f932,0,45.0,0,Ketapril
2,g107,0,45.0,0,Ketapril
3,a457,0,45.0,0,Ketapril
4,c819,0,45.0,0,Ketapril


### Create a scatter plot that shows how the tumor volume changes over time for each treatment.

In [7]:
# Mean "Tumor Volume (mm3)" for every (Drug, Timepoint) pair, flattened so
# that "Drug" and "Timepoint" are ordinary columns rather than index levels
tumor_vol_means = (
    all_data
    .groupby(["Drug", "Timepoint"])["Tumor Volume (mm3)"]
    .mean()
    .reset_index()
)

# Show the first five rows
tumor_vol_means.head()

Unnamed: 0,Drug,Timepoint,Tumor Volume (mm3)
0,Capomulin,0,45.0
1,Capomulin,5,44.266086
2,Capomulin,10,43.084291
3,Capomulin,15,42.064317
4,Capomulin,20,40.716325


In [8]:
# Standard error of the mean of "Tumor Volume (mm3)" within each
# (Drug, Timepoint) group, with the group keys moved back into columns
tumor_vol_sem = (
    all_data
    .groupby(["Drug", "Timepoint"])["Tumor Volume (mm3)"]
    .sem()
    .reset_index()
)

# Peek at the leading rows of the result
tumor_vol_sem.head()

Unnamed: 0,Drug,Timepoint,Tumor Volume (mm3)
0,Capomulin,0,0.0
1,Capomulin,5,0.448593
2,Capomulin,10,0.702684
3,Capomulin,15,0.838617
4,Capomulin,20,0.909731


In [9]:
# Column holding the per-group mean tumor volume, and the treatments to plot.
# Both are defined once so the per-drug series and the y-axis check below
# cannot drift apart.
volume_col = "Tumor Volume (mm3)"
plotted_drugs = ["Capomulin", "Infubinol", "Ketapril", "Placebo"]

# Mean tumor volume per timepoint for each treatment to be plotted.
# Rows and column are selected in a single .loc call instead of the chained
# .loc[rows][col] form; the resulting Series keep tumor_vol_means' row labels.
capomulin = tumor_vol_means.loc[tumor_vol_means["Drug"] == "Capomulin", volume_col]
infubinol = tumor_vol_means.loc[tumor_vol_means["Drug"] == "Infubinol", volume_col]
ketapril = tumor_vol_means.loc[tumor_vol_means["Drug"] == "Ketapril", volume_col]
placebo = tumor_vol_means.loc[tumor_vol_means["Drug"] == "Placebo", volume_col]

# Rows of tumor_vol_means belonging to the plotted treatments. Despite its
# name, ymax is this filtered DataFrame, not a single maximum value.
ymax = tumor_vol_means.loc[tumor_vol_means["Drug"].isin(plotted_drugs)]
# Display the row with the largest mean tumor volume among those treatments,
# as a guide for choosing the upper y-axis limit
ymax.loc[ymax[volume_col].idxmax()]

Drug                  Ketapril
Timepoint                   45
Tumor Volume (mm3)      70.663
Name: 39, dtype: object