## Observations and Insights 

In [5]:

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from matplotlib.pyplot import figure
import numpy as np
import random 

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset, joining on the first column name
# the two files have in common. Index.intersection is the explicit set
# operation; using `&` between two Index objects for this is deprecated.
common_column = mouse_metadata.columns.intersection(study_results.columns)
master_data = pd.merge(mouse_metadata, study_results, on=common_column[0])

# The following cells read the merged frame under the name `Mouse_data`,
# so bind that name here as well; otherwise they fail with a NameError on a
# fresh kernel.
Mouse_data = master_data
master_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/Mouse_metadata.csv'

In [3]:
# Number of mice in the DataFrame: one group per distinct Mouse ID.
Mouse_num = Mouse_data.groupby("Mouse ID")
Mice_num = Mouse_num.ngroups
Mice_num

NameError: name 'Mouse_data' is not defined

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# Count how many rows were recorded for each (Mouse ID, Timepoint) pair,
# ordered by that count (every column of the count table holds the number of
# non-null rows, "Drug Regimen" is used as the representative one).
Mouse_time = Mouse_data.groupby(["Mouse ID", "Timepoint"])
Mice = Mouse_time.count().sort_values("Drug Regimen")

# A pair recorded more than once is a duplicate. Filtering on the count shows
# all of them, instead of relying on the last five rows of the sorted table.
Mice[Mice["Drug Regimen"] > 1]

In [4]:
# Optional: show every row recorded for the duplicated mouse ID, g989.
Mouse_data[Mouse_data["Mouse ID"].eq("g989")]

In [5]:
# Build the clean DataFrame: look up the row labels that belong to the
# duplicated mouse (g989) and drop exactly those rows.
is_duplicate_mouse = Mouse_data["Mouse ID"] == "g989"
rmv = Mouse_data.index[is_duplicate_mouse].values
Data_mouse = Mouse_data.drop(index=rmv)
Data_mouse


In [6]:
# Number of mice left in the clean DataFrame: one group per distinct Mouse ID.
Mouse_num_new = Data_mouse.groupby("Mouse ID")
Mice_num_new = Mouse_num_new.ngroups
Mice_num_new

## Summary Statistics

In [7]:
# Summary statistics (mean, median, variance, standard deviation, SEM) of the
# tumor volume for each drug regimen.

# Step-by-step version: compute each statistic as its own named Series, then
# place the five Series side by side, one row per regimen.
ss = Data_mouse.groupby("Drug Regimen")
tumor_volume = ss["Tumor Volume (mm3)"]

mean_drug = tumor_volume.mean().rename("Mean")
median_drug = tumor_volume.median().rename("Median")
var_drug = tumor_volume.var().rename("Variance")
std_drug = tumor_volume.std().rename("STD Dev")
sem_drug = tumor_volume.sem().rename("SEM")

sum_stat = pd.concat(
    [mean_drug, median_drug, var_drug, std_drug, sem_drug],
    axis="columns",
)
sum_stat

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby call.
# Only the tumor-volume column is aggregated: running mean/var/std/sem over
# the whole frame would also hit the string columns (Mouse ID, Sex), which
# current pandas rejects with a TypeError. The double brackets keep the
# result's columns two-level (column name, statistic), so the lookup below
# still selects the tumor-volume sub-table.
ss2 = (
    Data_mouse
    .groupby("Drug Regimen")[["Tumor Volume (mm3)"]]
    .agg(["mean", "median", "var", "std", "sem"])
)
ss2["Tumor Volume (mm3)"]

## Bar Plots

In [9]:
# Bar plot (pandas) of the number of distinct mice treated with each drug regimen.
drug_count = Data_mouse.groupby("Drug Regimen")
mcount = drug_count["Mouse ID"].nunique()

# Series.plot ignores a `y=` argument, so passing the label that way leaves
# the y-axis unlabelled. Set it on the Axes that the plot call returns.
bar_axes = mcount.plot.bar(rot=90, title="Rats Per Drug")
bar_axes.set_ylabel("Rat Count")

In [10]:
# Bar plot (pyplot) of the number of distinct mice treated with each drug regimen.
# Note: `ax` here is a DataFrame with one row per regimen ("Drug Regimen",
# "Mouse ID" count), not a matplotlib Axes.
ax = mcount.reset_index()
xax, yax = ax["Drug Regimen"], ax["Mouse ID"]

plt.bar(x=xax, height=yax, width=0.5)
plt.xticks(rotation=90)
plt.title("Rats per Drug")
plt.xlabel("Drug Regime")
plt.ylabel("Rat Count")

## Pie Plots

In [11]:
# Pie plot (pandas) of the female versus male split, counting each mouse once.
sex_count = Data_mouse.groupby("Sex")
scount = sex_count["Mouse ID"].nunique()
scount.plot(kind="pie", title="Sex Distribution")

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
pc = scount.reset_index()
pxax = pc["Mouse ID"]

# Take the wedge labels from the same rows as the counts, so a label can
# never be attached to the wrong wedge (hard-coded labels silently depend on
# the order in which groupby happened to sort the "Sex" values).
plt.pie(pxax, labels=pc["Sex"].tolist())
plt.title("Sex Distribution")

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens.

# Keep only the rows for the four regimens of interest. A single isin()
# filter replaces the chain of DataFrame.append calls (append was removed in
# pandas 2.0).
Haf_dat = Data_mouse[
    Data_mouse["Drug Regimen"].isin(["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])
]

# Start by getting the last (greatest) timepoint for each mouse.
# Using the per-mouse maximum, rather than filtering on Timepoint == 45,
# keeps mice whose final measurement was taken before timepoint 45.
last_timepoint = Haf_dat.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Look up the tumor volume recorded at that last timepoint.
# vol4 has one row per mouse: Mouse ID, Timepoint, Tumor Volume (mm3).
vol4 = pd.merge(
    last_timepoint,
    Haf_dat[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]],
    on=["Mouse ID", "Timepoint"],
    how="left",
)

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# The left join keeps every row of the clean data; "Tumor Volume (mm3)(end)"
# is filled only on each selected mouse's final-timepoint row and is NaN
# everywhere else.
Data_test = Data_mouse
Test_mouse = pd.merge(
    Data_test,
    vol4,
    on=["Mouse ID", "Timepoint"],
    how="left",
    suffixes=("", "(end)"),
)
Test_mouse

In [None]:
# Put treatments into a list for the loop (and later for plot labels).
# Listed explicitly so this cell does not depend on a variable left behind by
# a plotting cell.
D_rug = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
end_col = "Tumor Volume (mm3)(end)"

# For each regimen: collect the final tumor volumes, then compute the IQR and
# the 1.5*IQR fences for that regimen on its own. Pooling regimens (or
# timepoints) before computing the fences would hide or invent outliers.
final_by_drug = {}      # regimen name -> DataFrame of ("Drug Regimen", end_col)
outlier_records = []    # one summary row per regimen
for drug in D_rug:
    # Locate the rows which contain mice on this drug and get the final tumor
    # volumes; rows without a final-volume value are dropped.
    on_drug = Test_mouse[Test_mouse["Drug Regimen"] == drug]
    finals = on_drug[["Drug Regimen", end_col]].dropna()
    final_by_drug[drug] = finals

    # Determine outliers using upper and lower bounds.
    q1, q3 = finals[end_col].quantile([.25, .75])
    iqr = q3 - q1
    down_b = q1 - 1.5 * iqr
    up_b = q3 + 1.5 * iqr
    is_outlier = (finals[end_col] < down_b) | (finals[end_col] > up_b)

    outlier_records.append({
        "Drug Regimen": drug,
        "Q1": q1,
        "Q3": q3,
        "IQR": iqr,
        "Lower bound": down_b,
        "Upper bound": up_b,
        "Outliers": finals.loc[is_outlier, end_col].tolist(),
    })

# Per-regimen frames under the names the box-plot cell reads.
Box_cap = final_by_drug["Capomulin"]
Box_ram = final_by_drug["Ramicane"]
Box_inf = final_by_drug["Infubinol"]
Box_cef = final_by_drug["Ceftamin"]

# Stack the four subsets into one frame. concat simply appends rows; an outer
# merge keyed on the float volume column is not a reliable way to do that.
Box_dat = pd.concat([final_by_drug[drug] for drug in D_rug], ignore_index=True)

# One row per regimen: quartiles, fences, and any final volumes outside them.
outlier_summary = pd.DataFrame(outlier_records).set_index("Drug Regimen")
outlier_summary

In [14]:
# Box plot of the final tumor volume of each mouse, one box per regimen of interest.
fig1, g1 = plt.subplots()

# Label -> per-regimen frame, in the left-to-right order of the boxes.
final_volumes = {
    "Capomulin": Box_cap,
    "Ramicane": Box_ram,
    "Infubinol": Box_inf,
    "Ceftamin": Box_cef,
}

g1.set_ylabel("Tumor Volume (mm^3)(end)")
g1.boxplot([frame["Tumor Volume (mm3)(end)"] for frame in final_volumes.values()])
g1.set_xticklabels(list(final_volumes))

## Line and Scatter Plots

In [15]:
# Line plot of tumor volume against time point for one Capomulin-treated mouse (s185).
x_rat = Data_mouse.loc[
    Data_mouse["Mouse ID"] == "s185",
    ["Timepoint", "Tumor Volume (mm3)"],
]
timepoints = x_rat["Timepoint"]
volumes = x_rat["Tumor Volume (mm3)"]

plt.plot(timepoints, volumes)
plt.xlabel("Timepoint")
plt.ylabel("Tumor size (mm^3)")
plt.title("Capomulin effect on s185")

In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capo = Data_mouse[Data_mouse["Drug Regimen"] == "Capomulin"]
capo1 = capo.groupby("Mouse ID")

# Average per mouse. The two numeric columns are selected before calling
# mean(): averaging the whole group would also hit the string columns
# (Drug Regimen, Sex), which current pandas rejects with a TypeError.
capo3 = capo1[["Weight (g)", "Tumor Volume (mm3)"]].mean()

plt.scatter(capo3["Weight (g)"], capo3["Tumor Volume (mm3)"])
plt.title("Weight vs. Average Tumor Volume for Capomulin")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm^3)")



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
weight = capo3["Weight (g)"]
avg_volume = capo3["Tumor Volume (mm3)"]

# pearsonr returns (coefficient, p-value); keep the coefficient and print it,
# since a bare expression in the middle of a cell is never displayed.
capo_corr = st.pearsonr(weight, avg_volume)[0]
print(f"Correlation between mouse weight and average tumor volume: {capo_corr:.2f}")

# Degree-1 least-squares fit: slope m and intercept b.
m, b = np.polyfit(weight, avg_volume, 1)

plt.scatter(weight, avg_volume)
plt.plot(weight, m * weight + b, color="red", label=f"y = {m:.2f}x + {b:.2f}")
plt.title("Weight vs. Average Tumor Volume for Capomulin")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm^3)")
plt.legend()

In [None]:
#Observations
#1. Infubinol and Ceftamin did not help, and had a positive correlation with tumor size,
#   meaning they were worse than leaving the tumor alone.

#2. Weight / Average Tumor size had a positive correlation

#3. The excluded rat, g989, had two entries for many of the early timepoints, but had different tumor sizes for those
#   entries, meaning the potential for human error was low as they were too far apart. It's possible that
#   the duplicate entries were two different rats, entered as the same rat in error.