## Observations and Insights

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Read the mouse data and the study results straight from their CSV files
mouse_metadata = pd.read_csv("data/Mouse_metadata.csv")
study_results = pd.read_csv("data/Study_results.csv")

# Join the two tables on their shared "Mouse ID" column into a single dataset
master_data = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined dataset
master_data.head()

## Summary statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

tumor_volume = master_data["Tumor Volume (mm3)"]

# All regimen names found in the data, in order of first appearance
drugs = master_data["Drug Regimen"].unique()

# Regimens reported in the summary table, in display order
regimen_names = ['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
                 'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']

# Tumor-volume Series for each regimen, keyed by regimen name
volumes_by_regimen = {
    name: master_data[master_data["Drug Regimen"] == name]["Tumor Volume (mm3)"]
    for name in regimen_names
}

# Keep each regimen's Series available under its own name as well
ramicane = volumes_by_regimen["Ramicane"]
capomulin = volumes_by_regimen["Capomulin"]
infubinol = volumes_by_regimen["Infubinol"]
placebo = volumes_by_regimen["Placebo"]
ceftamin = volumes_by_regimen["Ceftamin"]
stelasyn = volumes_by_regimen["Stelasyn"]
zoniferol = volumes_by_regimen["Zoniferol"]
ketapril = volumes_by_regimen["Ketapril"]
propriva = volumes_by_regimen["Propriva"]
naftisol = volumes_by_regimen["Naftisol"]

# Same Series, as a list ordered like regimen_names, for building the table columns
regimen_volumes = [volumes_by_regimen[name] for name in regimen_names]

# Build the summary table as a dataframe, one row per regimen.
# Standard deviation and variance are the population statistics (ddof=0);
# SEM uses pandas' default (ddof=1).
summary_table = pd.DataFrame({
    "Drug Regimen": regimen_names,
    "Mean Tumor Volume (mm3)": [volumes.mean() for volumes in regimen_volumes],
    "Median Tumor Volume (mm3)": [volumes.median() for volumes in regimen_volumes],
    "Standard Deviation": [np.std(volumes, ddof=0) for volumes in regimen_volumes],
    "Variance": [np.var(volumes, ddof=0) for volumes in regimen_volumes],
    "SEM": [volumes.sem() for volumes in regimen_volumes],
})

summary_table

## Bar plots

In [None]:
# Count the recorded tumor-volume values for each regimen, in display order
num_data = [
    volumes.count()
    for volumes in (ramicane, capomulin, infubinol, placebo, ceftamin,
                    stelasyn, zoniferol, ketapril, propriva, naftisol)
]

# One bar position per regimen: 0, 1, ..., len(num_data) - 1
x_axis = np.arange(len(num_data))
tick_locations = list(x_axis)

# Label each position with its regimen name (same order as num_data)
plt.xticks(
    tick_locations,
    ['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin','Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
    rotation="vertical",
)

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

# Count the tumor-volume values per regimen directly from the data and order
# them like `drugs`, so every bar is labelled by its own index entry instead of
# relying on two separately maintained lists lining up.
regimen_counts = (
    master_data.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .count()
    .reindex(drugs)
)

# Series.plot draws the bars and uses the index (regimen names) as tick labels
ax = regimen_counts.plot(kind="bar", figsize=(20, 3), color='r', alpha=0.5, rot=90)

ax.set_title("Sample Size by Drug Regimen")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Number of Test Subjects")

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

plt.bar(x_axis, num_data, align="center")

# Name each bar on this figure; the labels follow the order in which num_data was built
plt.xticks(
    tick_locations,
    ['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin','Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
    rotation="vertical",
)

plt.title("Sample Size by Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Test Subjects")

## Pie plots

In [None]:
# Count distinct mice of each sex. A mouse appears on several rows of
# master_data (one per recorded timepoint), so counting rows would count the
# same mouse repeatedly; nunique() counts each Mouse ID once.
male = master_data[master_data["Sex"]=="Male"]["Mouse ID"].nunique()
female = master_data[master_data["Sex"]=="Female"]["Mouse ID"].nunique()
total = male + female

print(f"There are {female} female and {male} male mice out of the total population of {total}.")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Use the counts computed from the data rather than hard-coded numbers, so the
# chart stays correct if the input files change. Order matches `labels`.
sexes = [male, female]
labels = ["Male", "Female"]
colors = ["red", "lightskyblue"]
explode = (0.1, 0)  # pull the first wedge ("Male") slightly out of the pie

plt.pie(sexes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=False, startangle=140)

# Equal axis scaling keeps the pie circular
plt.axis("equal")

plt.title("Distribution of Male vs. Female")


## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.

#create separate dataframe by regimen
ramicane_df = master_data[master_data["Drug Regimen"]=="Ramicane"]
capomulin_df = master_data[master_data["Drug Regimen"]=="Capomulin"]
infubinol_df = master_data[master_data["Drug Regimen"]=="Infubinol"]
ceftamin_df = master_data[master_data["Drug Regimen"]=="Ceftamin"]

#return tumor volume for only the final timepoint
# NOTE: only rows recorded at timepoint 45 are kept, so a mouse with no
# measurement at that timepoint is not represented in these frames.
ram_final = pd.DataFrame(ramicane_df[ramicane_df["Timepoint"]==45]["Tumor Volume (mm3)"])
cap_final = pd.DataFrame(capomulin_df[capomulin_df["Timepoint"]==45]["Tumor Volume (mm3)"])
inf_final = pd.DataFrame(infubinol_df[infubinol_df["Timepoint"]==45]["Tumor Volume (mm3)"])
cef_final = pd.DataFrame(ceftamin_df[ceftamin_df["Timepoint"]==45]["Tumor Volume (mm3)"])

#combine into one dataframe
# pd.concat returns the stacked frame, which must be assigned to be kept
# (DataFrame.append also returned a new frame and no longer exists in pandas 2.x).
final_df = pd.concat([ram_final, cap_final, inf_final, cef_final], ignore_index=True)
final_df


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

# Narrow the data to the Capomulin regimen
capomulin_df = master_data.loc[master_data["Drug Regimen"] == "Capomulin"]

# From those rows, keep only the ones belonging to mouse s185
capomulin_s185 = capomulin_df.loc[capomulin_df["Mouse ID"] == "s185"]

# x values (timepoints) and y values (tumor volumes) for s185
s185_time = capomulin_s185["Timepoint"]
s185_tv = capomulin_s185["Tumor Volume (mm3)"]

# Draw the series as a line with a marker at every measurement
s185_plot = plt.plot(s185_time, s185_tv, marker="o", label="Test Subject s185")

# Title, axis labels and legend
plt.title("Time vs. Tumor Volume")
plt.xlabel("Time")
plt.ylabel("Tumor Volume")
plt.legend(loc="lower left")

plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Average weight and tumor volume for each Capomulin mouse, indexed by Mouse ID
capomulin_means = (
    master_data[master_data["Drug Regimen"] == "Capomulin"]
    .groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]]
    .mean()
)

# Observed per-mouse values to plot and to fit
weight = capomulin_means["Weight (g)"]
tumor_volume = capomulin_means["Tumor Volume (mm3)"]

plt.scatter(weight, tumor_volume, marker="o")

#add labels
plt.title("Mouse Weight vs. Avg Tumor Volume")
plt.xlabel("Mouse Weight")
plt.ylabel("Avg Tumor Volume")

#add linear regression model for mouse weight and average tumor volume for the Capomulin regimen
(slope, intercept, rvalue, pvalue, stderr) = linregress(weight, tumor_volume)

# Fitted values get their own name so `tumor_volume` keeps holding the observed
# averages for anything that reads it afterwards.
regress_values = weight * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.plot(weight, regress_values, "r-")

# Place the equation relative to the axes so it is visible whatever the data range
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")

plt.show()

In [None]:
# Calculate the correlation coefficient

# Compute the per-mouse averages here so the result is always the correlation
# between observed weight and observed average tumor volume, independent of
# whatever earlier cells left bound to `weight` / `tumor_volume`.
capomulin_avgs = (
    master_data[master_data["Drug Regimen"] == "Capomulin"]
    .groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]]
    .mean()
)

# pearsonr returns (coefficient, p-value); only the coefficient is reported
correlation = st.pearsonr(capomulin_avgs["Weight (g)"], capomulin_avgs["Tumor Volume (mm3)"])[0]
print(f"The correlation coefficient between mouse weight and average tumor volume is {round(correlation,2)}.")

