In [None]:

In [ ]:

'''
Observations and Insights:
#1 The testing was pretty even in selecting genders for the experiment
#2 There really was no outlier in each of the four drug regiments we took a look at
#3 The average tumor volume has a positive correlation with the weight of the mouse that were on the Capomulin regimen
'''

In [1]:

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merge_data = pd.merge(mouse_metadata, study_results, how = "left", on = "Mouse ID")
# Display the data table for preview
merge_data.head()

Out[1]:
	Mouse ID 	Drug Regimen 	Sex 	Age_months 	Weight (g) 	Timepoint 	Tumor Volume (mm3) 	Metastatic Sites
0 	k403 	Ramicane 	Male 	21 	16 	0 	45.000000 	0
1 	k403 	Ramicane 	Male 	21 	16 	5 	38.825898 	0
2 	k403 	Ramicane 	Male 	21 	16 	10 	35.014271 	1
3 	k403 	Ramicane 	Male 	21 	16 	15 	34.223992 	1
4 	k403 	Ramicane 	Male 	21 	16 	20 	32.997729 	1
In [2]:

# Checking the number of mice.
merge_data["Mouse ID"].nunique()

Out[2]:

249

In [3]:

# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
timepoint_count = merge_data["Timepoint"].nunique()
duplicate_mice = pd.concat(x for _, x in merge_data.groupby("Mouse ID") if len(x) > timepoint_count)
duplicate_mice

Out[3]:
	Mouse ID 	Drug Regimen 	Sex 	Age_months 	Weight (g) 	Timepoint 	Tumor Volume (mm3) 	Metastatic Sites
908 	g989 	Propriva 	Female 	21 	26 	0 	45.000000 	0
909 	g989 	Propriva 	Female 	21 	26 	0 	45.000000 	0
910 	g989 	Propriva 	Female 	21 	26 	5 	48.786801 	0
911 	g989 	Propriva 	Female 	21 	26 	5 	47.570392 	0
912 	g989 	Propriva 	Female 	21 	26 	10 	51.745156 	0
913 	g989 	Propriva 	Female 	21 	26 	10 	49.880528 	0
914 	g989 	Propriva 	Female 	21 	26 	15 	51.325852 	1
915 	g989 	Propriva 	Female 	21 	26 	15 	53.442020 	0
916 	g989 	Propriva 	Female 	21 	26 	20 	55.326122 	1
917 	g989 	Propriva 	Female 	21 	26 	20 	54.657650 	1
918 	g989 	Propriva 	Female 	21 	26 	25 	56.045564 	1
919 	g989 	Propriva 	Female 	21 	26 	30 	59.082294 	1
920 	g989 	Propriva 	Female 	21 	26 	35 	62.570880 	2
In [4]:

# Optional: Get all the data for the duplicate mouse ID. 
dup_data = merge_data[merge_data.duplicated(["Mouse ID", "Timepoint"], keep = "first")]
dup_data

Out[4]:
	Mouse ID 	Drug Regimen 	Sex 	Age_months 	Weight (g) 	Timepoint 	Tumor Volume (mm3) 	Metastatic Sites
909 	g989 	Propriva 	Female 	21 	26 	0 	45.000000 	0
911 	g989 	Propriva 	Female 	21 	26 	5 	47.570392 	0
913 	g989 	Propriva 	Female 	21 	26 	10 	49.880528 	0
915 	g989 	Propriva 	Female 	21 	26 	15 	53.442020 	0
917 	g989 	Propriva 	Female 	21 	26 	20 	54.657650 	1
In [5]:

# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_data = merge_data.drop_duplicates(["Mouse ID", "Timepoint"], keep = "first")
clean_data.head()

Out[5]:
	Mouse ID 	Drug Regimen 	Sex 	Age_months 	Weight (g) 	Timepoint 	Tumor Volume (mm3) 	Metastatic Sites
0 	k403 	Ramicane 	Male 	21 	16 	0 	45.000000 	0
1 	k403 	Ramicane 	Male 	21 	16 	5 	38.825898 	0
2 	k403 	Ramicane 	Male 	21 	16 	10 	35.014271 	1
3 	k403 	Ramicane 	Male 	21 	16 	15 	34.223992 	1
4 	k403 	Ramicane 	Male 	21 	16 	20 	32.997729 	1
In [6]:

# Checking the number of mice in the clean DataFrame.
clean_data["Mouse ID"].nunique()

Out[6]:

249

Summary Statistics
In [7]:

# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug = clean_data.groupby("Drug Regimen")

mean = drug["Tumor Volume (mm3)"].mean()
median = drug["Tumor Volume (mm3)"].median()
var = drug["Tumor Volume (mm3)"].var()
std = drug["Tumor Volume (mm3)"].std()
sem = drug["Tumor Volume (mm3)"].sem()

#creating the dataframe for the mean, median, variance, standard deviation, and SEM
drug_df = pd.DataFrame({"Mean" : mean,
                        "Median" : median,
                        "Variance" : var,
                        "Standard Deviation" : std,
                        "SEM" : sem})
drug_df

Out[7]:
	Mean 	Median 	Variance 	Standard Deviation 	SEM
Drug Regimen 					
Capomulin 	40.675741 	41.557809 	24.947764 	4.994774 	0.329346
Ceftamin 	52.591172 	51.776157 	39.290177 	6.268188 	0.469821
Infubinol 	52.884795 	51.820584 	43.128684 	6.567243 	0.492236
Ketapril 	55.235638 	53.698743 	68.553577 	8.279709 	0.603860
Naftisol 	54.331565 	52.509285 	66.173479 	8.134708 	0.596466
Placebo 	54.033581 	52.288934 	61.168083 	7.821003 	0.581331
Propriva 	52.393463 	50.909965 	43.138803 	6.568014 	0.525862
Ramicane 	40.216745 	40.673236 	23.486704 	4.846308 	0.320955
Stelasyn 	54.233149 	52.431737 	59.450562 	7.710419 	0.573111
Zoniferol 	53.236507 	51.818479 	48.533355 	6.966589 	0.516398
Bar and Pie Charts
In [8]:

# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
drug = pd.DataFrame(drug.count()).reset_index()
drug = drug.rename(columns = {"Mouse ID" : "Mice"})

In [9]:

drug.plot.bar(x = "Drug Regimen", y = "Mice", figsize = (10,3))
plt.title("Total Mice per Treatment")
plt.ylabel("Total Mice")
plt.show()

In [10]:

# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = drug["Drug Regimen"].tolist()
y_axis = drug["Mice"].tolist()

plt.figure(figsize = (10,3))
plt.bar(x_axis, y_axis, color = "blue", alpha = 0.5, width = .5, align = "center")
plt.title("Total Mice per Treatment")
plt.xlabel("Drug Reginmen")
plt.xticks(rotation = "vertical")
plt.ylabel("Count")
plt.legend(["Mice"], loc = "upper right")
plt.show()

In [11]:

# Generate a pie plot showing the distribution of female versus male mice using pandas
gender_data = pd.DataFrame(clean_data.groupby(["Sex"]).count()).reset_index()
gender_data = gender_data[["Sex", "Mouse ID"]]
gender_data = gender_data.rename(columns = {"Mouse ID" : "Mice"})
gender_data.plot.pie(y = "Mice", autopct = "%1.1f%%", startangle = 45, labels = gender_data["Sex"], legend = False)
plt.show()

In [12]:

gender_data

Out[12]:
	Sex 	Mice
0 	Female 	930
1 	Male 	958
In [13]:

# Generate a pie plot showing the distribution of female versus male mice using pyplot
count = gender_data["Mice"].tolist()
labels = gender_data["Sex"].tolist()
colors = ["blue", "orange"]

plt.pie(count, labels = labels, colors = colors,  autopct = "%1.1f%%", startangle = 45)
plt.show()

Quartiles, Outliers and Boxplots
In [14]:

# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
greatest_tp = clean_data.loc[clean_data["Drug Regimen"].isin(["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])]
# Start by getting the last (greatest) timepoint for each mouse
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
greatest_tp = greatest_tp.loc[greatest_tp["Timepoint"] == 45]
greatest_tp.reset_index()

Out[14]:
	index 	Mouse ID 	Drug Regimen 	Sex 	Age_months 	Weight (g) 	Timepoint 	Tumor Volume (mm3) 	Metastatic Sites
0 	9 	k403 	Ramicane 	Male 	21 	16 	45 	22.050126 	1
1 	19 	s185 	Capomulin 	Female 	3 	17 	45 	23.343598 	1
2 	29 	x401 	Capomulin 	Female 	16 	15 	45 	28.484033 	0
3 	39 	m601 	Capomulin 	Male 	22 	17 	45 	28.430964 	1
4 	49 	g791 	Ramicane 	Male 	11 	16 	45 	29.128472 	1
... 	... 	... 	... 	... 	... 	... 	... 	... 	...
58 	1797 	x822 	Ceftamin 	Male 	3 	29 	45 	61.386660 	3
59 	1813 	y163 	Infubinol 	Female 	17 	27 	45 	67.685569 	3
60 	1845 	y769 	Ceftamin 	Female 	6 	27 	45 	68.594745 	4
61 	1855 	y865 	Ceftamin 	Male 	23 	26 	45 	64.729837 	3
62 	1872 	z581 	Infubinol 	Female 	24 	25 	45 	62.754451 	3

63 rows × 9 columns
In [15]:

# Put treatments into a list for for loop (and later for plot labels)
treatment = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
    
    # Locate the rows which contain mice on each drug and get the tumor volumes
for drug in treatment:
    treatment_vol = greatest_tp["Tumor Volume (mm3)"].loc[greatest_tp["Drug Regimen"] == drug]
    
    # add subset 
    tumor_vol.append(treatment_vol)

    # Determine outliers using upper and lower bounds
    quartiles = treatment_vol.quantile([.25,.5,.75])
    lower_quartile = quartiles[0.25]
    upper_quartile = quartiles[0.75]
    IQR = upper_quartile - lower_quartile
    print(f'For {drug}, Interquartile Range (IQR) is {round(IQR,2)}')
    lower_bound = lower_quartile - (1.5 * IQR)
    upper_bound = upper_quartile + (1.5 * IQR)
    print(f"For {drug}, values below {round(lower_bound,2)} could be outliers")
    print(f"For {drug}, values above {round(upper_bound,2)} could be outliers\n")

For Capomulin, Interquartile Range (IQR) is 7.78
For Capomulin, values below 20.7 could be outliers
For Capomulin, values above 51.83 could be outliers

For Ramicane, Interquartile Range (IQR) is 7.53
For Ramicane, values below 19.69 could be outliers
For Ramicane, values above 49.8 could be outliers

For Infubinol, Interquartile Range (IQR) is 4.93
For Infubinol, values below 55.36 could be outliers
For Infubinol, values above 75.08 could be outliers

For Ceftamin, Interquartile Range (IQR) is 6.09
For Ceftamin, values below 52.29 could be outliers
For Ceftamin, values above 76.67 could be outliers

In [16]:

# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volumes per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Tumor Volume (mm3)")
ax1.boxplot(tumor_vol)

plt.show()

Line and Scatter Plots
In [17]:

# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
mouse_data = clean_data.loc[clean_data["Drug Regimen"] == "Capomulin"]
single_mouse_data = mouse_data.loc[mouse_data["Mouse ID"] == "m601"]

plt.plot(single_mouse_data["Timepoint"], single_mouse_data["Tumor Volume (mm3)"], marker = "o")
plt.title("Mouse: M601's Tumor Volume Over Time")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

In [18]:

# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
cap_data = greatest_tp[greatest_tp["Drug Regimen"].isin(["Capomulin"])]
cap_data = cap_data.reset_index()
cap_data = cap_data.loc[:, ["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]]
avg_cap_data = pd.DataFrame(cap_data.groupby(["Mouse ID", "Weight (g)"])["Tumor Volume (mm3)"].mean()).reset_index()
avg_cap_data = avg_cap_data.rename(columns = {"Tumor Volume (mm3)" : "Avg Tumor Volume (mm3)"})
avg_cap_data.set_index("Mouse ID")

plt.scatter(avg_cap_data["Weight (g)"], avg_cap_data["Avg Tumor Volume (mm3)"])
plt.title("Avg Tumor Volume vs Weight")
plt.xlabel("Weight (g)")
plt.ylabel("Avg Tumor Volume (mm3)")
plt.show()

Correlation and Regression
In [19]:

# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
x = avg_cap_data["Weight (g)"]
y = avg_cap_data["Avg Tumor Volume (mm3)"]
correlation = st.pearsonr(x, y)
correlation = round(correlation[0], 2)
print("The correlation between the weight and average tumor volume is " + str(correlation))

(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x, y)
linear_reg = x * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(x, y)
plt.plot(x, linear_reg, "r-")
plt.annotate(line_eq, (20, 25), fontsize = 14, color = "red")
plt.title("Avg Tumor Volume vs Weight")
plt.xlabel("Weight (g)")
plt.ylabel("Avg Tumor Volume (mm3)")

plt.show()

The correlation between the weight and average tumor volume is 0.85

In [20]:

plt.plot(single_mouse_data["Timepoint"], single_mouse_data["Tumor Volume (mm3)"], marker = "o")
plt.title("Mouse: M601's Tumor Volume Over Time")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

In [ ]: