## Observations and Insights 

In [724]:
# Select matplotlib's interactive "notebook" backend for the figures drawn in the cells below.
%matplotlib notebook

In [765]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset.
# The join key and join type are spelled out instead of relying on pandas'
# "all shared column names" default, so an extra shared column added to either
# file later cannot silently change which rows match.
# NOTE(review): assumes "Mouse ID" is the only column the two files share - confirm.
pyma_df_raw = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="inner")

# Display the data table for preview
pyma_df_raw.head(20)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [726]:
# Count the distinct mouse IDs in the merged, not-yet-cleaned data.
# dropna=False keeps a missing ID counted as its own value.
mice = pyma_df_raw['Mouse ID'].nunique(dropna=False)
mice

249

In [727]:
# Flag the rows whose (Mouse ID, Timepoint) pair appears more than once.
# keep=False marks every copy of a repeated pair, not just the later ones.
duplicates = pyma_df_raw[['Mouse ID', 'Timepoint']].duplicated(keep=False)

# Store the flag on the raw frame; the following cells read this column.
pyma_df_raw['Duplicates'] = duplicates
pyma_df_raw

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Duplicates
0,k403,Ramicane,Male,21,16,0,45.000000,0,False
1,k403,Ramicane,Male,21,16,5,38.825898,0,False
2,k403,Ramicane,Male,21,16,10,35.014271,1,False
3,k403,Ramicane,Male,21,16,15,34.223992,1,False
4,k403,Ramicane,Male,21,16,20,32.997729,1,False
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,False
1889,z969,Naftisol,Male,9,30,30,65.841013,3,False
1890,z969,Naftisol,Male,9,30,35,69.176246,4,False
1891,z969,Naftisol,Male,9,30,40,70.314904,4,False


In [728]:
# Optional: Get all the data for the duplicate mouse ID.
# 'Duplicates' is already boolean, so it can be used directly as the row mask.
pyma_df_duplicates = pyma_df_raw[pyma_df_raw['Duplicates']]

# Every mouse ID that owns at least one duplicated (Mouse ID, Timepoint) row.
duplicate_ID = pyma_df_duplicates['Mouse ID'].unique()

# Select all rows for those mice using the IDs found above rather than a
# hard-coded ID, so the cell stays correct if the data (or the number of
# duplicated mice) changes.
pyma_df_dup_ID = pyma_df_raw[pyma_df_raw['Mouse ID'].isin(duplicate_ID)]
pyma_df_dup_ID

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Duplicates
908,g989,Propriva,Female,21,26,0,45.0,0,True
909,g989,Propriva,Female,21,26,0,45.0,0,True
910,g989,Propriva,Female,21,26,5,48.786801,0,True
911,g989,Propriva,Female,21,26,5,47.570392,0,True
912,g989,Propriva,Female,21,26,10,51.745156,0,True
913,g989,Propriva,Female,21,26,10,49.880528,0,True
914,g989,Propriva,Female,21,26,15,51.325852,1,True
915,g989,Propriva,Female,21,26,15,53.44202,0,True
916,g989,Propriva,Female,21,26,20,55.326122,1,True
917,g989,Propriva,Female,21,26,20,54.65765,1,True


In [729]:
# Build the cleaned data set: remove every row that belongs to a duplicated mouse.
# Indexing by 'Mouse ID' lets the unwanted mice be dropped by label.
pyma_df_index = pyma_df_raw.set_index('Mouse ID')
pyma_df_clean = pyma_df_index.drop(labels=duplicate_ID, axis='index')

# Move 'Mouse ID' back to a regular column with a fresh 0..n-1 row index.
pyma_df = pyma_df_clean.reset_index(drop=False)
pyma_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,Duplicates
0,k403,Ramicane,Male,21,16,0,45.000000,0,False
1,k403,Ramicane,Male,21,16,5,38.825898,0,False
2,k403,Ramicane,Male,21,16,10,35.014271,1,False
3,k403,Ramicane,Male,21,16,15,34.223992,1,False
4,k403,Ramicane,Male,21,16,20,32.997729,1,False
...,...,...,...,...,...,...,...,...,...
1875,z969,Naftisol,Male,9,30,25,63.145652,2,False
1876,z969,Naftisol,Male,9,30,30,65.841013,3,False
1877,z969,Naftisol,Male,9,30,35,69.176246,4,False
1878,z969,Naftisol,Male,9,30,40,70.314904,4,False


In [730]:
# Count the distinct mouse IDs that remain after cleaning.
# dropna=False keeps a missing ID counted as its own value.
mice_re = pyma_df['Mouse ID'].nunique(dropna=False)
mice_re

248

## Summary Statistics

In [731]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation and SEM.

# Group once by regimen; each statistic below is a Series indexed by 'Drug Regimen'.
pyma_gb = pyma_df.groupby('Drug Regimen')
tumor_mean = pyma_gb['Tumor Volume (mm3)'].mean()
tumor_median = pyma_gb['Tumor Volume (mm3)'].median()
tumor_var = pyma_gb['Tumor Volume (mm3)'].var()
tumor_std = pyma_gb['Tumor Volume (mm3)'].std()
tumor_sem = pyma_gb['Tumor Volume (mm3)'].sem()

# Assemble the five Series (all sharing the same regimen index) into one table,
# one labelled column per statistic.
pyma_sum_df = pd.DataFrame({
    'Mean': tumor_mean,
    'Median': tumor_median,
    'Variance': tumor_var,
    'Standard Deviation': tumor_std,
    'SEM': tumor_sem,
})
pyma_sum_df

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [732]:
# Same summary statistics table as above, produced with a single aggregation call.

# Aggregation name -> column label shown in the table (order defines column order).
stat_column_names = {
    'mean': 'Mean',
    'median': 'Median',
    'var': 'Variance',
    'std': 'Standard Deviation',
    'sem': 'SEM',
}
pyma_agg_df = (
    pyma_gb
    .agg({'Tumor Volume (mm3)': list(stat_column_names)})
    .rename(columns=stat_column_names)
)
pyma_agg_df

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [733]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.
pyma_bar_count = pyma_gb['Mouse ID'].nunique()

# Draw on a dedicated figure/axes so the chart does not depend on whichever
# figure happens to be current when this cell runs.
fig, ax = plt.subplots()
pyma_bar_count.plot(kind='bar', title='Mice per Drug Regimen', ax=ax)
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Count of Mice")

# Adjust the layout before showing the figure so the rotated tick labels fit.
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [734]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.

# Unique mice per regimen; groupby returns the regimens in sorted order.
y_axis = pyma_gb['Mouse ID'].nunique()

# Take the tick labels from the SAME index as the bar heights. Using
# pyma_df['Drug Regimen'].unique() here would list regimens in order of first
# appearance, which does not match the groupby order and mislabels the bars.
drugs = y_axis.index.to_numpy()
x_axis = np.arange(len(drugs))

# Start a new figure so the bars are not drawn onto a previously active one.
plt.figure()
plt.bar(x_axis, y_axis, color='r', alpha=0.5, align="center")
plt.xticks(x_axis, drugs, rotation="vertical")
plt.title("Mice per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Count of Mice")

# Adjust the layout before showing the figure so the vertical labels fit.
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [870]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Distinct values per column for each sex; the 'Mouse ID' column is the number
# of unique mice. nunique() takes no column list (its only argument is the
# boolean `dropna`), so the column is selected when plotting instead.
values_df = pyma_df.groupby('Sex').nunique()

# Draw on a dedicated figure/axes so the chart does not depend on whichever
# figure happens to be current when this cell runs.
fig, ax = plt.subplots()
values_df.plot(kind='pie', y='Mouse ID', autopct="%1.1f%%", ax=ax)
ax.set_title("Mice by Sex")
plt.show()

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Mice by Sex')

In [736]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Unique mice per sex. nunique() takes no column list (its only argument is
# the boolean `dropna`), so the 'Mouse ID' column is selected afterwards.
values_df = pyma_df.groupby('Sex').nunique()
values = values_df['Mouse ID']

# Take the wedge labels from the SAME index as the counts. groupby sorts the
# sexes, whereas pyma_df['Sex'].unique() lists them in order of first
# appearance, which attaches each label to the other sex's count.
labels = values.index.to_numpy()

# Pick each wedge's colour by its label so colour and label cannot drift apart.
# NOTE(review): presumably blue was meant for Male and pink for Female - confirm.
sex_colors = {"Male": "lightskyblue", "Female": "pink"}
colors = [sex_colors.get(sex, "lightgray") for sex in labels]

# Start a new figure so the pie is not drawn onto a previously active one.
plt.figure()
plt.pie(values, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True, startangle=90)
plt.title("Mice by Sex")
plt.show()

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Mice by Sex')

## Quartiles, Outliers and Boxplots

In [813]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse.
# Only the two columns that are kept are aggregated, instead of taking the
# maximum of every column (strings and flags included) and discarding the rest.
id_gb = pyma_df.groupby('Mouse ID')[['Timepoint', 'Drug Regimen']].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# Columns present on both sides come back suffixed: *_x from pyma_df, *_y from id_gb.
pyma_merge_df = pd.merge(pyma_df, id_gb, on='Mouse ID')

# Keep only the row of each mouse recorded at its last timepoint.
pyma_time_df = pyma_merge_df[pyma_merge_df['Timepoint_x'] == pyma_merge_df['Timepoint_y']]
pyma_rename_df = pyma_time_df[['Mouse ID', 'Drug Regimen_x', 'Tumor Volume (mm3)', 'Timepoint_x', 'Timepoint_y']]

# Restore the plain column names. 'Tumor Volume (mm3)' exists on one side of
# the merge only, so it is never suffixed and needs no entry here.
pyma_maxtime_df = pyma_rename_df.rename(columns={'Drug Regimen_x': 'Drug Regimen',
                                                 'Timepoint_x': 'Timepoint'})
pyma_maxtime_df.head()

   Mouse ID Drug Regimen  Tumor Volume (mm3)  Timepoint  Timepoint_y
9      k403     Ramicane           22.050126         45           45
19     s185    Capomulin           23.343598         45           45
29     x401    Capomulin           28.484033         45           45
39     m601    Capomulin           28.430964         45           45
49     g791     Ramicane           29.128472         45           45


In [865]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Per-regimen results, filled in the same order as `treatments`.
tumors = []        # final tumor volumes of each regimen (for plotting)
lowerqs = []
upperqs = []
iqrs = []
median = []
lower_bounds = []
upper_bounds = []
outliers = []      # Mouse IDs whose final volume falls outside the bounds

# Calculate the IQR and quantitatively determine if there are any potential outliers.

# Locate the rows which contain mice on each drug and get the tumor volumes
pyma_index_df = pyma_maxtime_df.set_index('Drug Regimen')
pyma_treatment_df = pyma_index_df.loc[treatments]

for drug in treatments:
    # A list selector always yields a DataFrame, even if a regimen has a
    # single mouse (a scalar label would collapse that case to a Series).
    pyma_test_df = pyma_treatment_df.loc[[drug]]
    drug_volumes = pyma_test_df['Tumor Volume (mm3)']
    tumors.append(drug_volumes)

    drug_quartiles = drug_volumes.quantile([.25, .5, .75])
    lowerq = drug_quartiles[0.25]
    upperq = drug_quartiles[0.75]
    iqr = upperq - lowerq

    # Values more than 1.5 * IQR outside the quartiles are treated as potential outliers.
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    lowerqs.append(lowerq)
    upperqs.append(upperq)
    iqrs.append(iqr)
    median.append(drug_quartiles[0.5])
    lower_bounds.append(lower_bound)
    upper_bounds.append(upper_bound)

    print(" ")
    print(f"The lower quartile of the final Tumor Volume for {drug} is: {round(lowerq,2)} mm3")
    print(f"The upper quartile of the final Tumor Volume for {drug} is: {round(upperq,2)} mm3")
    print(f"The interquartile range of the final Tumor Volume for {drug} is: {round(iqr,2)} mm3")
    print(f"The median of the final Tumor Volume for {drug} is: {round(drug_quartiles[0.5],2)} mm3")
    print(" ")

    # Collect the mice of this regimen that fall outside the bounds.
    pyma_loc_df = pyma_test_df.loc[(drug_volumes < lower_bound) | (drug_volumes > upper_bound)]
    outliers.extend(pyma_loc_df['Mouse ID'].tolist())

# One row per regimen with its quartiles, IQR, median and outlier bounds.
quar_out_df = pd.DataFrame({"Drug Regimen":treatments, "Lower Quartiles":lowerqs,
                            "Upper Quartiles":upperqs, "Inter Quartiles":iqrs,
                            "Median":median, "Lower Bounds":lower_bounds,
                            "Upper Bounds":upper_bounds})
print(f"The following Mice are outliers: {outliers}")
quar_out_df

 
The lower quartile of the final Tumor Volume for Capomulin is: 32.38 mm3
The upper quartile of the final Tumor Volume for Capomulin is: 40.16 mm3
The interquartile range of the final Tumor Volume for Capomulin is: 7.78 mm3
The the median of the final Tumor Volume for Capomulin is: 38.13 mm3
 
 
The lower quartile of the final Tumor Volume for Ramicane is: 31.56 mm3
The upper quartile of the final Tumor Volume for Ramicane is: 40.66 mm3
The interquartile range of the final Tumor Volume for Ramicane is: 9.1 mm3
The the median of the final Tumor Volume for Ramicane is: 36.56 mm3
 
 
The lower quartile of the final Tumor Volume for Infubinol is: 54.05 mm3
The upper quartile of the final Tumor Volume for Infubinol is: 65.53 mm3
The interquartile range of the final Tumor Volume for Infubinol is: 11.48 mm3
The the median of the final Tumor Volume for Infubinol is: 60.17 mm3
 
 
The lower quartile of the final Tumor Volume for Ceftamin is: 48.72 mm3
The upper quartile of the final Tumor Volu

In [871]:
# Box plot of the final tumor volume of each mouse, one box per regimen of interest.

# Turn 'Drug Regimen' back into a column and keep just the two columns the plot needs.
volume_df = pyma_treatment_df.reset_index()
volume = volume_df.loc[:, ['Drug Regimen', 'Tumor Volume (mm3)']]

# Outlier points are drawn as large red circles so they stand out.
boxplot = volume.boxplot(
    by='Drug Regimen',
    flierprops=dict(marker='o', markersize=10, markerfacecolor='red'),
)
plt.title("for Tumor Volume (mm3)")
plt.xlabel("Drug Regimen")
plt.ylabel("Tumor Volume (mm3)")
plt.show()
print("Observation 1: Ceftamin and Infubinol were not as successful as Ramicane and Capomulin at controlling Tumor Volume")

<IPython.core.display.Javascript object>

Observation 1: Ceftamin and Infubinol were not as successful as Ramicane and Capomulin at controlling Tumor Volume


## Line and Scatter Plots

In [872]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# Set x axis and variables
s185_df = pyma_df.set_index('Mouse ID').loc['s185']
time = s185_df['Timepoint']
volume = s185_df['Tumor Volume (mm3)']

# Draw on a dedicated figure/axes so the line is not added to whichever figure
# happens to be current when this cell runs.
fig, ax = plt.subplots()

# The line handle gets its own name so it does not overwrite `mice`, the
# mouse count computed near the top of the notebook.
s185_line, = ax.plot(time, volume, marker="+", color="blue", linewidth=1, label="s185")
ax.set_title("Tumor Volume vs Time for s185 on Capomulin")
ax.set_xlabel("Timepoint")
ax.set_ylabel("Tumor Volume (mm3)")
plt.show()
print("Observation 2: Capomulin was very successful at reducing Tumor Volume for Mouse s185")

<IPython.core.display.Javascript object>

Observation 2: Capomulin was wevry succesful at reducing Tumor Volume for Mouse s185


In [760]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Per-mouse averages for the Capomulin rows.
# numeric_only=True restricts the mean to numeric columns; without it, recent
# pandas versions raise a TypeError on text columns such as 'Sex'.
cap_df = (
    pyma_df
    .set_index('Drug Regimen')
    .loc['Capomulin']
    .groupby('Mouse ID')
    .mean(numeric_only=True)
)
weight = cap_df['Weight (g)']
tumor_volume = cap_df['Tumor Volume (mm3)']

# Draw on a dedicated figure/axes so the points are not added to whichever
# figure happens to be current when this cell runs.
# NOTE(review): the title reads "tumor volume vs weight" but tumor volume is on
# the x axis; confirm which orientation is intended.
fig, ax = plt.subplots()
ax.scatter(tumor_volume, weight, marker="o", facecolors="red", edgecolors="black", alpha=0.75)
ax.set_title("Average Tumor Volume vs Mouse Weight")
ax.set_xlabel("Tumor Volume (mm3)")
ax.set_ylabel("Weight (g)")
plt.show()

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Weight (g)')

## Correlation and Regression

In [763]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# Per-mouse averages for the Capomulin rows.
# numeric_only=True restricts the mean to numeric columns; without it, recent
# pandas versions raise a TypeError on text columns such as 'Sex'.
cap_df = (
    pyma_df
    .set_index('Drug Regimen')
    .loc['Capomulin']
    .groupby('Mouse ID')
    .mean(numeric_only=True)
)
weight = cap_df['Weight (g)']
tumor_volume = cap_df['Tumor Volume (mm3)']

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
correlation = st.pearsonr(tumor_volume, weight)
print(f"The correlation between both factors is {round(correlation[0],2)}")

The correlation between both factors is 0.84


In [873]:
# Linear regression of mouse weight on average tumor volume (Capomulin regimen).
# `tumor_volume` and `weight` are the per-mouse averages computed in the previous cell.
x_values = tumor_volume
y_values = weight
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Format the sign separately so a negative intercept reads "- 1.23" instead of "+ -1.23".
intercept_sign = "-" if intercept < 0 else "+"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

# Draw on a dedicated figure/axes so the fit is not added to whichever figure
# happens to be current when this cell runs.
fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
ax.annotate(line_eq, (40, 18), fontsize=15, color="red")
ax.set_xlabel("Tumor Volume (mm3)")
ax.set_ylabel("Weight (g)")
print(f"The r-squared is: {rvalue**2}")
plt.show()
print("Observation 3: There is a strong correlation between Mouse Weight and Tumor Volume")

<IPython.core.display.Javascript object>

The r-squared is: 0.7088568047708719
Observation 3: There is a strong correlation between Mouse Weight and Tumor Volume
