## Observations and Insights 

In [118]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import sem
# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merged_df = pd.DataFrame.merge(mouse_metadata, study_results, how="outer", on="Mouse ID")

# Display the data table for preview
merged_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [119]:
#why are these considered duplicates when tumor volume is different?
merged_df["duplicate"]=merged_df["Mouse ID"]=="g989"

merged_df.iloc[900:930, ]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
900,g570,Propriva,Male,16,29,30,60.328717,1,False
901,g570,Propriva,Male,16,29,35,62.33042,2,False
902,g570,Propriva,Male,16,29,40,68.525856,2,False
903,g570,Propriva,Male,16,29,45,70.492788,2,False
904,g867,Stelasyn,Female,9,25,0,45.0,0,False
905,g867,Stelasyn,Female,9,25,5,46.880749,0,False
906,g867,Stelasyn,Female,9,25,10,48.183402,0,False
907,g867,Stelasyn,Female,9,25,15,51.060419,1,False
908,g989,Propriva,Female,21,26,0,45.0,0,True
909,g989,Propriva,Female,21,26,0,45.0,0,True


In [120]:
# Checking the number of mice.
len(merged_df)

1893

In [121]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice = merged_df[merged_df.duplicated(['Mouse ID', 'Timepoint'])]
duplicate_mice

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
909,g989,Propriva,Female,21,26,0,45.0,0,True
911,g989,Propriva,Female,21,26,5,47.570392,0,True
913,g989,Propriva,Female,21,26,10,49.880528,0,True
915,g989,Propriva,Female,21,26,15,53.44202,0,True
917,g989,Propriva,Female,21,26,20,54.65765,1,True


In [122]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df=merged_df.drop_duplicates(['Mouse ID', 'Timepoint'])

In [123]:
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
0,k403,Ramicane,Male,21,16,0,45.000000,0,False
1,k403,Ramicane,Male,21,16,5,38.825898,0,False
2,k403,Ramicane,Male,21,16,10,35.014271,1,False
3,k403,Ramicane,Male,21,16,15,34.223992,1,False
4,k403,Ramicane,Male,21,16,20,32.997729,1,False
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,False
1889,z969,Naftisol,Male,9,30,30,65.841013,3,False
1890,z969,Naftisol,Male,9,30,35,69.176246,4,False
1891,z969,Naftisol,Male,9,30,40,70.314904,4,False


In [124]:
# Checking the number of mice in the clean DataFrame.
len(clean_df)

1888

In [125]:
# number of mice deleted
len(merged_df)-len(clean_df)

5

## Summary Statistics

In [126]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
tumor_stat=clean_df[["Drug Regimen", "Tumor Volume (mm3)"]]
tumor_stat

Unnamed: 0,Drug Regimen,Tumor Volume (mm3)
0,Ramicane,45.000000
1,Ramicane,38.825898
2,Ramicane,35.014271
3,Ramicane,34.223992
4,Ramicane,32.997729
...,...,...
1888,Naftisol,63.145652
1889,Naftisol,65.841013
1890,Naftisol,69.176246
1891,Naftisol,70.314904


In [151]:
tumor_stat_mean=tumor_stat.groupby("Drug Regimen").mean()
print(tumor_stat_mean)
tumor_stat_median=tumor_stat.groupby("Drug Regimen").median()
print(tumor_stat_median)
tumor_stat_var=tumor_stat.groupby("Drug Regimen").var()
tumor_stat_std=tumor_stat.groupby("Drug Regimen").std()

              Tumor Volume (mm3)
Drug Regimen                    
Capomulin              40.675741
Ceftamin               52.591172
Infubinol              52.884795
Ketapril               55.235638
Naftisol               54.331565
Placebo                54.033581
Propriva               52.393463
Ramicane               40.216745
Stelasyn               54.233149
Zoniferol              53.236507
              Tumor Volume (mm3)
Drug Regimen                    
Capomulin              41.557809
Ceftamin               51.776157
Infubinol              51.820584
Ketapril               53.698743
Naftisol               52.509285
Placebo                52.288934
Propriva               50.909965
Ramicane               40.673236
Stelasyn               52.431737
Zoniferol              51.818479


In [145]:
tumor_stat_sem=tumor_stat.sem(axis=0)

In [157]:
tumor_stat_summ=pd.DataFrame(index=tumor_stat["Drug Regimen"].unique())
tumor_stat_summ= {"Mean":[tumor_stat_mean],"Median":[tumor_stat_median],"Variance":[tumor_stat_var],
                  "Standard Deviation":[tumor_stat_std]}
tumor_stat_summ

{'Mean': [              Tumor Volume (mm3)
  Drug Regimen                    
  Capomulin              40.675741
  Ceftamin               52.591172
  Infubinol              52.884795
  Ketapril               55.235638
  Naftisol               54.331565
  Placebo                54.033581
  Propriva               52.393463
  Ramicane               40.216745
  Stelasyn               54.233149
  Zoniferol              53.236507],
 'Median': [              Tumor Volume (mm3)
  Drug Regimen                    
  Capomulin              41.557809
  Ceftamin               51.776157
  Infubinol              51.820584
  Ketapril               53.698743
  Naftisol               52.509285
  Placebo                52.288934
  Propriva               50.909965
  Ramicane               40.673236
  Stelasyn               52.431737
  Zoniferol              51.818479],
 'Variance': [              Tumor Volume (mm3)
  Drug Regimen                    
  Capomulin              24.947764
  Ceftamin           

In [136]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
