## Observations and Insights 

In [101]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)
# Combine the data into a single dataset
combined= mouse_metadata.merge(study_results, on="Mouse ID", how='left')
# Display the data table for preview
combined.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


# Checking the number of mice.
n1=New_df["Mouse ID"].count()
n1

In [73]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
mouse_metadata['Mouse ID'].value_counts()

a203    1
r811    1
a963    1
n630    1
a788    1
       ..
c832    1
s166    1
c895    1
v603    1
v923    1
Name: Mouse ID, Length: 249, dtype: int64

In [74]:
up_df=combined[combined.duplicated()]
up_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0


In [75]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_drug_data=mouse_metadata["Mouse ID"]
mouse_drug_data

0      k403
1      s185
2      x401
3      m601
4      g791
       ... 
244    z314
245    z435
246    z581
247    z795
248    z969
Name: Mouse ID, Length: 249, dtype: object

In [76]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_drug_data=mouse_metadata.drop_duplicates(subset='Mouse ID', keep= False)
mouse_drug_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [77]:
# Checking the number of mice in the clean DataFrame.
No_Mice_df=pd.DataFrame(mouse_drug_data,columns=["Mouse ID"])
No_Mice_df.head()

Unnamed: 0,Mouse ID
0,k403
1,s185
2,x401
3,m601
4,g791


## Summary Statistics

In [119]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
Data_Volume = combined[["Drug Regimen","Timepoint","Tumor Volume (mm3)"]]

#df_GrpDrugTimeVolMean.head()
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
groupDrugTimeVol = Data_Volume.groupby(["Drug Regimen","Timepoint"])

# mean, median, variance, standard deviation, and SEM of the tumor volume. 
df_GrpDrugTimeVolMean = groupDrugTimeVol.mean()
df_GrpDrugTimeVolError = groupDrugTimeVol.sem()
df_GrpDrugTimeVolStd = groupDrugTimeVol.std()
df_GrpDrugTimeVolvar = groupDrugTimeVol.var()
df_GrpDrugTimeVolMean = df_GrpDrugTimeVolMean.reset_index()
dfPV_VolMean = df_GrpDrugTimeVolMean.pivot(index='Timepoint', columns='Drug Regimen', values='Tumor Volume (mm3)')
#dfPV_VolError = df_GrpDrugTimeVolError.pivot(index='Timepoint', columns='Drug Regimen', values='Tumor Volume (mm3)')
dfPV_VolMean
# Assemble the resulting series into a single summary dataframe.




Drug Regimen,Capomulin,Ceftamin,Infubinol,Ketapril,Naftisol,Placebo,Propriva,Ramicane,Stelasyn,Zoniferol
Timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
5,44.266086,46.503051,46.541247,47.389175,46.796098,47.125589,47.248967,43.944859,47.47083,46.851818
10,43.084291,48.285125,49.403909,49.582269,48.69421,49.423329,49.101541,42.531957,49.335368,48.689881
15,42.064317,50.094055,51.296397,52.399974,50.933018,51.359742,51.067318,41.495061,51.448025,50.779059
20,40.716325,52.157049,53.197691,54.920935,53.644087,54.364417,53.346737,40.238325,53.97008,53.170334
25,39.939528,54.287674,55.715252,57.678982,56.731968,57.482574,55.504138,38.9743,56.172821,55.432935
30,38.769339,56.769517,58.299397,60.994507,59.559509,59.809063,58.196374,38.703137,59.870528,57.713531
35,37.816839,58.827548,60.742461,63.371686,62.685087,62.420615,60.350199,37.451996,62.432021,60.089372
40,36.958001,61.467895,63.162824,66.06858,65.600754,65.052675,63.045537,36.574081,65.356386,62.916692
45,36.236114,64.132421,65.755562,70.662958,69.265506,68.084082,66.258529,34.955595,68.43831,65.960888


In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
