## Observations and Insights 

In [24]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np 

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mice_data_df=pd.merge(mouse_metadata, study_results, on='Mouse ID', how='outer')

# Display the data table for preview
mice_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [25]:
# Checking the number of mice.
number_of_mice = len(mice_data_df)
number_of_mice

1893

In [26]:
number_of_mice = mice_data_df['Mouse ID'].unique()
len(number_of_mice)

249

In [27]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
group_mice = mice_data_df.groupby('Drug Regimen')
duplicate_mice_df = group_mice[['Timepoint']].count()
duplicate_mice_df

Unnamed: 0_level_0,Timepoint
Drug Regimen,Unnamed: 1_level_1
Capomulin,230
Ceftamin,178
Infubinol,178
Ketapril,188
Naftisol,186
Placebo,181
Propriva,161
Ramicane,228
Stelasyn,181
Zoniferol,182


In [28]:
# Optional: Get all the data for the duplicate mouse ID. 



In [49]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.  
mice_data_df.sort_values("Mouse ID", inplace = True) 
clean_data_df = mice_data_df.drop_duplicates(subset = 'Mouse ID', keep = 'first') 
clean_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
460,a203,Infubinol,Female,20,23,30,59.523197,1
473,a251,Infubinol,Female,21,25,45,65.525743,1
474,a262,Placebo,Female,17,29,0,45.000000,0
493,a275,Ceftamin,Female,20,28,45,62.999356,3
498,a366,Stelasyn,Female,16,29,20,57.285987,0
...,...,...,...,...,...,...,...,...
1860,z435,Propriva,Female,12,26,0,45.000000,0
77,z578,Ramicane,Male,11,16,10,42.952118,0
1867,z581,Infubinol,Female,24,25,20,51.807944,2
1874,z795,Naftisol,Female,13,29,5,46.833475,0


In [51]:
# Checking the number of mice in the clean DataFrame.
len(clean_data_df)

249

## Summary Statistics

In [83]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
tumor_volume = clean_data_df[['Tumor Volume (mm3)','Drug Regimen']]
regimen_df = tumor_volume.groupby('Drug Regimen')
tumor_mean['Tumor Mean'] = regimen_df.mean()
tumor_median['Tumor Median'] = regimen_df.median()
tumor_var['Tumor Variance'] = regimen_df.var()
tumor_std['Tumor Standard Deviation'] = regimen_df.std()
tumor_sem['Tumor SEM'] = regimen_df.sem()
sum_data_df = pd.merge(tumor_mean, tumor_median, on='Drug Regimen', how='outer')
sum_data_df = sum_data_df.drop(['Tumor Volume (mm3)_x'], axis=1)
sum_data_df = sum_data_df.drop(['Tumor Volume (mm3)_y'], axis=1)
sum_data_df = pd.merge(sum_data_df, tumor_var, on='Drug Regimen', how='outer')
sum_data_df = pd.merge(sum_data_df, tumor_std, on='Drug Regimen', how='outer')
sum_data_df = pd.merge(sum_data_df, tumor_sem, on='Drug Regimen', how='outer')
sum_data_df = sum_data_df.drop(['Tumor Volume (mm3)_x'], axis=1)
sum_data_df = sum_data_df.drop(['Tumor Volume (mm3)_y'], axis=1)
sum_data_df = sum_data_df.drop(['Tumor Volume (mm3)'], axis=1)
sum_data_df
#Calculating the Mean 

# This method is the most straighforward, creating multiple series and putting them all together at the end.



Unnamed: 0_level_0,Tumor Mean,Tumor Median,Tumor Variance,Tumor Standard Deviation,Tumor SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,39.867724,40.658124,42.415818,6.512743,1.302549
Ceftamin,51.333101,49.68362,34.839795,5.902524,1.180505
Infubinol,51.751417,51.807944,45.073341,6.713668,1.342734
Ketapril,52.81196,51.236606,52.831521,7.268529,1.453706
Naftisol,50.708768,47.067744,55.540123,7.452525,1.490505
Placebo,50.229134,46.220796,50.049765,7.074586,1.414917
Propriva,51.123609,49.145709,54.185608,7.361087,1.472217
Ramicane,41.261614,43.339161,20.015306,4.473847,0.894769
Stelasyn,51.877632,50.553347,41.841103,6.46847,1.320371
Zoniferol,48.40866,46.818176,22.185043,4.7101,0.94202


In [88]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
