## Observations and Insights 

In [14]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

#Combine the data into a single dataset

merged_data = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="left")

merged_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [2]:
# Checking the number of mice in the DataFrame.
merged_data.count()

Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [18]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
merged_data_cleaned = merged_data.drop_duplicates(subset = ["Mouse ID","Timepoint"],keep="first")
merged_data_cleaned.count()

Mouse ID              1888
Drug Regimen          1888
Sex                   1888
Age_months            1888
Weight (g)            1888
Timepoint             1888
Tumor Volume (mm3)    1888
Metastatic Sites      1888
dtype: int64

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [19]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
merged_data_cleaned.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [20]:
# Checking the number of mice in the clean DataFrame.
merged_data_cleaned.count()

Mouse ID              1888
Drug Regimen          1888
Sex                   1888
Age_months            1888
Weight (g)            1888
Timepoint             1888
Tumor Volume (mm3)    1888
Metastatic Sites      1888
dtype: int64

## Summary Statistics

In [43]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
#merged_data_cleaned["Drug Regimen"].unique()
#['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin','Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']

#ramicane mean summary
ramicane_summary = merged_data_cleaned.loc[merged_data_cleaned["Drug Regimen"] == "Ramicane"]
ramicane_mean_tuvol = ramicane_summary["Tumor Volume (mm3)"].mean()

#ramicane median summary
ramicane_summary = merged_data_cleaned.loc[merged_data_cleaned["Drug Regimen"] == "Ramicane"]
ramicane_median_tuvol = ramicane_summary["Tumor Volume (mm3)"].median()

#ramicane variance summary
ramicane_summary = merged_data_cleaned.loc[merged_data_cleaned["Drug Regimen"] == "Ramicane"]
ramicane_var_tuvol = ramicane_summary["Tumor Volume (mm3)"].var()

#ramicane std summary
ramicane_summary = merged_data_cleaned.loc[merged_data_cleaned["Drug Regimen"] == "Ramicane"]
ramicane_std_tuvol = ramicane_summary["Tumor Volume (mm3)"].std()

#ramicane sem summary
ramicane_summary = merged_data_cleaned.loc[merged_data_cleaned["Drug Regimen"] == "Ramicane"]
ramicane_sem_tuvol = ramicane_summary["Tumor Volume (mm3)"].sem()

#ramicane tumor volume summary dataframe
ramicane_summary_tuvol_df = pd.DataFrame([{"Drug Regimen" : "Ramicane"
                                          ,"Mean" : ramicane_mean_tuvol
                                         ,"Median" : ramicane_median_tuvol
                                         ,"Variance" : ramicane_var_tuvol
                                         ,"Standard Deviation" : ramicane_std_tuvol
                                          ,"SEM" : ramicane_sem_tuvol}])
ramicane_summary_tuvol_df
#need to do dataframe summaries for other drug regimens
# This method is the most straightforward, creating multiple series and putting them all together at the end.

Unnamed: 0,Drug Regimen,Mean,Median,Variance,Standard Deviation,SEM
0,Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.


## Bar Plots

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [None]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
