## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two raw study CSV files (relative to the notebook)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Outer-join on "Mouse ID" so rows that appear in only one file are retained
merged_pd = mouse_metadata.merge(study_results, on="Mouse ID", how="outer")
merged_pd.head()

In [2]:
# Number of distinct Mouse ID values in the merged DataFrame.
# dropna=False keeps a missing ID counted as its own value, matching len(Series.unique()).
merged_pd["Mouse ID"].nunique(dropna=False)

In [3]:
# Get the Mouse IDs that have more than one row for the same Mouse ID / Timepoint pair.
# keep=False flags every row of a repeated pair (not only the second and later copies),
# so the mask selects all rows involved in a duplication.
duplicate_rows = merged_pd.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
merged_pd.loc[duplicate_rows, "Mouse ID"].unique()

In [4]:
# Optional: Get all the data for the duplicate mouse ID. 


In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.


In [6]:
# Checking the number of mice in the clean DataFrame.


## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
# Every statistic is taken from the same grouped Series and assembled in one step, so the
# result does not depend on merge suffixes ("_x"/"_y") or on a chain of renames.
Summary_df = merged_pd[["Drug Regimen", "Tumor Volume (mm3)"]].copy()
tumor_by_regimen = Summary_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

# Column order matches the previous table: Count, Sum, Median, Mean,
# Standard Deviation, Variance, SEM (indexed by "Drug Regimen").
FINAL_pd = pd.DataFrame(
    {
        "Count": tumor_by_regimen.count(),
        "Sum": tumor_by_regimen.sum(),
        "Median": tumor_by_regimen.median(),
        "Mean": tumor_by_regimen.mean(),
        "Standard Deviation": tumor_by_regimen.std(),
        "Variance": tumor_by_regimen.var(),
        "SEM": tumor_by_regimen.sem(),
    }
)
FINAL_pd

# This method builds one series per statistic and puts them all together at the end.

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen.
# describe() reports count/mean/std/min/quartiles/max but neither variance nor SEM,
# so the requested statistics are named explicitly in a single agg() call instead.
Summary_df = merged_pd[["Drug Regimen", "Tumor Volume (mm3)"]].copy()
single_df = Summary_df.groupby("Drug Regimen")
single_df.agg(["mean", "median", "var", "std", "sem"])
# This method produces everything in a single groupby function.

## Bar Plots

In [9]:
# Generate a bar plot showing the number of mice per time point for each treatment
# throughout the course of the study using pandas.
bar_df = merged_pd[["Drug Regimen", "Tumor Volume (mm3)", "Timepoint"]].copy()
# count() tallies the non-null tumor-volume records in each (regimen, timepoint) group;
# this is used as the number of mice measured at that time point.
bar_df = bar_df.groupby(["Drug Regimen", "Timepoint"]).count()

# legend=False: the automatic legend would label the counts "Tumor Volume (mm3)",
# which misdescribes what the bars show; the y-axis label carries the meaning instead.
ax = bar_df.plot(kind="bar", figsize=(20, 5), legend=False)
ax.set_title("Number of Mice per Timepoint for Each Drug Regimen")
ax.set_xlabel("Drug Regimen, Timepoint")
ax.set_ylabel("Number of Mice")
plt.show()

In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment
# throughout the course of the study using pyplot.
bar_df = merged_pd[["Drug Regimen", "Tumor Volume (mm3)", "Timepoint"]].copy()
# count() tallies the non-null tumor-volume records in each (regimen, timepoint) group.
bar_df = bar_df.groupby(["Drug Regimen", "Timepoint"]).count()

# Bar positions 0..n-1. A plain range is used because numpy is not imported in this
# notebook, so np.arange raised NameError on a fresh kernel.
x = list(range(len(bar_df)))

fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(x, bar_df["Tumor Volume (mm3)"], color="r", alpha=0.5, align="center")

# Label each bar with its (regimen, timepoint) group so the chart can be read on its own.
ax.set_xticks(x)
ax.set_xticklabels(
    [f"{regimen}, {timepoint}" for regimen, timepoint in bar_df.index],
    rotation=90,
)
ax.set_title("Number of Mice per Timepoint for Each Drug Regimen")
ax.set_xlabel("Drug Regimen, Timepoint")
ax.set_ylabel("Number of Mice")
plt.show()

## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas.
sex_pd = merged_pd[["Sex", "Mouse ID"]].copy()
# merged_pd has one row per measurement, so a plain row count would weight each mouse
# by how many timepoints it was recorded at. nunique() counts each mouse once.
sex_pd = sex_pd.groupby("Sex")[["Mouse ID"]].nunique()

ax = sex_pd.plot.pie(y="Mouse ID", figsize=(5, 5), autopct="%1.1f%%", legend=False)
ax.set_title("Distribution of Female versus Male Mice")
ax.set_ylabel("")  # drop the default "Mouse ID" axis label; the title describes the chart
plt.show()

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot.
sex_pd = merged_pd[["Sex", "Mouse ID"]].copy()
# merged_pd has one row per measurement, so a plain row count would weight each mouse
# by how many timepoints it was recorded at. nunique() counts each mouse once.
sex_pd = sex_pd.groupby("Sex")[["Mouse ID"]].nunique()

# Without labels the wedges cannot be told apart, so label them with the "Sex" values.
plt.pie(sex_pd["Mouse ID"], labels=sex_pd.index, autopct="%1.1f%%")
plt.title("Distribution of Female versus Male Mice")
plt.axis("equal")  # keep the pie circular
plt.show()

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin.
# NOTE(review): assumes "Capomulin" appears verbatim in the "Drug Regimen" column - confirm
# against the data.
capomulin_rows = merged_pd.loc[
    merged_pd["Drug Regimen"] == "Capomulin",
    ["Drug Regimen", "Mouse ID", "Timepoint", "Tumor Volume (mm3)"],
]
if capomulin_rows.empty:
    raise ValueError('No rows found with "Drug Regimen" equal to "Capomulin".')

# Use the first Capomulin mouse in the data as the example; any single mouse would do.
capomulin_mouse_id = capomulin_rows["Mouse ID"].iloc[0]

# Keep only that mouse's rows, ordered by time so the line is drawn left to right.
Line_pd = (
    capomulin_rows.loc[capomulin_rows["Mouse ID"] == capomulin_mouse_id]
    .sort_values("Timepoint")
    .copy()
)

fig, ax = plt.subplots()
ax.plot(Line_pd["Timepoint"], Line_pd["Tumor Volume (mm3)"], marker="o")
ax.set_title(f"Capomulin Treatment of Mouse {capomulin_mouse_id}")
ax.set_xlabel("Timepoint")
ax.set_ylabel("Tumor Volume (mm3)")
plt.show()

In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
