# Observations and Insights

## Dependencies and starter code

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st # Used for SEM Calcs(?)
import numpy as np

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)


In [2]:
# Data check:  mouse_metadata
mouse_metadata.head()
#mouse_metadata.describe()
#mouse_metadata.dtypes
#mouse_metadata.count

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [3]:
# Data check:  study_results
study_results.head()
#study_results.describe()
#study_results.dtypes
#study_results.count

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [4]:
# Combine the data into a single dataset
study_results_complete = pd.merge(mouse_metadata, study_results, how="left", on=["Mouse ID", "Mouse ID"])

In [5]:
# Data check:  study_results_complete
study_results_complete.head()
#study_results_complete.describe()
#study_results_complete.dtypes
#study_results_complete.count
#study_results.groupby('Mouse ID')['Timepoint'].count()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


#### Note:  Look across all figures and tables below and write at least three observations or inferences that can be made from the data.  Include them at the top of this notebook.

#### Hints and Considerations
* Use proper labeling of your plots, to include properties such as: plot titles, axis labels, legend labels, x-axis and y-axis limits, etc.

## Summary statistics
#### Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

In [6]:
study_results_complete['Drug Regimen'].unique()

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [7]:
tumor_volume = study_results_complete['Tumor Volume (mm3)']

In [8]:
mean_tumor_volume = np.mean(tumor_volume)
print(f"The total mean tumor volume is {mean_tumor_volume}")

The total mean tumor volume is 50.448380631336505


In [13]:
study_results_complete.groupby(['Drug Regimen','Tumor Volume (mm3)']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Weight (g),Timepoint,Metastatic Sites
Drug Regimen,Tumor Volume (mm3),Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,23.343598,3.0,17.0,45.0,1.0
Capomulin,25.472143,3.0,17.0,40.0,1.0
Capomulin,28.167397,16.0,15.0,40.0,0.0
Capomulin,28.328531,3.0,17.0,35.0,1.0
Capomulin,28.430964,22.0,17.0,45.0,1.0
...,...,...,...,...,...
Zoniferol,68.498639,12.0,25.0,40.0,2.0
Zoniferol,68.611061,2.0,28.0,45.0,3.0
Zoniferol,70.827796,12.0,25.0,45.0,2.0
Zoniferol,71.108118,20.0,26.0,40.0,1.0


# Create BINs!!!

In [9]:
tumor_volume_ramicane = study_results_complete['Tumor Volume (mm3)'][study_results_complete['Drug Regimen'] =='Ramicane']

In [10]:
mean_tumor_volume_ramicane = np.mean(tumor_volume)
print(f"The total mean tumor volume for Ramicane is {mean_tumor_volume_ramicane}")

The total mean tumor volume for Ramicane is 50.448380631336505


In [12]:
ramicane_tu_vol = avg(study_results_complete['Mouse ID'][study_results_complete['Drug Regimen'] =='Ramicane'])
ramicane_tu_vol

NameError: name 'avg' is not defined

In [None]:
# Isolate data for each regimen
ramicane_tu_vol_mean = np.mean(ramicane_tu_vol['Tumor Volume (mm3)'].apply(float))

In [None]:
# Mean of the tumor volume for each regimen
ramicane = study_results_complete['Drug Regimen']('Ramicane')
ramicane_mean = np.mean(ramicane)

In [None]:
# Median of the tumor volume for each regimen

In [None]:
# Variance of the tumor volume for each regimen

In [None]:
# standard deviation of the tumor volume for each regimen

In [None]:
# Standard Error of Mean (SEM) of the tumor volume for each regimen
# See matplotlib_03_04-Ins_Standard_Error

## Bar plots
*The following plots should look identical*

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen...
# Using pandas 'DataFrame.plot()'

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen...
# Using pyplot

## Pie plots
*The following plots should look identical*

In [None]:
# Generate a pie plot showing the distribution of female versus male mice...
# Using pandas 'DataFrame.plot()'

In [None]:
# Generate a pie plot showing the distribution of female versus male mice...
# Using pyplot

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

In [None]:
# Calculate the quartiles and IQR and quantitatively determine if there are any potential outliers across
# all four treatment regimens

In [None]:
# Using Matplotlib, generate a box and whisker plot of the final tumor volume of each mouse across four
# treatment regimens of interest and highlight any potential outliers in the plot by changing their color and style

## Hint: All four box plots should be within the same figure. Use this Matplotlib documentation page for help
## with changing the style of the outliers.
# https://matplotlib.org/gallery/pyplots/boxplot_demo_pyplot.html#sphx-glr-gallery-pyplots-boxplot-demo-pyplot-py

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a single mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin treatment regimen

In [None]:
# Calculate the correlation coefficient and linear regression model between mouse weight and average tumor volume
# for the Capomulin treatment. Plot the linear regression model on top of the previous scatter plot