## Observations and Insights

## Dependencies and starter code

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Load the two study CSV files directly into DataFrames:
# one holding the mouse metadata, the other the study results.
mouse_metadata = pd.read_csv("data/Mouse_metadata.csv")
study_results = pd.read_csv("data/Study_results.csv")


In [None]:
# Preview the leading rows of mouse_metadata to sanity-check the loaded columns
mouse_metadata.head()

In [None]:
# Preview the leading rows of study_results to sanity-check the loaded columns
study_results.head()

In [None]:
# Join the mouse metadata with the study results, matching rows on the
# shared "Mouse ID" column, then preview the merged table.
singledata = mouse_metadata.merge(study_results, on="Mouse ID")
singledata.head()

## Summary statistics

In [2]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen

In [None]:
# Mean tumor volume per drug regimen
#
# The tumor-volume column is selected BEFORE aggregating so that only that
# column is averaged. Aggregating the whole grouped frame first would also try
# to average every other column, which raises TypeError on pandas >= 2.0 if any
# of them is non-numeric.
mean = (
    singledata.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .mean()
    # Turn the "Drug Regimen" index back into a regular column (Series -> DataFrame)
    .reset_index()
    # Rename so the column stays distinguishable once the statistics are merged
    .rename(columns={"Tumor Volume (mm3)": "Tumor Volume (Mean)"})
)

mean

In [None]:
# Median tumor volume per drug regimen
#
# The tumor-volume column is selected BEFORE aggregating so that only that
# column is reduced. Aggregating the whole grouped frame first would also try
# to take the median of every other column, which raises TypeError on
# pandas >= 2.0 if any of them is non-numeric.
median = (
    singledata.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .median()
    # Turn the "Drug Regimen" index back into a regular column (Series -> DataFrame)
    .reset_index()
    # Rename so the column stays distinguishable once the statistics are merged
    .rename(columns={"Tumor Volume (mm3)": "Tumor Volume (Median)"})
)

median

In [None]:
# Variance of tumor volume per drug regimen
#
# The tumor-volume column is selected BEFORE aggregating so that only that
# column is reduced. Aggregating the whole grouped frame first would also try
# to compute the variance of every other column, which raises TypeError on
# pandas >= 2.0 if any of them is non-numeric.
variance = (
    singledata.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .var()
    # Turn the "Drug Regimen" index back into a regular column (Series -> DataFrame)
    .reset_index()
    # Rename the column to differentiate from the other calculations
    .rename(columns={"Tumor Volume (mm3)": "Tumor Volume (Variance)"})
)

variance

In [None]:
# Standard deviation of tumor volume per drug regimen
#
# The tumor-volume column is selected BEFORE aggregating so that only that
# column is reduced. Aggregating the whole grouped frame first would also try
# to compute the standard deviation of every other column, which raises
# TypeError on pandas >= 2.0 if any of them is non-numeric.
standardDeviation = (
    singledata.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .std()
    # Turn the "Drug Regimen" index back into a regular column (Series -> DataFrame)
    .reset_index()
    # Rename the column to differentiate from the other calculations
    .rename(columns={"Tumor Volume (mm3)": "Tumor Volume (Standard Deviation)"})
)

standardDeviation

In [None]:
# Standard error of the mean (SEM) of tumor volume per drug regimen
#
# The tumor-volume column is selected BEFORE aggregating so that only that
# column is reduced. Aggregating the whole grouped frame first would also try
# to compute the SEM of every other column, which raises TypeError on
# pandas >= 2.0 if any of them is non-numeric.
standardError = (
    singledata.groupby("Drug Regimen")["Tumor Volume (mm3)"]
    .sem()
    # Turn the "Drug Regimen" index back into a regular column (Series -> DataFrame)
    .reset_index()
    # Rename the column to differentiate from the other calculations
    .rename(columns={"Tumor Volume (mm3)": "Tumor Volume (Standard Error)"})
)

standardError

In [None]:
# Build one summary table with a column per statistic.
#
# Start from the mean table and merge each remaining statistic onto it, keyed
# on "Drug Regimen". (The previous empty-dict / throwaway DataFrame
# initialisation was dead code: its result was discarded and the name was
# immediately rebound by the first merge.)
combined_df = mean
for stat_table in (median, variance, standardDeviation, standardError):
    combined_df = combined_df.merge(stat_table, on="Drug Regimen")

combined_df

## Bar plots

In [3]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

# Count the rows (data points) recorded for each regimen. sort_index() orders
# the bars by regimen name instead of by count.
regimen_counts = singledata["Drug Regimen"].value_counts().sort_index()

# Series.plot draws through pandas and returns the matplotlib Axes, which is
# kept in `ax` so the labels can be set on it.
ax = regimen_counts.plot(kind="bar")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Number of Data Points")
ax.set_title("Data Points per Drug Regimen")

plt.show()


In [4]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

# Count the rows (data points) recorded for each regimen. sort_index() orders
# the bars by regimen name instead of by count.
regimen_counts = singledata["Drug Regimen"].value_counts().sort_index()

# One bar per regimen, placed at integer positions 0..n-1
x_axis = range(len(regimen_counts))
plt.bar(x_axis, regimen_counts.values, width=0.5, align='center')

# Set x axis and tick locations: label each bar position with its regimen name
tick_locations = list(x_axis)
plt.xticks(tick_locations, regimen_counts.index, rotation="vertical")

plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")
plt.title("Data Points per Drug Regimen")

plt.show()

## Pie plots

In [5]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [6]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, outliers and boxplots

In [7]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [8]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [9]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [10]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [11]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen