## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
study_data_complete = pd.merge(study_results, mouse_metadata, how="left", on="Mouse ID")

# Display the data table for preview
study_data_complete.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [6]:
# Checking the number of mice.
numMice = len(study_data_complete)
numMice1 = len(mouse_metadata)
numMice2 = len(study_results)
numMice3 = len(study_data_complete["Mouse ID"].unique()) 
print(numMice, numMice1, numMice2, numMice3)
print("Number of mice: "+ str(numMice3))

1893 249 1893 249
Number of mice: 249


In [11]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
duplMice = study_data_complete["Mouse ID"].value_counts()
duplMice.head()

g989    13
l733    10
a411    10
j296    10
l897    10
Name: Mouse ID, dtype: int64

In [29]:
# Optional: Get all the data for the duplicate mouse ID. 
# use collections.Counter reduce repeat call to count, get items(ketys : value) in Counter
from collections import Counter
duplMice = study_data_complete["Mouse ID"]
d = Counter(map(tuple, duplMice))
#print(Counter(duplMice))

# get the duplicate keys - Mouse ID
dups = [k for k, v in d.items() if v>1]
print(Counter(duplMice).keys())
# print(d.keys()) =>dict_keys([('b', '1', '2', '8'), ('f', '9', '3', '2')...?


# set the index to the mouse ID

# check the mouse data for ID


dict_keys(['b128', 'f932', 'g107', 'a457', 'c819', 'h246', 'p189', 'n923', 'q119', 'f993', 'z234', 'b559', 'y260', 'x930', 'o725', 'z969', 'v835', 'r604', 'n304', 'l700', 'x336', 'l725', 'm133', 'v295', 'a818', 'y601', 't724', 'k382', 'w422', 'c326', 'c139', 'v339', 'a577', 'y163', 'k483', 'k804', 'o809', 'z581', 'a251', 'i386', 'c580', 'q132', 'u327', 'v603', 'f278', 'g497', 'd474', 'o973', 'c832', 'o331', 'm650', 'v289', 'm550', 'h428', 'r701', 'v199', 'x264', 'f234', 'c458', 'q610', 'j913', 'a411', 'a444', 'd251', 'j989', 'y449', 'k403', 'c758', 'x402', 'r811', 'a644', 'i177', 'g791', 'a520', 'u196', 'm546', 'w678', 'n364', 's508', 'e662', 'z578', 'r921', 'a492', 'w540', 'v764', 'z795', 'e291', 'e584', 'e213', 'j755', 's565', 'a366', 'p387', 'b879', 'i901', 'k862', 'g867', 's619', 'w697', 'j984', 'c402', 'h333', 'k510', 'p981', 't451', 'a963', 'm269', 'g989', 'z314', 'o848', 'v719', 'q597', 'c895', 'a203', 'f394', 'c264', 'n967', 'f545', 'k894', 'k754', 'g296', 'd164', 'w575', 'x613

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.


In [None]:
# Checking the number of mice in the clean DataFrame.


## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

# series variable to hold Tumor Volume Data grouped by Drug Regimen

# variable to hold the Mean Tumor Volume Data Grouped by Drug Regimen


# variable to hold median Tumor Volume Data Grouped by Drug Regimen


# variable to hold the Tumor Volume Variance Data Grouped by Drug Regimen


# variable to hold the Tumor Volume Standard Deviation Data Grouped by Drug Regimen


# variable to hold the Tumor Volume SEM Data Grouped by Drug Regimen


# Convert to DataFrame

# Preview DataFrame


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# list of unique drug regimens

# drug regimen as x-axis values for plotting

# drop all duplicate mice


# get mice counts per drug


In [None]:
# plot the mouse counts for each drug using pandas


In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
# plot the bar graph of mice count per drug regimen


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# set drug regimen as index and drop associated regimens while only keeping Capomulin, Ramicane, Infubinol, and Ceftamin


# isolated view of just capomulin for later use


# Reset index so drug regimen column persists after inner merge


# get mouse count per drug


# Start by getting the last (greatest) timepoint for each mouse


In [None]:
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

# show all rows of data


In [None]:
# Put treatments into a list for for loop (and later for plot labels)
#set drugs to be analyzed, colors for the plots, and markers


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
# Locate the rows which contain mice on each drug and get the tumor volumes
 

In [None]:
# Determine outliers using upper and lower bounds


In [None]:
# add subset 


# tumor volumes for each Drug Regimen


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest




## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

#change index to mouse ID 

#remove other mouse IDs so only s185 shows


#set the x-axis equal to the Timepoint and y-axis to Tumor Volume


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# group by mouse ID to find average tumor volume


# establish x-axis value for the weight of the mice


# produce scatter plot of the data


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
#establish x and y values and find St. Pearson Correlation Coefficient for Mouse Weight and Tumor Volume Avg


#print St. Pearson Correlation Coefficient

# establish linear regression values

# linear regression line 


# scatter plot of the data
