## Observations and Insights 

In [22]:
# Dependencies and Setup
from pprint import pprint
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
study_results = study_results.dropna(axis=1)
merged = mouse_metadata.merge(study_results, on='Mouse ID')
merged.to_csv("output.csv", index=False)

# Display the data table for preview, and drop extra columns
merged_df = pd.read_csv("output.csv")
merged_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [23]:
# Checking the number of mice.
merged_df["Mouse ID"].nunique()

249

In [24]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

# Locating data with duplicates of both criteria, and displaying by Mouse ID
duplicates_df = merged_df.loc[merged_df.duplicated(subset = ["Mouse ID","Timepoint"], keep = False) == True]
duplicates_df = pd.DataFrame(duplicates_df["Mouse ID"].drop_duplicates())
duplicates_df

Unnamed: 0,Mouse ID
908,g989


In [25]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_data_df = pd.DataFrame(duplicates_df)
dup_data = duplicate_data_df.merge(merged_df, on = "Mouse ID")
dup_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,g989,Propriva,Female,21,26,0,45.0,0
1,g989,Propriva,Female,21,26,0,45.0,0
2,g989,Propriva,Female,21,26,5,48.786801,0
3,g989,Propriva,Female,21,26,5,47.570392,0
4,g989,Propriva,Female,21,26,10,51.745156,0
5,g989,Propriva,Female,21,26,10,49.880528,0
6,g989,Propriva,Female,21,26,15,51.325852,1
7,g989,Propriva,Female,21,26,15,53.44202,0
8,g989,Propriva,Female,21,26,20,55.326122,1
9,g989,Propriva,Female,21,26,20,54.65765,1


In [26]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleandata_df = merged_df.copy()
cleandata_df = cleandata_df.loc[~(cleandata_df["Mouse ID"].isin(duplicates_df["Mouse ID"]))]
cleandata_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [27]:
# Checking the number of mice in the clean DataFrame.
cleandata_df["Mouse ID"].nunique()

248

## Summary Statistics

In [35]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.

mean = round(cleandata_df.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"],3)
mean = mean.reset_index()
mean = mean.rename(columns={"Tumor Volume (mm3)":"Mean"})

median = round(cleandata_df.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"],3)
median = median.reset_index()
median = median.rename(columns={"Tumor Volume (mm3)":"Median"})

variance = round(cleandata_df.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"],3)
variance = variance.reset_index()
variance = variance.rename(columns={"Tumor Volume (mm3)":"Variance"})

std_dev = round(cleandata_df.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"],3)
std_dev = std_dev.reset_index()
std_dev = std_dev.rename(columns={"Tumor Volume (mm3)":"STD"})

sem = round(cleandata_df.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"],3)
sem = sem.reset_index()
sem = sem.rename(columns={"Tumor Volume (mm3)":"SEM"})


stats = ({ 
    "Mean": [mean], 
    "Median": [median], 
    "Variance": [variance], 
    "Standard Deviation": [std_dev],
    "SEM": [sem]
})

stats_df = pd.DataFrame(stats, index = ["Mean", "Median", "Variance", "Standard Deviation", "SEM"])
stats_df

# gender_demo = ({"Total Count":[F_count, M_count, ON_count],
#               "Percentage of Players":[F_percent, M_percent, ON_percent]})

# gender_demo_df = pd.DataFrame(gender_demo, index = ["Female", "Male", "Other / Non-Disclosed"])

# # stats_summary_df = reduce(lambda left,right: pd.merge(left,right,on=['Drug Regimen'], how='inner'), stats_df)
# stats_summary_df


SyntaxError: invalid syntax (<ipython-input-35-1e51b4deca6c>, line 31)

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
