In [76]:
# Setup

import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

# Input locations (relative paths, resolved against the current working directory)
mouse_metadata = Path("data/Mouse_metadata.csv")
study_results = Path("data/Study_results.csv")

# Load both tables:
#   study_results_df  - tumor volume and metastatic-site counts per mouse per timepoint
#   mouse_metadata_df - per-mouse attributes (drug regimen, sex, age, weight)
study_results_df = pd.read_csv(study_results)
mouse_metadata_df = pd.read_csv(mouse_metadata)

# Attach each mouse's attributes to its measurements, matching rows on "Mouse ID"
study_complete_df = study_results_df.merge(mouse_metadata_df, on="Mouse ID")
display(study_complete_df)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [77]:
# Prepare the data: report mice with duplicate mouse/timepoint records, remove every such mouse,
# and display the de-duplicated DataFrame

# Find the number of mice (nunique() counts distinct non-null IDs, like len(value_counts()))
mouse_count = mouse_metadata_df["Mouse ID"].nunique()
print(f"Number of unique Mouse IDs: {mouse_count}")

# Locate mice with duplicate timepoint data.
# keep=False flags every copy of a repeated (Mouse ID, Timepoint) pair, not only the later ones.
# NOTE: the flag column is written onto study_complete_df itself and stays there after this cell.
study_complete_df["Is Duplicated"] = study_complete_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicates_df = study_complete_df.loc[study_complete_df["Is Duplicated"], :]
duplicate_mouse_ids = duplicates_df["Mouse ID"].unique()
print(f"Mice with duplicate timepoint data: {duplicate_mouse_ids!s}")

# Display all data associated with the above mice.
# The IDs come from the detection step rather than a hard-coded value, so the cell
# stays correct if the input data changes or several mice are affected.
is_duplicate_mouse = study_complete_df["Mouse ID"].isin(duplicate_mouse_ids)
duplicate_mouse_df = study_complete_df.loc[is_duplicate_mouse, :]
display(duplicate_mouse_df)

# Remove all rows of the affected mice to create a cleaned DataFrame (helper flag column dropped)
study_cleaned_df = (
    study_complete_df.loc[~is_duplicate_mouse, :]
    .drop(columns="Is Duplicated")
    .reset_index(drop=True)
)

# Sanity check: no (Mouse ID, Timepoint) pair may repeat once the affected mice are gone
assert not study_cleaned_df.duplicated(subset=["Mouse ID", "Timepoint"]).any()

display(study_cleaned_df)

Number of unique Mouse IDs: 249
Mice with duplicate timepoint data: ['g989']


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g),Is Duplicated
107,g989,0,45.0,0,Propriva,Female,21,26,True
137,g989,0,45.0,0,Propriva,Female,21,26,True
329,g989,5,48.786801,0,Propriva,Female,21,26,True
360,g989,5,47.570392,0,Propriva,Female,21,26,True
620,g989,10,51.745156,0,Propriva,Female,21,26,True
681,g989,10,49.880528,0,Propriva,Female,21,26,True
815,g989,15,51.325852,1,Propriva,Female,21,26,True
869,g989,15,53.44202,0,Propriva,Female,21,26,True
950,g989,20,55.326122,1,Propriva,Female,21,26,True
1111,g989,20,54.65765,1,Propriva,Female,21,26,True


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1875,r944,45,41.581521,2,Capomulin,Male,12,25
1876,u364,45,31.023923,3,Capomulin,Male,18,17
1877,p438,45,61.433892,1,Ceftamin,Female,11,26
1878,x773,45,58.634971,4,Placebo,Female,21,30


In [None]:
# Generate summary statistics



In [None]:
# Create bar charts and pie charts

In [None]:
# Calculate quartiles, find outliers, and create a box plot

In [None]:
# Create a line plot and a scatter plot

In [None]:
# Calculate correlation and regression