## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata_df = pd.read_csv(mouse_metadata_path)
study_results_df = pd.read_csv(study_results_path)

In [2]:
# Combine the data into a single dataset
combined_mice_df = pd.merge(mouse_metadata_df, study_results_df, how= 'outer', on= "Mouse ID")

# Display the data table for preview
combined_mice_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [6]:
mouse_id = combined_mice_df["Mouse ID"].unique()
mouse_id

array(['k403', 's185', 'x401', 'm601', 'g791', 's508', 'f966', 'm546',
       'z578', 'j913', 'u364', 'n364', 'y793', 'r554', 'm957', 'c758',
       't565', 'a644', 'i177', 'j989', 'i738', 'a520', 'w914', 'r811',
       'g288', 'i334', 'q610', 'd251', 'l897', 'c458', 'b742', 'b128',
       'j246', 'a411', 'j119', 'w150', 'v923', 'g316', 's710', 'l509',
       'r944', 'e662', 'u196', 'q597', 'a444', 'i557', 'r921', 'w678',
       'y449', 'a203', 'a251', 'a262', 'a275', 'a366', 'a401', 'a457',
       'a492', 'a577', 'a685', 'a699', 'a788', 'a818', 'a897', 'a963',
       'b313', 'b447', 'b487', 'b559', 'b759', 'b879', 'c139', 'c264',
       'c282', 'c302', 'c326', 'c402', 'c559', 'c580', 'c757', 'c766',
       'c819', 'c832', 'c895', 'c927', 'd133', 'd164', 'd474', 'e213',
       'e227', 'e291', 'e476', 'e584', 'f129', 'f234', 'f278', 'f345',
       'f394', 'f436', 'f545', 'f932', 'f993', 'g107', 'g296', 'g497',
       'g558', 'g570', 'g867', 'g989', 'h246', 'h333', 'h428', 'h531',
      

In [3]:
# Checking the number of mice.
combined_mice_df["Mouse ID"].count()

1893

In [4]:
combined_mice_df = combined_mice_df.rename(columns={"Age_months": "Age Months"})
combined_mice_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age Months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [49]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
mice_df = combined_mice_df[["Mouse ID", "Timepoint"]]
mice_df

Unnamed: 0,Mouse ID,Timepoint
0,k403,0
1,k403,5
2,k403,10
3,k403,15
4,k403,20
...,...,...
1888,z969,25
1889,z969,30
1890,z969,35
1891,z969,40


In [41]:
# Optional: Get all the data for the duplicate mouse ID. 
mice_df = combined_mice_df.groupby(["Mouse ID", "Timepoint", "Metastatic Sites", "Tumor Volume (mm3)", "Sex",
                                   "Age Months", "Weight (g)"])["Drug Regimen"]
all_mice_df = mice_df.count()
all_mice_df

Mouse ID  Timepoint  Metastatic Sites  Tumor Volume (mm3)  Sex     Age Months  Weight (g)
a203      0          0                 45.000000           Female  20          23            1
          5          0                 48.508468           Female  20          23            1
          10         1                 51.852437           Female  20          23            1
          15         1                 52.777870           Female  20          23            1
          20         1                 55.173336           Female  20          23            1
                                                                                            ..
z969      25         2                 63.145652           Male    9           30            1
          30         3                 65.841013           Male    9           30            1
          35         4                 69.176246           Male    9           30            1
          40         4                 70.314904       

In [53]:
chemo_drug_given = combined_mice_df.set_index("Drug Regimen")
chemo_drug_given

Unnamed: 0_level_0,Mouse ID,Sex,Age Months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ramicane,k403,Male,21,16,0,45.000000,0
Ramicane,k403,Male,21,16,5,38.825898,0
Ramicane,k403,Male,21,16,10,35.014271,1
Ramicane,k403,Male,21,16,15,34.223992,1
Ramicane,k403,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...
Naftisol,z969,Male,9,30,25,63.145652,2
Naftisol,z969,Male,9,30,30,65.841013,3
Naftisol,z969,Male,9,30,35,69.176246,4
Naftisol,z969,Male,9,30,40,70.314904,4


In [48]:
combined_mice_df["Mouse ID"].value_counts()

g989    13
b559    10
j119    10
s185    10
l733    10
        ..
o848     1
t573     1
v199     1
u153     1
x226     1
Name: Mouse ID, Length: 249, dtype: int64

In [None]:
clean_combined_mice_df = combined_mice_df.drop(how="Mouse ID")
clean_combined_mice_df.count()

In [54]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
chemo_drug_given.drop(columns= "Mouse ID")

Unnamed: 0_level_0,Sex,Age Months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ramicane,Male,21,16,0,45.000000,0
Ramicane,Male,21,16,5,38.825898,0
Ramicane,Male,21,16,10,35.014271,1
Ramicane,Male,21,16,15,34.223992,1
Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...
Naftisol,Male,9,30,25,63.145652,2
Naftisol,Male,9,30,30,65.841013,3
Naftisol,Male,9,30,35,69.176246,4
Naftisol,Male,9,30,40,70.314904,4


In [59]:
# Checking the number of mice in the clean DataFrame.
chemo_drug_given["Mouse ID"].drop_duplicates().count()

249

## Summary Statistics

In [81]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straighforward, creating multiple series and putting them all together at the end.
chemo_drug_given.describe(include="all")

Unnamed: 0,Mouse ID,Sex,Age Months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,1893,1893,1893.0,1893.0,1893.0,1893.0,1893.0
unique,249,2,,,,,
top,g989,Male,,,,,
freq,13,958,,,,,
mean,,,12.81458,25.662441,19.572108,50.448381,1.021659
std,,,7.189592,3.921622,14.07946,8.894722,1.137974
min,,,1.0,15.0,0.0,22.050126,0.0
25%,,,7.0,25.0,5.0,45.0,0.0
50%,,,13.0,27.0,20.0,48.951474,1.0
75%,,,20.0,29.0,30.0,56.2922,2.0


In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function


In [20]:
tumor_volume_df = combined_mice_df.groupby(["Drug Regimen", "Timepoint"])["Tumor Volume (mm3)"]
avg_tumor_volume_df = tumor_volume_df.mean()
Average_tumor_volume = pd.DataFrame(avg_tumor_volume_df)
Average_tumor_volume

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3)
Drug Regimen,Timepoint,Unnamed: 2_level_1
Capomulin,0,45.000000
Capomulin,5,44.266086
Capomulin,10,43.084291
Capomulin,15,42.064317
Capomulin,20,40.716325
...,...,...
Zoniferol,25,55.432935
Zoniferol,30,57.713531
Zoniferol,35,60.089372
Zoniferol,40,62.916692


In [21]:
tumor_volume_df = combined_mice_df.groupby(["Drug Regimen", "Timepoint"])["Tumor Volume (mm3)"]
med_tumor_volume_df = tumor_volume_df.median()
Median_tumor_volume = pd.DataFrame(med_tumor_volume_df)
Median_tumor_volume

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3)
Drug Regimen,Timepoint,Unnamed: 2_level_1
Capomulin,0,45.000000
Capomulin,5,45.597064
Capomulin,10,43.421014
Capomulin,15,42.798160
Capomulin,20,40.716428
...,...,...
Zoniferol,25,55.676604
Zoniferol,30,57.419744
Zoniferol,35,60.365832
Zoniferol,40,62.274543


In [22]:
tumor_volume_df = combined_mice_df.groupby(["Drug Regimen", "Timepoint"])["Tumor Volume (mm3)"]
var_tumor_volume_df = tumor_volume_df.var()
variance_tumor_volume = pd.DataFrame(var_tumor_volume_df)
variance_tumor_volume

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3)
Drug Regimen,Timepoint,Unnamed: 2_level_1
Capomulin,0,0.000000
Capomulin,5,5.030889
Capomulin,10,12.344133
Capomulin,15,16.878693
Capomulin,20,19.035028
...,...,...
Zoniferol,25,5.808348
Zoniferol,30,9.601024
Zoniferol,35,10.876760
Zoniferol,40,13.958456


In [23]:
tumor_volume_df = combined_mice_df.groupby(["Drug Regimen", "Timepoint"])["Tumor Volume (mm3)"]
std_tumor_volume_df = tumor_volume_df.std()
stdeviation_tumor_volume = pd.DataFrame(std_tumor_volume_df)
stdeviation_tumor_volume

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3)
Drug Regimen,Timepoint,Unnamed: 2_level_1
Capomulin,0,0.000000
Capomulin,5,2.242964
Capomulin,10,3.513422
Capomulin,15,4.108369
Capomulin,20,4.362915
...,...,...
Zoniferol,25,2.410052
Zoniferol,30,3.098552
Zoniferol,35,3.297993
Zoniferol,40,3.736102


In [24]:
tumor_volume_df = combined_mice_df.groupby(["Drug Regimen", "Timepoint"])["Tumor Volume (mm3)"]
sem_tumor_volume_df = tumor_volume_df.sem()
stderror_tumor_volume = pd.DataFrame(sem_tumor_volume_df)
stderror_tumor_volume

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume (mm3)
Drug Regimen,Timepoint,Unnamed: 2_level_1
Capomulin,0,0.000000
Capomulin,5,0.448593
Capomulin,10,0.702684
Capomulin,15,0.838617
Capomulin,20,0.909731
...,...,...
Zoniferol,25,0.602513
Zoniferol,30,0.800043
Zoniferol,35,0.881426
Zoniferol,40,0.998515


## Bar and Pie Charts

In [25]:
chemo_drug = combined_mice_df["Drug Regimen"].unique()
chemo_drug

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [26]:
max_tumor_vol = combined_mice_df["Tumor Volume (mm3)"].max()
max_tumor_vol

78.56701362

In [27]:
min_tumor_vol = combined_mice_df["Tumor Volume (mm3)"].min()
min_tumor_vol

22.05012627

In [None]:
Ramicane = "Ramicane"
Ramicane_results = combined_mice_df.loc[Ramicane]
Ramicane_list = Ramicane_results

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
