## Observations and Insights 

In [31]:
# Dependencies and Setup
import matplotlib.pyplot as plt 
import scipy.stats as st
import pandas as pd

# Est File Paths
mouse_metadata_csv = "data/Mouse_metadata.csv"
study_results_csv = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_csv)
study_results = pd.read_csv(study_results_csv)


In [32]:
#Preview 
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [33]:
#Preview
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [39]:
# Combine the data into a single dataset
# Display the data table for preview
merge_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="outer")
merge_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [44]:
# Sorting the DataFrame by Mouse ID and Timepoint

merge_df.sort_values(by=['Mouse ID', 'Timepoint'], inplace=True)
print(merge_df)

     Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
454      a203    Infubinol  Female          20          23          0   
455      a203    Infubinol  Female          20          23          5   
456      a203    Infubinol  Female          20          23         10   
457      a203    Infubinol  Female          20          23         15   
458      a203    Infubinol  Female          20          23         20   
...       ...          ...     ...         ...         ...        ...   
1888     z969     Naftisol    Male           9          30         25   
1889     z969     Naftisol    Male           9          30         30   
1890     z969     Naftisol    Male           9          30         35   
1891     z969     Naftisol    Male           9          30         40   
1892     z969     Naftisol    Male           9          30         45   

      Tumor Volume (mm3)  Metastatic Sites  
454            45.000000                 0  
455            48.508468         

In [52]:
# Checking the number of mice.
number_of_mice = merge_df["Mouse ID"].nunique()
print("Number of Mouse IDs :", number_of_mice)

Number of Mouse IDs : 249


In [55]:
# Group by Mouse ID and Timepoint
grouped_merge_df = merge_df.groupby(["Mouse ID", "Timepoint"])
grouped_merge_df.count().head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Drug Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a203,0,1,1,1,1,1,1
a203,5,1,1,1,1,1,1
a203,10,1,1,1,1,1,1
a203,15,1,1,1,1,1,1
a203,20,1,1,1,1,1,1


In [62]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
del_dupes_df = pd.DataFrame(
    grouped_merge_df.sum())
del_dupes_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a203,0,20,23,45.0,0
a203,5,20,23,48.508468,0
a203,10,20,23,51.852437,1
a203,15,20,23,52.77787,1
a203,20,20,23,55.173336,1
a203,25,20,23,56.793208,1
a203,30,20,23,59.523197,1
a203,35,20,23,61.93165,2
a203,40,20,23,63.59349,2
a203,45,20,23,67.973419,2


In [72]:
del_dupes_df.duplicated().sum()

97

In [74]:
del_dupes_df.loc[del_dupes_df.duplicated(keep='first'), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c282,0,12,27,45.0,0
c819,0,21,25,45.0,0
c927,0,4,26,45.0,0
d133,0,5,30,45.0,0
e584,0,9,27,45.0,0
...,...,...,...,...,...
z234,0,19,27,45.0,0
z314,0,21,28,45.0,0
z578,0,11,16,45.0,0
z581,0,24,25,45.0,0


In [79]:
# Optional: Get all the data for the duplicate mouse ID. 



In [81]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
del_dupes_df.drop_duplicates(keep='first').shape
del_dupes_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
Mouse ID,Timepoint,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a203,0,20,23,45.0,0
a203,5,20,23,48.508468,0
a203,10,20,23,51.852437,1
a203,15,20,23,52.77787,1
a203,20,20,23,55.173336,1


In [82]:
# Checking the number of mice in the clean DataFrame.
number_of_mice2 = del_dupes_df["Mouse ID"].nunique()
print("Number of Mouse IDs :", number_of_mice2)

KeyError: 'Mouse ID'

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.



In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
