## Observations and Insights 

In [38]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Study data files
mouse_metadata_path = '/Users/jacobcortez/Documents/GitHub/DS_Repos/HW_Repo/mapplotlib_challenge/Mouse_metadata.csv'
study_results_path = '/Users/jacobcortez/Documents/GitHub/DS_Repos/HW_Repo/mapplotlib_challenge/Study_results.csv'

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [39]:
# Preview mouse data
mouse_metadata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [40]:
# Preview study data
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [41]:
# Combine the data into a single dataset (interjoin)
merge_df = pd.merge(mouse_metadata, study_results, how = 'right',on="Mouse ID")

# Display the data table for preview
merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,Capomulin,Female,9,22,0,45.000000,0
1,f932,Ketapril,Male,15,29,0,45.000000,0
2,g107,Ketapril,Female,2,29,0,45.000000,0
3,a457,Ketapril,Female,11,30,0,45.000000,0
4,c819,Ketapril,Male,21,25,0,45.000000,0
...,...,...,...,...,...,...,...,...
1888,r944,Capomulin,Male,12,25,45,41.581521,2
1889,u364,Capomulin,Male,18,17,45,31.023923,3
1890,p438,Ceftamin,Female,11,26,45,61.433892,1
1891,x773,Placebo,Female,21,30,45,58.634971,4


In [42]:
# Check for incomplete rows
merge_df.count()

Mouse ID              1893
Drug Regimen          1893
Sex                   1893
Age_months            1893
Weight (g)            1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64

In [43]:
# Check for types in each row or column
merge_df.dtypes

Mouse ID               object
Drug Regimen           object
Sex                    object
Age_months              int64
Weight (g)              int64
Timepoint               int64
Tumor Volume (mm3)    float64
Metastatic Sites        int64
dtype: object

In [44]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
merge_df.duplicated(subset=['Mouse ID','Timepoint'])


0       False
1       False
2       False
3       False
4       False
        ...  
1888    False
1889    False
1890    False
1891    False
1892    False
Length: 1893, dtype: bool

In [45]:
# Optional: Get all the data for the duplicate mouse ID. 
merge_duplicates_df = merge_df.loc[merge_df.duplicated(subset=['Mouse ID','Timepoint'])]
merge_duplicates_df.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
137,g989,Propriva,Female,21,26,0,45.0,0
360,g989,Propriva,Female,21,26,5,47.570392,0
681,g989,Propriva,Female,21,26,10,49.880528,0
869,g989,Propriva,Female,21,26,15,53.44202,0
1111,g989,Propriva,Female,21,26,20,54.65765,1


In [46]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
merge_df2 = merge_df.drop_duplicates(subset=['Mouse ID','Timepoint'])
merge_df2

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,Capomulin,Female,9,22,0,45.000000,0
1,f932,Ketapril,Male,15,29,0,45.000000,0
2,g107,Ketapril,Female,2,29,0,45.000000,0
3,a457,Ketapril,Female,11,30,0,45.000000,0
4,c819,Ketapril,Male,21,25,0,45.000000,0
...,...,...,...,...,...,...,...,...
1888,r944,Capomulin,Male,12,25,45,41.581521,2
1889,u364,Capomulin,Male,18,17,45,31.023923,3
1890,p438,Ceftamin,Female,11,26,45,61.433892,1
1891,x773,Placebo,Female,21,30,45,58.634971,4


In [47]:
# Checking the number of mice in the clean DataFrame.
mice_count_nunique2 = merge_df2[['Mouse ID','Timepoint']].nunique()
mice_count_nunique2

Mouse ID     249
Timepoint     10
dtype: int64

## Summary Statistics

In [48]:
# Create DF for Drug Regimen
summary_stats_df = merge_df2.groupby('Drug Regimen')
summary_stats_df.describe()

# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen.
# mean, median, variance, standard deviation, and SEM of the tumor volume
tumor_volume_mean = summary_stats_df['Tumor Volume (mm3)'].mean()
tumor_volume_median = summary_stats_df['Tumor Volume (mm3)'].median()
tumor_volume_variance = summary_stats_df['Tumor Volume (mm3)'].var()
tumor_volume_std = summary_stats_df['Tumor Volume (mm3)'].std()
tumor_volume_sem = summary_stats_df['Tumor Volume (mm3)'].sem()


# Assemble the resulting series into a single summary dataframe.
tumor_volume_stats_df = pd.DataFrame({'Mean': tumor_volume_mean,'Median': tumor_volume_median,'Variance': tumor_volume_variance,'Standard Deviation': tumor_volume_std,'SEM': tumor_volume_sem})

# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
tumor_volume_stats_df

# Using the aggregation method, produce the same summary statistics in a single line
tumor_volume_stats_df.agg('mean')




Mean                  50.983236
Median                49.948493
Variance              47.787119
Standard Deviation     6.815695
SEM                    0.500938
dtype: float64

## Bar and Pie Charts

In [50]:
# Load DF clean data
merge_df2.head()

# Set x axis and tick locations
x_axis = np.arange(len(merge_df2))
tick_locations = [value+0.4 for value in x_axis]

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
