## Observations and Insights 

In [204]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path, escapechar='\\')
study_results = pd.read_csv(study_results_path, escapechar='\\')



# Combine the data into a single dataset
combined_metadata_studyresults = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
combined_metadata_studyresults

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [205]:
# Checking the number of mice.
total_mice_df = combined_metadata_studyresults["Mouse ID"].count()
data = [total_mice_df] 
total_mice_df= pd.DataFrame(data, columns = ["Total Mice"])
total_mice_df

Unnamed: 0,Total Mice
0,1893


In [206]:
# Optional: Get all the data for the duplicate mouse ID. 
find_duplicates_df = combined_metadata_studyresults[combined_metadata_studyresults.duplicated(["Mouse ID", "Timepoint"])]
find_duplicates_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [147]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_data_df = combined_metadata_studyresults.drop_duplicates(subset=['Mouse ID', 'Timepoint'])
cleaned_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [208]:
# Checking the number of mice in the clean DataFrame.
revised_total_mice_df = cleaned_data_df["Mouse_ID"].count()
data = [revised_total_mice_df] 
revised_total_mice_df= pd.DataFrame(data, columns = ["Total Mice"])
revised_total_mice_df

Unnamed: 0,Total Mice
0,1888


## Summary Statistics

In [209]:
cleaned_data_df.rename(columns = {'Tumor Volume (mm3)':'Tumor_Volume'}, inplace=True)
cleaned_data_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Mouse_ID,Drug_Regimen,sex,Age_months,Weight_(g),Timepoint,Tumor_Volume,Metastatic_Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [210]:
cleaned_data_df.columns = [c.replace(' ', '_') for c in cleaned_data_df.columns]


In [216]:
cleaned_data_df.dtypes


Mouse_ID             object
Drug_Regimen         object
sex                  object
Age_months            int64
Weight_(g)            int64
Timepoint             int64
Tumor_Volume        float64
Metastatic_Sites      int64
dtype: object

In [211]:
# Mean Tumor Volume
mean_grouped_drug_regimen_df = cleaned_data_df.groupby('Drug_Regimen', as_index=False).Tumor_Volume.mean()
mean_grouped_drug_regimen_df 

Unnamed: 0,Drug_Regimen,Tumor_Volume
0,Capomulin,40.675741
1,Ceftamin,52.591172
2,Infubinol,52.884795
3,Ketapril,55.235638
4,Naftisol,54.331565
5,Placebo,54.033581
6,Propriva,52.393463
7,Ramicane,40.216745
8,Stelasyn,54.233149
9,Zoniferol,53.236507


In [212]:
# Median Tumor Volume
median_grouped_drug_regimen_df = cleaned_data_df.groupby('Drug_Regimen', as_index=False).Tumor_Volume.median()
median_grouped_drug_regimen_df 

Unnamed: 0,Drug_Regimen,Tumor_Volume
0,Capomulin,41.557809
1,Ceftamin,51.776157
2,Infubinol,51.820584
3,Ketapril,53.698743
4,Naftisol,52.509285
5,Placebo,52.288934
6,Propriva,50.909965
7,Ramicane,40.673236
8,Stelasyn,52.431737
9,Zoniferol,51.818479


In [213]:
# Variance Tumor Volume
var_grouped_drug_regimen_df = cleaned_data_df.groupby('Drug_Regimen', as_index=False).Tumor_Volume.var()
var_grouped_drug_regimen_df 

Unnamed: 0,Drug_Regimen,Tumor_Volume
0,Capomulin,24.947764
1,Ceftamin,39.290177
2,Infubinol,43.128684
3,Ketapril,68.553577
4,Naftisol,66.173479
5,Placebo,61.168083
6,Propriva,43.138803
7,Ramicane,23.486704
8,Stelasyn,59.450562
9,Zoniferol,48.533355


In [223]:
# SD Tumor Volume
std_grouped_drug_regimen_df = cleaned_data_df.groupby('Drug_Regimen', as_index=False).Tumor_Volume.std()
std_grouped_drug_regimen_df 

TypeError: loop of ufunc does not support argument 0 of type str which has no callable sqrt method

In [215]:
# SEM Tumor Volume
sem_grouped_drug_regimen_df = cleaned_data_df.groupby('Drug_Regimen', as_index=False).Tumor_Volume.sem()
sem_grouped_drug_regimen_df 

TypeError: loop of ufunc does not support argument 0 of type str which has no callable sqrt method

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.



In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [224]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
total_mice_regimen_df = cleaned_data_df.groupby('Drug_Regimen', as_index=False).Mouse_ID.value_count()
total_mice_regimen_df 


AttributeError: 'DataFrameGroupBy' object has no attribute 'value_count'

In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
