## Observations and Insights 

In [7]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import sem


# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_tests = mouse_metadata.merge(study_results, how='outer', on='Mouse ID')

# Display the data table for preview
mouse_tests.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [19]:
# Checking the number of mice.
mice_number = mouse_tests['Mouse ID'].count()
mice_number





1893

In [17]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

duplicate_mice = mouse_tests.loc[mouse_tests.duplicated(subset=['Mouse ID', 'Timepoint'], keep='first')]                             
duplicate_mice

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [23]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_test_final = mouse_tests.drop_duplicates(subset='Mouse ID', keep='last')
mouse_tests = mouse_tests.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='first')
mouse_tests.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [25]:
# Checking the number of mice in the clean DataFrame.
mouse_tests['Mouse ID'].count()

1888

## Summary Statistics

In [46]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
mean = mouse_tests.groupby('Drug Regimen').mean()
tumor_mean = mean['Tumor Volume (mm3)']
median = mouse_tests.groupby('Drug Regimen').median()
tumor_median = median['Tumor Volume (mm3)']
var = mouse_tests.groupby('Drug Regimen').var()
tumor_var = var['Tumor Volume (mm3)']
std = mouse_tests.groupby('Drug Regimen').std()
tumor_std = std['Tumor Volume (mm3)']
sem = mouse_tests.groupby('Drug Regimen').sem()
tumor_sem = sem['Tumor Volume (mm3)']
    
mice_df = pd.DataFrame({
    'Mean': tumor_mean,
    'Median': tumor_median,
    'Variance': tumor_var,
    'Standard Deviation': tumor_std,
    'SEM': tumor_sem
})
mice_df

mouse_tests.groupby('Drug Regimen').describe('T')









# This method is the most straighforward, creating multiple series and putting them all together at the end.



Unnamed: 0_level_0,Age_months,Age_months,Age_months,Age_months,Age_months,Age_months,Age_months,Age_months,Weight (g),Weight (g),...,Tumor Volume (mm3),Tumor Volume (mm3),Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Capomulin,230.0,13.456522,7.721423,1.0,7.0,16.5,20.0,24.0,230.0,19.965217,...,45.0,48.158209,230.0,0.713043,0.848993,0.0,0.0,0.0,1.0,3.0
Ceftamin,178.0,13.247191,8.071406,2.0,6.0,12.0,20.0,24.0,178.0,27.398876,...,56.801438,68.923185,178.0,1.179775,1.184283,0.0,0.0,1.0,2.0,4.0
Infubinol,178.0,16.230337,7.510278,1.0,8.0,20.0,23.0,24.0,178.0,27.196629,...,57.314444,72.226731,178.0,0.960674,1.027104,0.0,0.0,1.0,2.0,4.0
Ketapril,188.0,15.659574,6.01967,1.0,11.75,18.0,19.0,24.0,188.0,27.861702,...,60.870951,78.567014,188.0,1.297872,1.393873,0.0,0.0,1.0,2.0,4.0
Naftisol,186.0,12.0,6.715855,2.0,8.0,9.0,19.0,23.0,186.0,27.166667,...,59.963034,76.668817,186.0,1.182796,1.216519,0.0,0.0,1.0,2.0,4.0
Placebo,181.0,10.734807,6.354907,1.0,5.0,10.0,17.0,21.0,181.0,27.928177,...,59.916934,73.212939,181.0,1.441989,1.338824,0.0,0.0,1.0,2.0,4.0
Propriva,156.0,10.570513,7.188801,1.0,5.0,8.0,16.0,24.0,156.0,27.076923,...,56.491585,72.455421,156.0,1.0,1.08954,0.0,0.0,1.0,1.0,4.0
Ramicane,228.0,10.684211,5.946629,1.0,7.0,9.0,18.0,23.0,228.0,19.679825,...,45.0,47.622816,228.0,0.548246,0.691259,0.0,0.0,0.0,1.0,3.0
Stelasyn,181.0,12.78453,7.939562,1.0,4.0,14.0,21.0,23.0,181.0,27.856354,...,58.719297,75.12369,181.0,0.872928,0.972046,0.0,0.0,1.0,1.0,4.0
Zoniferol,182.0,12.598901,5.786114,2.0,8.0,12.5,16.0,24.0,182.0,27.692308,...,57.954259,73.324432,182.0,1.230769,1.248884,0.0,0.0,1.0,2.0,4.0


In [29]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function
mouse_tests.groupby('Drug Regimen').agg(
    {
        'Tumor Volume (mm3)': ['mean', 'median', 'var', 'std', 'sem']
    })

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
mouse_tests['Drug Regimen'].value_counts().plot.bar()

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x = mouse_tests['Drug Regimen'].unique()
y = mouse_tests['Drug Regimen'].value_counts()
x
plt.bar(x, y)



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
    
mouse_tests['Sex'].value_counts().plot.pie()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin


# Start by getting the last (greatest) timepoint for each mouse
mouse_test_final

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
cap = mouse_test_final.loc[mouse_test_final['Drug Regimen'] == 'Capomulin']
ram = mouse_test_final.loc[mouse_test_final['Drug Regimen'] == 'Ramicane']
inf = mouse_test_final.loc[mouse_test_final['Drug Regimen'] == 'Infubinol']
cef = mouse_test_final.loc[mouse_test_final['Drug Regimen'] == 'Ceftamin']
treatments_df = pd.concat([cap, ram, inf, cef])
treatments_df


In [None]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = treatments_df['Drug Regimen'].unique()
treatments

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol = []


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

quartiles = treatments_df['Tumor Volume (mm3)'].quantile([.25,.5,.75])
quartiles

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
treatments_df.boxplot(column='Tumor Volume (mm3)', by='Drug Regimen')


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

mouse_tests.loc[mouse_tests['Drug Regimen'] == 'Capomulin']
sampleMouse = mouse_tests.loc[mouse_tests['Mouse ID'] == 'l509']
plt.plot(sampleMouse['Timepoint'], sampleMouse['Tumor Volume (mm3)'])

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
capx = mouse_tests.loc[mouse_tests['Drug Regimen'] == 'Capomulin']
cap_mean = capx.groupby('Mouse ID').mean()
plot = cap_mean.plot.scatter('Weight (g)', 'Tumor Volume (mm3)')

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
