In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sts
import numpy as np

In [5]:
# Load the per-timepoint study measurements into a DataFrame and preview the first rows.
study_results_path = 'Resources/Study_results.csv'
mouse_df = pd.read_csv(study_results_path)
mouse_df.head(5)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [9]:
# De-duplicate (Mouse ID, Timepoint) pairs, retaining the row with the largest
# tumor volume for each pair. Sorting by volume in descending order first means
# the first occurrence that drop_duplicates keeps is the maximum for that pair.
# Note: the result stays ordered by tumor volume (descending), not by timepoint.
volume_desc = mouse_df.sort_values(by='Tumor Volume (mm3)', ascending=False)
mouse_clean = volume_desc.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='first')
mouse_clean.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
1778,o331,45,78.567014,4
1861,l725,45,76.668817,3
1774,p189,45,75.294936,4
1830,m269,45,75.12369,1
1853,t724,45,75.113288,2


In [15]:
# Verify the de-duplication programmatically: no (Mouse ID, Timepoint) pair may
# occur more than once anywhere in the cleaned frame. An assertion fails loudly
# on "Restart & Run All" instead of relying on a visual inspection of one mouse.
duplicate_pairs = mouse_clean.duplicated(subset=['Mouse ID', 'Timepoint'])
assert not duplicate_pairs.any(), "mouse_clean still contains duplicated (Mouse ID, Timepoint) rows"

# Spot-check mouse g989: every timepoint should be listed exactly once.
finder_df = mouse_clean.loc[mouse_clean['Mouse ID'] == "g989"]
finder_df

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
1592,g989,35,62.57088,2
1380,g989,30,59.082294,1
1195,g989,25,56.045564,1
950,g989,20,55.326122,1
869,g989,15,53.44202,0
620,g989,10,51.745156,0
329,g989,5,48.786801,0
107,g989,0,45.0,0


In [19]:
# Load the per-mouse metadata (regimen, sex, age, weight) and attach it to every
# cleaned measurement row.
mouse_metadata = pd.read_csv('Resources/Mouse_metadata.csv')

# Left join on the measurements: every measurement row is kept, and a mouse that
# has metadata but no measurements does not add an all-NaN row (which an outer
# join would do, polluting later per-regimen counts). A left join also preserves
# the row order of mouse_clean regardless of pandas version.
# validate='many_to_one' raises MergeError if the metadata lists a Mouse ID more
# than once, which would otherwise silently multiply measurement rows.
mouse_merged = pd.merge(
    mouse_clean,
    mouse_metadata,
    how='left',
    on='Mouse ID',
    validate='many_to_one',
)
mouse_merged.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,o331,45,78.567014,4,Ketapril,Male,24,30
1,o331,40,71.447743,3,Ketapril,Male,24,30
2,o331,35,70.126238,2,Ketapril,Male,24,30
3,o331,30,66.330663,1,Ketapril,Male,24,30
4,o331,25,61.102306,1,Ketapril,Male,24,30


In [26]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, variance, standard deviation and standard error of the mean (SEM).

# Group the merged measurements by regimen.
mouse_regimen = mouse_merged.groupby('Drug Regimen')

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole group
# first also tries to reduce the string columns (Mouse ID, Sex), which raises
# TypeError on pandas >= 2.0 (numeric_only defaults to False) and is wasted work
# on older versions.
tumor_by_regimen = mouse_regimen['Tumor Volume (mm3)']

# One Series per statistic, indexed by drug regimen.
mouse_regimen_mean = tumor_by_regimen.mean()
mouse_regimen_median = tumor_by_regimen.median()
mouse_regimen_var = tumor_by_regimen.var()      # sample variance (ddof=1)
mouse_regimen_stdev = tumor_by_regimen.std()    # sample standard deviation (ddof=1)
mouse_regimen_sem = tumor_by_regimen.sem()

# Assemble the per-regimen summary table.
mouse_regimen_summary = pd.DataFrame({
    "Mean": mouse_regimen_mean,
    "Median": mouse_regimen_median,
    "Variance": mouse_regimen_var,
    "Stdev": mouse_regimen_stdev,
    "SEM": mouse_regimen_sem,
})
mouse_regimen_summary

Unnamed: 0_level_0,Mean,Median,Variance,Stdev,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.407029,50.909965,43.138358,6.56798,0.525859
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [None]:
# Bar plot using pandas' built-in `.plot()` showing the total number of
# measurements taken for each regimen.

# Count the non-null tumor-volume readings per regimen. count() ignores NaN, so
# rows without an actual measurement are not counted.
regimen_counts = mouse_merged.groupby('Drug Regimen')['Tumor Volume (mm3)'].count()

# Draw the bars through pandas and label the figure so it stands alone.
ax = regimen_counts.plot(kind='bar', figsize=(20, 3), color='r', alpha=0.5, rot=90)
ax.set_title('Total Measurements per Drug Regimen')
ax.set_xlabel('Drug Regimen')
ax.set_ylabel('Number of Measurements')
plt.show()

In [None]:
# bar plot using Matplotlib's `pyplot` showing the total number of measurements taken for each regimen