In [462]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from statistics import variance
import statistics
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# print(study_results.columns)
# print(mouse_metadata.columns)

# Combine the data into a single DataFrame
pymaceuticalsMerge_df = pd.merge(study_results,mouse_metadata, how="left", on=["Mouse ID", "Mouse ID"])
pymaceuticalsMerge_df["Tumor Volume (mm3)"] = pymaceuticalsMerge_df["Tumor Volume (mm3)"].map("{:.1f}".format)
pymaceuticalsMerge_df = pd.DataFrame(pymaceuticalsMerge_df)

# unique = pymaceuticalsMerge_df["Mouse ID"].unique()
# unique
# print(groupedPymaceuticalsMerge_df)
# groupedPymaceuticalsMerge_df.head().unique()
# groupedPymaceuticalsMerge_df.count().head()

# Display the data table for preview
pymaceuticalsMerge_df.head()



Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [463]:
# Checking the number of mice.
miceNumber = pymaceuticalsMerge_df["Mouse ID"].value_counts().count()
miceNumber

249

In [464]:
# Our data should be uniquely identified by Mouse ID and Timepoint
mouseIdTimepointpymaceuticals = pymaceuticalsMerge_df.groupby(["Mouse ID","Timepoint"])
mouseIdTimepointpymaceuticals.head()
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicateData_df = pymaceuticalsMerge_df[pymaceuticalsMerge_df[['Mouse ID', 'Timepoint']].duplicated() == True]
miceId = duplicateData_df["Mouse ID"].unique()
miceId

array(['g989'], dtype=object)

In [465]:
# Optional: Get all the data for the duplicate mouse ID. 
# Get the mice Id
duplicatedMiceId = miceId[0]
# Get all info mice Id duplicated
miceDuplicated_allInfo = pymaceuticalsMerge_df[pymaceuticalsMerge_df.loc[:,"Mouse ID"] ==  duplicatedMiceId]
miceDuplicated_allInfo


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.8,0,Propriva,Female,21,26
360,g989,5,47.6,0,Propriva,Female,21,26
620,g989,10,51.7,0,Propriva,Female,21,26
681,g989,10,49.9,0,Propriva,Female,21,26
815,g989,15,51.3,1,Propriva,Female,21,26
869,g989,15,53.4,0,Propriva,Female,21,26
950,g989,20,55.3,1,Propriva,Female,21,26
1111,g989,20,54.7,1,Propriva,Female,21,26


In [466]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# print(duplicatedMiceId)
clean_pymaceuticalsMerge_df = pymaceuticalsMerge_df
# new df with clean data
clean_pymaceuticalsMerge_df = clean_pymaceuticalsMerge_df.loc[(clean_pymaceuticalsMerge_df["Mouse ID"] != duplicatedMiceId)]
clean_pymaceuticalsMerge_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


In [467]:
# Checking the number of mice in the clean DataFrame.
miceNumber = clean_pymaceuticalsMerge_df["Mouse ID"].value_counts().count()
print(clean_pymaceuticalsMerge_df.columns)
miceNumber
 

Index(['Mouse ID', 'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites',
       'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)'],
      dtype='object')


248

In [468]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 



clean_pymaceuticalsMerge_df = clean_pymaceuticalsMerge_df.astype({"Tumor Volume (mm3)": float})
grouped_clean_pymaceuticals = clean_pymaceuticalsMerge_df.groupby(["Drug Regimen"])
# print(grouped_clean_pymaceuticals)
grouped_clean_pymaceuticals.count().head(10)

meanValue =grouped_clean_pymaceuticals[["Tumor Volume (mm3)"]].mean()
medianValue = grouped_clean_pymaceuticals[["Tumor Volume (mm3)"]].median()

# Assemble the resulting series into a single summary DataFrame.
sumaryPymaceuticals_df = pd.merge(meanValue,medianValue, how="left", on=["Drug Regimen"])
sumaryPymaceuticals_df.rename(columns={"Tumor Volume (mm3)_x": "Mean Tumor Volume"})
sumaryPymaceuticals_df.head(10)

Unnamed: 0_level_0,Tumor Volume (mm3)_x,Tumor Volume (mm3)_y
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1
Capomulin,40.678261,41.55
Ceftamin,52.589326,51.8
Infubinol,52.885955,51.8
Ketapril,55.237766,53.7
Naftisol,54.330108,52.5
Placebo,54.032044,52.3
Propriva,52.320946,50.45
Ramicane,40.217982,40.7
Stelasyn,54.235359,52.4
Zoniferol,53.236813,51.8


In [470]:
# x_axis = np.arange(len(users))
# plt.bar(x_axis, users, color='r', alpha=0.5, align="center")
# plt.show()