## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata and the study results into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on "Mouse ID" (pandas' default inner join) into a single dataset
combined_data_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Show the merged table as the cell output
combined_data_df
# Row counts noted by the author: mouse_metadata 249, study_results 1893, merged 1893

In [None]:
# Number of distinct Mouse ID values in the merged data.
# dropna=False keeps a missing ID (if any) counted as one value, as len(unique()) would.
combined_data_df["Mouse ID"].nunique(dropna=False)

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once;
# keep=False marks all copies of a repeated pair, not only the later ones.
is_repeated_row = combined_data_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
combined_data_df_dup = combined_data_df.loc[is_repeated_row]
# List each affected Mouse ID once
combined_data_df_dup["Mouse ID"].drop_duplicates()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Displays the duplicate-mouse DataFrame: every row whose (Mouse ID, Timepoint)
# pair was flagged by duplicated(..., keep=False).
combined_data_df_dup


In [None]:
# Build the clean DataFrame by removing the duplicate mouse by its ID:
# every row of a mouse that appears in the duplicate frame is dropped,
# not only the repeated (Mouse ID, Timepoint) rows themselves.
belongs_to_duplicate_mouse = combined_data_df["Mouse ID"].isin(combined_data_df_dup["Mouse ID"])
combined_data_df_no_dup = combined_data_df[~belongs_to_duplicate_mouse]
combined_data_df_no_dup

In [None]:
# Number of distinct Mouse ID values left in the clean DataFrame.
# dropna=False keeps a missing ID (if any) counted as one value, as len(unique()) would.
combined_data_df_no_dup["Mouse ID"].nunique(dropna=False)

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straightforward: build one Series per statistic and put them all together at the end.

# Group once by regimen; `stat1` is also reused by the plotting cells further down.
stat1 = combined_data_df_no_dup.groupby(["Drug Regimen"])

# Select the tumor-volume column a single time instead of once per statistic.
tumor_volume_by_regimen = stat1["Tumor Volume (mm3)"]

summaryStat = pd.DataFrame({
    "Mean": tumor_volume_by_regimen.mean(),
    "Median": tumor_volume_by_regimen.median(),
    "Variance": tumor_volume_by_regimen.var(),
    "Standard Deviation": tumor_volume_by_regimen.std(),
    "SEM": tumor_volume_by_regimen.sem(),
})

# rename_axis returns a new frame, so the blank index name applies to the displayed
# table only; `summaryStat` itself is left unchanged.
summaryStat.rename_axis("")



In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby/agg call.
# Output column names are given through named aggregation; the mapping is unpacked
# with ** because "Standard Deviation" is not a valid Python keyword name.
tumor_column = "Tumor Volume (mm3)"
summary_aggregations = {
    "Mean": pd.NamedAgg(column=tumor_column, aggfunc="mean"),
    "Median": pd.NamedAgg(column=tumor_column, aggfunc="median"),
    "Variance": pd.NamedAgg(column=tumor_column, aggfunc="var"),
    "Standard Deviation": pd.NamedAgg(column=tumor_column, aggfunc="std"),
    "SEM": pd.NamedAgg(column=tumor_column, aggfunc="sem"),
}
combined_data_df_no_dup.groupby(["Drug Regimen"]).agg(**summary_aggregations)


## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting API) of the number of "Mouse ID" entries recorded for each drug regimen.
x_axis = np.arange(len(stat1["Mouse ID"]))

# Count once and reuse the result for both the bars and the y-axis limit
regimen_counts = stat1["Mouse ID"].count()
total_mice_plot = regimen_counts.plot.bar(figsize=(6, 3))

# Title and axis labels
total_mice_plot.set_title("Total Mices Vs Drug Regimen")
total_mice_plot.set_xlabel("Drug Regimen")
total_mice_plot.set_ylabel("Number of Mices")

# Pad the x range around the bars and leave headroom above the tallest bar
total_mice_plot.set_xlim(-0.75, len(x_axis) - .25)
total_mice_plot.set_ylim(0, regimen_counts.max() + 10)
plt.tight_layout()

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

# Count the "Mouse ID" entries per regimen once; the same Series supplies the bar
# heights, the tick labels and the y-axis limit.
regimen_counts = stat1["Mouse ID"].count()

# Set x axis and tick locations: one integer position per regimen
x_axis = np.arange(len(regimen_counts))
plt.bar(x_axis, regimen_counts, color='r', alpha=0.5, align="center")

# Tell matplotlib where to place each x axis header. The labels are taken from the
# index of the plotted counts so that label order always matches bar order.
plt.xticks(x_axis, regimen_counts.index, rotation="vertical")

plt.title("Total Mices Vs Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mices")

# Pad the x range around the bars and leave headroom above the tallest bar
plt.xlim(-0.75, len(x_axis) - .25)
plt.ylim(0, regimen_counts.max() + 10)
plt.tight_layout()
# Call show(); the original referenced the function without calling it.
plt.show()

In [None]:
# Pie plot (pandas plotting API) of the distribution of female versus male mice.
# `mice_group` and `mice_sex_pie` are referenced again by the pyplot pie cell.
mice_group = combined_data_df_no_dup.groupby(["Sex"])
sex_counts = mice_group["Mouse ID"].count()

mice_sex_pie = sex_counts.plot.pie(
    autopct="%1.1f%%",
    title="Female Vs Male Mice DIstribution",
    legend=True,
)
# Clear the y-axis label on the pie's axes
mice_sex_pie.set_ylabel("")
# Replace the legend with one labelled by the per-group "Sex" value
plt.legend(mice_group["Sex"].max(), loc="best")
plt.tight_layout()
# Equal axis scaling so the pie is drawn as a circle
plt.axis("equal")
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Number of "Mouse ID" entries per sex; the index carries the sex labels.
sex_counts = mice_group["Mouse ID"].count()

# Label the wedges straight from the index of the counts so labels and values cannot
# get out of step. The original also called mice_sex_pie.set_ylabel(""), which acted
# on the Axes of the earlier pandas pie rather than on this figure, so it is dropped.
plt.pie(sex_counts, labels=sex_counts.index, autopct="%1.1f%%")
# With labelled wedges the legend needs no separately computed label list
plt.legend(loc="best")
plt.title("Female Vs Male Mice DIstribution")
plt.tight_layout()
# Equal axis scaling so the pie is drawn as a circle
plt.axis("equal")
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
# Step 1: keep only the rows for those four regimens, group by Drug Regimen and Mouse ID,
# and take the maximum Timepoint of each group. The result is a Series named "Timepoint"
# whose index levels are (Drug Regimen, Mouse ID).
finalTimePointPerMouseDrug = combined_data_df_no_dup[combined_data_df_no_dup["Drug Regimen"].isin(["Capomulin","Ramicane","Infubinol","Ceftamin"])].groupby(["Drug Regimen","Mouse ID"])["Timepoint"].max()

# Bare expression left for inspection; it has no effect on the variables below.
finalTimePointPerMouseDrug
# Step 2: merge the per-mouse maximum timepoints with the clean dataframe to get the tumor
# volume at the last timepoint. The keys are matched against the Series' two index levels
# plus its "Timepoint" values; with pandas' default inner join only rows matching all
# three keys are kept.
lastTimePoint = pd.merge(combined_data_df_no_dup,finalTimePointPerMouseDrug,on=["Drug Regimen","Mouse ID","Timepoint"])
lastTimePoint

In [33]:
# Regimen names present in the final-timepoint table (intended for a loop and for plot labels)
treatmentList = lastTimePoint["Drug Regimen"].unique().tolist()

# Final tumor volumes, one Series per regimen of interest
regimen_of_row = lastTimePoint["Drug Regimen"]
cap = lastTimePoint.loc[regimen_of_row == "Capomulin", "Tumor Volume (mm3)"]
ram = lastTimePoint.loc[regimen_of_row == "Ramicane", "Tumor Volume (mm3)"]
inf = lastTimePoint.loc[regimen_of_row == "Infubinol", "Tumor Volume (mm3)"]
cef = lastTimePoint.loc[regimen_of_row == "Ceftamin", "Tumor Volume (mm3)"]

# Quartiles and interquartile range; only the Capomulin volumes are analysed here
cquartiles = cap.quantile([.25, .5, .75])
lowerq = cquartiles.loc[0.25]
upperq = cquartiles.loc[0.75]
iqr = upperq - lowerq

print(f"The lower quartile of tumor volumes is: {lowerq}")
print(f"The upper quartile of tumor volumes is: {upperq}")
print(f"The interquartile range of tumor volumess is: {iqr}")
print(f"The the median of tumor volumes is: {cquartiles[0.5]} ")

# Values more than 1.5 * IQR outside the quartiles are treated as potential outliers
outlier_margin = 1.5 * iqr
lower_bound = lowerq - outlier_margin
upper_bound = upperq + outlier_margin
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")



# Create empty list to fill with tumor vol data (for plotting)
#tumorVolData = [[cap],[ram],[inf],[cef]]

#tumorVolData
#for drug in lastTimePoint:
#    print(drug["Drug Rergimen"])



# Calculate the IQR and quantitatively determine if there are any potential outliers. 
#quartiles = tumorVolData.quantile([.25,.5,.75])

#lowerq = quartiles[0.25]
#upperq = quartiles[0.75]
#iqr = upperq-lowerq

# Locate the rows which contain mice on each drug and get the tumor volumes
    
    
# add subset 
    
    
# Determine outliers using upper and lower bounds


#print(f"The lower quartile of tumor volumes is: {lowerq}")
#print(f"The upper quartile of tumor volumes is: {upperq}")
#print(f"The interquartile range of tumor volumess is: {iqr}")
#print(f"The the median of tumor volumes is: {quartiles[0.5]} ")

#lower_bound = lowerq - (1.5*iqr)
#upper_bound = upperq + (1.5*iqr)
#print(f"Values below {lower_bound} could be outliers.")
#print(f"Values above {upper_bound} could be outliers.")

    
    
    

The lower quartile of tumor volumes is: 32.37735684
The upper quartile of tumor volumes is: 40.1592203
The interquartile range of tumor volumess is: 7.781863460000004
The the median of tumor volumes is: 38.125164399999996 
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# The regimens to compare, in the order the boxes are drawn
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Build the plotting data here: `tumorVolData` was never assigned anywhere in the
# notebook (its only definition is commented out), so the original cell raised NameError.
# One array of final tumor volumes per regimen, taken from the last-timepoint table.
tumorVolData = [
    lastTimePoint.loc[
        lastTimePoint["Drug Regimen"] == regimen, "Tumor Volume (mm3)"
    ].to_numpy()
    for regimen in regimens_of_interest
]

fig1, ax1 = plt.subplots()
ax1.boxplot(tumorVolData)
# Name each box after its regimen; boxplot places the boxes at positions 1..N in list order
ax1.set_xticklabels(regimens_of_interest)
ax1.set_title("Final Tumor Volume by Drug Regimen")
ax1.set_xlabel("Drug Regimen")
ax1.set_ylabel("Final Tumor Volume (mm3)")
plt.show()


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
