In [None]:
#Observations and Insights
#Based on observation, mouse weight is correlated with tumor size: as mouse weight increases, so does the size of the tumor.
#Capomulin appears to be effective in reducing the size of tumors.
#Capomulin and Ramicane appear to be similarly effective, and better choices than Infubinol and Ceftamin for treating tumors, based on average tumor size.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Relative paths of the two input CSV files.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the study measurements.
mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata_path, study_results_path)
)

# Left join: every study-result row is kept and gains the metadata of its mouse.
study_data_complete = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Show the merged table.
study_data_complete

In [None]:
# Number of distinct values in the 'Mouse ID' column; dropna=False keeps a
# missing ID (if any) counted as a value, exactly like len(unique()).
mouse_count = study_data_complete["Mouse ID"].nunique(dropna=False)
mouse_count

In [None]:
# Count the non-null 'Timepoint' entries for every mouse and list the mice
# with the most entries first.
mouse_count_df = (
    study_data_complete
    .groupby('Mouse ID')[['Timepoint']]
    .count()
    .sort_values(by='Timepoint', ascending=False)
)
mouse_count_df.head()

In [None]:
# Flag every row whose (Mouse ID, Timepoint) pair occurs more than once;
# keep=False marks all copies, not only the later ones.
is_repeated = study_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
study_data_complete[is_repeated]

In [None]:
# Discard every row that shares its (Mouse ID, Timepoint) pair with another row.
# NOTE(review): only the repeated rows are removed; any other rows belonging to
# the same mouse remain in the data - confirm that this is the intended cleaning.
repeated_rows = study_data_complete.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
study_data_complete = study_data_complete[~repeated_rows]
study_data_complete

In [None]:
#Summary Statistics
# Tumor-volume mean, median, variance, standard deviation and standard error
# of the mean for each drug regimen.
# The aggregations are named by string instead of passing np.mean / np.var /
# np.std / st.sem: pandas deprecated NumPy callables in agg(), and calling them
# directly would switch variance and std from ddof=1 to NumPy's ddof=0. The
# string forms pin the sample (ddof=1) statistics.
study_data_complete_summary = (
    study_data_complete
    .groupby('Drug Regimen')['Tumor Volume (mm3)']
    .agg(['mean', 'median', 'var', 'std', 'sem'])
)
# Replace the aggregator names with descriptive column headers (same order).
study_data_complete_summary.columns = ['Mean Tumor Volume','Median Tumor Volume', 'Tumor Volume Variance', 'Tumor Volume Std Dev','Tumor Volume SEM']
study_data_complete_summary

In [None]:
# One row per distinct (Drug Regimen, Mouse ID) pair; the first occurrence of
# each pair is the one that is kept.
drug_table = study_data_complete.loc[:, ['Drug Regimen', 'Mouse ID']].drop_duplicates()

In [None]:
# Preview the current contents of drug_table.
# The groupby/agg that used to run here was computed and immediately thrown
# away (its result was never assigned or displayed), so it has been removed
# together with the commented-out set_index line.
drug_table

In [None]:
# Number of distinct mice per drug regimen, as a two-column frame
# ('Drug Regimen', 'Mouse Count') sorted by regimen name.
# Derived directly from study_data_complete rather than by re-aggregating
# drug_table: the earlier form overwrote drug_table with its own aggregate, so
# running the cell a second time failed because 'Mouse ID' no longer existed.
drug_table = (
    study_data_complete
    .groupby('Drug Regimen')['Mouse ID']
    .nunique()
    .reset_index(name='Mouse Count')
)
drug_table

In [None]:
# Largest per-regimen mouse count, i.e. the height of the tallest bar.
maxaxis = drug_table.loc[:, 'Mouse Count'].max()
# Upper y-axis limit: the tallest bar plus 5 units of headroom.
topax = 5 + maxaxis

In [None]:
#Bar Plots: pandas.DataFrame.plot()
# DataFrame.plot returns the matplotlib Axes, so it is configured directly
# (the previous extra `.axes` hop was redundant).
bar_ax = drug_table.plot(kind='bar', x='Drug Regimen', y='Mouse Count', title='Mice Count By Drug Regimen')
# Reuse topax (defined as maxaxis + 5) instead of recomputing the same limit here.
bar_ax.set_ylim(0, topax)
# Label the y axis so the figure can be read on its own.
bar_ax.set_ylabel('Mouse Count')
plt.show()

In [None]:
#Bar Plots: matplotlib.pyplot()
# Same chart as the pandas version, drawn through an explicit Axes object.
bar_fig, bar_axes = plt.subplots()
bar_axes.bar(drug_table['Drug Regimen'], drug_table['Mouse Count'])
bar_axes.set_title('Mice Count By Drug Regimen')
# Turn the regimen names vertical so they do not overlap.
bar_axes.tick_params(axis='x', labelrotation=90)
# Only the upper limit is set; the lower limit is left to matplotlib.
bar_axes.set_ylim(top=topax)
plt.show()

In [None]:
# Count the distinct (Mouse ID, Sex) pairs per sex and order the result from
# the most to the least numerous sex.
mouse_gender = (
    study_data_complete[['Mouse ID', 'Sex']]
    .drop_duplicates()
    .groupby('Sex')[['Mouse ID']]
    .count()
    .rename(columns={'Mouse ID': 'Mouse Count'})
    .sort_values(by='Mouse Count', ascending=False)
)
mouse_gender

In [None]:
#Pie Plots: pandas.DataFrame.plot()
# Wedge colours, applied in the row order of mouse_gender.
colors = ['#1c72e9','#ffe4e1']
gender_pie_ax = mouse_gender.plot.pie(
    y='Mouse Count',
    autopct='%1.1f%%',
    colors=colors,
    legend=False,
    title='Mouse Distribution by Gender',
)
# Replace the default y label (the column name) with 'Sex'.
gender_pie_ax.set_ylabel('Sex')
plt.show()

In [None]:
#Pie Plots: matplotlib.pyplot()
# Same distribution as the pandas pie, drawn through an explicit Axes object.
pie_fig, pie_axes = plt.subplots()
pie_axes.pie(
    mouse_gender['Mouse Count'],
    labels=mouse_gender.index,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
)
pie_axes.set_title('Mouse Distribution by Gender')
pie_axes.set_ylabel('Sex')
plt.show()

In [None]:
# Latest recorded timepoint for every mouse: a one-column frame ('Timepoint')
# indexed by 'Mouse ID'.
MaxTime = study_data_complete.groupby('Mouse ID')['Timepoint'].max().to_frame()
MaxTime

In [None]:
# Pull the full study row that matches each mouse's latest timepoint.
# MaxTime is the left side of the left join, so every (mouse, last timepoint)
# pair in it is kept.
study_data_complete_end = MaxTime.merge(
    study_data_complete,
    how="left",
    on=["Mouse ID", 'Timepoint'],
)
study_data_complete_end

In [None]:
# The four regimens singled out for comparison; the tuple order is kept as given.
successful_drugs = (
    'Capomulin',
    'Ramicane',
    'Infubinol',
    'Ceftamin',
)
# One-element list holding the Capomulin regimen name on its own.
capomulin = ['Capomulin']
print(successful_drugs)

In [None]:
# Keep only the rows whose regimen is one of successful_drugs, then order them
# by ascending timepoint.
in_comparison = study_data_complete_end['Drug Regimen'].isin(successful_drugs)
study_data_complete_end = study_data_complete_end.loc[in_comparison].sort_values('Timepoint')
study_data_complete_end

In [None]:
# Tumor-volume column of study_data_complete_end, used for the quartile analysis.
tumors = study_data_complete_end.loc[:, 'Tumor Volume (mm3)']

In [None]:
#Quartiles, IQR and Outliers
# 25th, 50th and 75th percentiles of the tumor volumes, indexed by the quantile.
quartiles = tumors.quantile([.25,.5,.75])
lowerq, upperq = quartiles.loc[0.25], quartiles.loc[0.75]
# Interquartile range: the spread of the middle half of the values.
iqr = upperq - lowerq

In [None]:
#Quartiles, IQR and Outliers
# Report the quartiles computed from `tumors`.
print(f"The lower quartile of Tumor Volume is: {lowerq}")
print(f"The upper quartile of Tumor Volume is: {upperq}")
print(f"The interquartile range of Tumor Volume is: {iqr}")
# Message corrected: it previously read "The the median ..." with a trailing space.
print(f"The median of Tumor Volume is: {quartiles[0.5]}")

# Outlier fences: 1.5 * IQR below the lower quartile and above the upper quartile.
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

# Apply the fences so the cell reports how many values actually fall outside
# them, instead of only printing the thresholds.
# NOTE(review): the fences are computed on all values in `tumors` together, not
# per drug regimen - confirm that a pooled check is what is wanted.
outliers = tumors[(tumors < lower_bound) | (tumors > upper_bound)]
print(f"Number of potential outliers found: {len(outliers)}")

In [None]:
# Four independent empty lists, one per regimen.
# Placeholders only; presumably filled with real values later in the notebook.
Ceftamine_tumor_volume, Capomulin_tumor_volume = [], []
Infubinol_tumor_volume, Ramicane_tumor_volume = [], []

In [None]:
# Tumor volumes from study_data_complete_end, split by drug regimen.
# The series are keyed by regimen name and built from successful_drugs, so
# plot_list always follows the order of that tuple. The order used to be typed
# out by hand, separately from successful_drugs, and could drift out of step.
volume_by_regimen = {
    regimen: study_data_complete_end.loc[
        study_data_complete_end['Drug Regimen'] == regimen, 'Tumor Volume (mm3)'
    ]
    for regimen in successful_drugs
}

# Per-regimen names kept for any cell that refers to them directly.
Ceftamine_tumor_volume = volume_by_regimen['Ceftamin']
Capomulin_tumor_volume = volume_by_regimen['Capomulin']
Infubinol_tumor_volume = volume_by_regimen['Infubinol']
Ramicane_tumor_volume = volume_by_regimen['Ramicane']

# One series per regimen, in successful_drugs order.
plot_list = [volume_by_regimen[regimen] for regimen in successful_drugs]

In [None]:
#Box Plots
# One box per entry of plot_list, labelled with the names in successful_drugs.
fig, ax = plt.subplots()
ax.set_title('Tumor Volume for Drug Regimen')
# Outlier markers are purple squares; the old name `green_diamond` described
# neither the colour nor the shape.
flier_style = dict(markerfacecolor='#800080', marker='s')
ax.boxplot(plot_list, flierprops=flier_style)
ax.set_xticklabels(successful_drugs)
# Axis labels so the figure can be read without the surrounding cells.
ax.set_xlabel('Drug Regimen')
ax.set_ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Mouse whose measurements are examined on their own.
individual_mouse = ['s185']
# All study rows for that mouse, indexed by 'Mouse ID'.
is_selected_mouse = study_data_complete['Mouse ID'].isin(individual_mouse)
individual_mouse_study = study_data_complete.loc[is_selected_mouse].set_index('Mouse ID')
individual_mouse_study

In [None]:
#Line Plots
# Tumor volume over time for the single mouse held in individual_mouse_study.
# Columns are selected by name rather than by position (iloc 0 / 1), so the plot
# no longer depends on the column order of the merged table; rows are ordered
# by timepoint so the line is always drawn left to right.
mouse_timeline = individual_mouse_study.sort_values('Timepoint')
plt.plot(mouse_timeline['Timepoint'], mouse_timeline['Tumor Volume (mm3)'])
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
# Title corrected: the regimen name was misspelled as "Capumolin".
plt.title('Capomulin Results for Mouse s185')
plt.show()

In [None]:
# Mean weight and mean tumor volume per mouse, restricted to the regimens
# listed in `capomulin`.
# 'mean' is passed as a string: handing np.mean to agg() is deprecated in
# pandas and emits a FutureWarning.
study_data_complete_cap = (
    study_data_complete.loc[
        study_data_complete['Drug Regimen'].isin(capomulin),
        ['Mouse ID', 'Drug Regimen', 'Weight (g)', 'Tumor Volume (mm3)'],
    ]
    .groupby(['Mouse ID', 'Drug Regimen'], as_index=False)
    .agg({'Weight (g)': 'mean', 'Tumor Volume (mm3)': 'mean'})
)
study_data_complete_cap

In [None]:
# x and y series for the scatter plot and regression, picked by column name
# (these are the third and fourth columns of study_data_complete_cap).
weight_scatter = study_data_complete_cap['Weight (g)']
tumor_scatter = study_data_complete_cap['Tumor Volume (mm3)']

In [None]:
#Scatter Plots
# Average tumor volume against mouse weight, one point per mouse.
scatter_fig, scatter_axes = plt.subplots()
scatter_axes.scatter(weight_scatter, tumor_scatter)
scatter_axes.set_xlabel('Mouse Weight (g)')
scatter_axes.set_ylabel('Tumor Volume (mm3)')
scatter_axes.set_title('Tumor Volume Average vs Mouse Weight')
plt.show()

In [None]:
#Correlation and Regression
# pearsonr returns the correlation coefficient first and the p-value second.
correlation = st.pearsonr(weight_scatter, tumor_scatter)
pearson_r = correlation[0]
print(f"The correlation between both factors is {round(pearson_r,2)}")

In [None]:
#Correlation and Regression
# Least-squares line of average tumor volume on mouse weight.
(slope, intercept, rvalue, pvalue, stderr) = linregress(weight_scatter, tumor_scatter)
# Fitted tumor volume for every observed weight.
regress_values = weight_scatter * slope + intercept
# Equation text; the sign is handled explicitly so a negative intercept reads
# "- 1.23" rather than "+ -1.23".
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {slope:.2f}x {intercept_sign} {abs(intercept):.2f}"
plt.scatter(weight_scatter,tumor_scatter)
plt.plot(weight_scatter,regress_values,"r-")
# line_eq used to be built and never shown; place it in the upper-left corner
# of the axes (axes-fraction coordinates keep it inside the plot for any data).
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords='axes fraction', color='red', fontsize=12)
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Tumor Volume (mm3) Avg')
plt.title('Tumor Volume Average vs Mouse Weight w/ Regression')
plt.show()