## Observations and Insights 

In [64]:
# Dependencies and Setup
%matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

pd.options.display.precision = 2

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [65]:
# Combine the data into a single dataset
combine_df = pd.merge(mouse_metadata,study_results,on="Mouse ID", how = "right")


In [66]:
# Display the data table for preview
combine_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.83,0
2,k403,Ramicane,Male,21,16,10,35.01,1
3,k403,Ramicane,Male,21,16,15,34.22,1
4,k403,Ramicane,Male,21,16,20,33.0,1


In [67]:
# Checking the number of mice.
len(combine_df["Mouse ID"].unique())

249

In [68]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_df = combine_df.duplicated(subset=["Mouse ID", "Timepoint"],keep=False)
duplicate_index = duplicate_df.index[duplicate_df]


In [69]:
# Optional: Get all the data for the duplicate mouse ID. 
combine_df.loc[duplicate_index,:]["Mouse ID"].unique()


array(['g989'], dtype=object)

In [70]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = combine_df[combine_df["Mouse ID"]!= "g989"]


In [71]:
# Checking the number of mice in the clean DataFrame.
len(clean_df["Mouse ID"].unique())

248

## Summary Statistics

In [72]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_regimen_list = clean_df["Drug Regimen"].unique()

summary_table = {"Regimen":drug_regimen_list,"Mean":[],"Median":[],"SD":[],"SEM":[]}

for each in drug_regimen_list:
    summary_table["Mean"].append(np.mean(clean_df.loc[clean_df["Drug Regimen"] == each,"Tumor Volume (mm3)"]))
    summary_table["Median"].append(np.median(clean_df.loc[clean_df["Drug Regimen"] == each,"Tumor Volume (mm3)"]))
    summary_table["SD"].append(np.std(clean_df.loc[clean_df["Drug Regimen"] == each,"Tumor Volume (mm3)"],ddof = 0))
    summary_table["SEM"].append(st.sem(clean_df.loc[clean_df["Drug Regimen"] == each,"Tumor Volume (mm3)"]))

    
summary_df = pd.DataFrame.from_dict(summary_table)

summary_df.head()


Unnamed: 0,Regimen,Mean,Median,SD,SEM
0,Ramicane,40.22,40.67,4.84,0.32
1,Capomulin,40.68,41.56,4.98,0.33
2,Infubinol,52.88,51.82,6.55,0.49
3,Placebo,54.03,52.29,7.8,0.58
4,Ceftamin,52.59,51.78,6.25,0.47


## Bar and Pie Charts

In [73]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
study_mice_df = clean_df[["Mouse ID","Drug Regimen","Sex"]].drop_duplicates(subset="Mouse ID")

fig1, ax1 = plt.subplots()
ax1 = study_mice_df.groupby("Drug Regimen").count()["Mouse ID"].plot(kind="bar", title = "Total Number of Mice for each Treatment(pandas)",xlim=(-0.25,len(drug_regimen_list)+0.25))
ax1.set_ylabel("Number")
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [74]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_list = study_mice_df.groupby("Drug Regimen").count().index
x_range = np.arange(len(x_list))
y_total =  study_mice_df.groupby("Drug Regimen").count()["Mouse ID"]

fig2, ax2 =plt.subplots()
ax2.bar(x_range,y_total, width = 0.5)
ax2.set_xticks(x_range)
ax2.set_xticklabels(x_list, rotation=90)
ax2.set_xlabel("Drug Regimen")
ax2.set_ylabel("Number")
ax2.set_xlim(-0.75,len(drug_regimen_list)-0.25)
ax2.set_title("Total Number of Mice for each Treatment(pyplot)")
plt.tight_layout()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [75]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

ax3 = study_mice_df.groupby("Sex").count().plot(y = "Mouse ID",kind = "pie", startangle = 90, autopct='%1.1f%%', title="Distribution of female vs male" )
ax3.set_ylabel("")
plt.tight_layout()
#members = [49, 92, 84, 53]
#x_axis = np.arange(0, len(gyms))
#colors = ["yellowgreen", "red", "lightcoral", "lightskyblue"]
#explode = (0, 0.05, 0, 0)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [76]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
sex = ["Female","Male"]
sex_numbers = [study_mice_df[study_mice_df["Sex"] == "Female"].count()["Mouse ID"],study_mice_df[study_mice_df["Sex"] == "Male"].count()["Mouse ID"]]
colors = ["blue","orange"]
plt.figure()
plt.pie(sex_numbers,colors=colors, startangle = 90, autopct='%1.1f%%',labels=sex)
plt.legend()
plt.title("Distribution of female vs male")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Distribution of female vs male')

## Quartiles, Outliers and Boxplots

In [77]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  

# Capomulin, Ramicane, Infubinol, and Ceftamin
treatment_list = clean_df["Drug Regimen"].unique()
max_timepoint = {}
treatment_df_list = {}

for treatment in treatment_list:
    treatment_df_list[treatment]  = clean_df[clean_df["Drug Regimen"] == treatment]
    for mice in treatment_df_list[treatment]["Mouse ID"].unique():
        max_timepoint[mice] = treatment_df_list[treatment][treatment_df_list[treatment]["Mouse ID"] == mice].max()["Timepoint"]

max_timepoint
    

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
volume_dict ={"Mouse ID":list(max_timepoint.keys()),"Last Volume":[]}
for key in max_timepoint.keys():
    volume_dict["Last Volume"].append(clean_df.loc[(clean_df["Mouse ID"] == key) & (clean_df["Timepoint"] == max_timepoint[key]),"Tumor Volume (mm3)"].values[0])
    
volume_df = pd.DataFrame.from_dict(volume_dict)

clean_last_volume_df = pd.merge(clean_df,volume_df,on="Mouse ID",how ="left")
last_volume_df  = clean_last_volume_df[["Mouse ID","Drug Regimen","Last Volume"]].drop_duplicates()

In [78]:

# Put treatments into a list for for loop (and later for plot labels)

treatment_list = ["Capomulin","Ramicane","Infubinol","Ceftamin"]
# Create empty list to fill with tumor vol data (for plotting)
Capomulin_volume = last_volume_df.loc[last_volume_df["Drug Regimen"]=="Capomulin","Last Volume"]
Ramicane_volume = last_volume_df.loc[last_volume_df["Drug Regimen"]=="Ramicane","Last Volume"]
Infubinol_volume = last_volume_df.loc[last_volume_df["Drug Regimen"]=="Infubinol","Last Volume"]
Ceftamin_volume =  last_volume_df.loc[last_volume_df["Drug Regimen"]=="Ceftamin","Last Volume"]

four_data = [Capomulin_volume,Ramicane_volume,Infubinol_volume,Ceftamin_volume]

lower_quartile = []
higher_quartile = []
IQR = []
outlier_series = []
# Calculate the IQR and quantitatively determine if there are any potential outliers. 

for each_df in four_data: 
    quartiles = each_df.quantile([0.25,0.5,0.75])
    lower_quartile.append(quartiles[0.25])
    higher_quartile.append(quartiles[0.75])
    IQR.append(quartiles[0.75] - quartiles[0.25])
    outlier_series.append(each_df[(each_df <quartiles[0.25] -1.5*(quartiles[0.75] - quartiles[0.25]))|(each_df >quartiles[0.75] +1.5*(quartiles[0.75] - quartiles[0.25]))])
    


print(outlier_series)
print('--------------------')
print('Infubinol has an outlier of 36.32')



[Series([], Name: Last Volume, dtype: float64), Series([], Name: Last Volume, dtype: float64), 668    36.32
Name: Last Volume, dtype: float64, Series([], Name: Last Volume, dtype: float64)]
--------------------
Infubinol has an outlier of 36.32


In [84]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig3, ax3 = plt.subplots()
ax3.boxplot(four_data)
ax3.set_title("Final tumor volume of each mouse across four regimens of interest")
ax3.set_xticklabels(treatment_list)
ax3.set_xlabel("Regimen")
ax3.set_ylabel("Final Volume(mm3)")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Line and Scatter Plots

In [113]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

capomulin_df = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin",["Mouse ID","Timepoint","Tumor Volume (mm3)"]]
capomulin_df.set_index("Timepoint", inplace = True)
ax5 = capomulin_df.plot(subplots=True,figsize=(12,12),style = '^-',title ="Movement of tumor volume as per time point")
plt.ylabel("Tumor Volume (mm3)")
plt.tight_layout()

  fig = plt.figure(**fig_kw)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [124]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

weight_df = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin",["Mouse ID","Tumor Volume (mm3)","Weight (g)"]]

fig6, ax6 = plt.subplots()

x=weight_df.groupby("Mouse ID").mean()["Weight (g)"]
y=weight_df.groupby("Mouse ID").mean()["Tumor Volume (mm3)"]
ax6.scatter(x=x, y=y)
(slope, intercept,r_value,p_value,std_err) = st.linregress(x,y)
         
ax6.plot(x,slope*x +intercept)
ax6.text(17,slope*17+intercept-1,'y='+str(round(slope,4)) + 'x +' +str(round(intercept,4)) + "     R2 =" +str(round(r_value*r_value,4)))
ax6.set_title("Relation of mouse weight and average tumor volume")
ax6.set_xlabel("Weight (g)")
ax6.set_ylabel("Average Tumor Volume(mm3)")
plt.tight_layout()

  """


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Correlation and Regression

In [128]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
weight_df.groupby("Mouse ID").mean().corr()

print("Correlation = 0.84")
weight_df.groupby("Mouse ID").mean().corr()


Correlation = 0.84


Unnamed: 0,Tumor Volume (mm3),Weight (g)
Tumor Volume (mm3),1.0,0.84
Weight (g),0.84,1.0
