## Pymaceuticals 

In [45]:
# Dependencies and Setup
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress


# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merge_df = pd.merge(mouse_metadata, study_results, how="inner", on="Mouse ID")
# Display the data table for preview
merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [46]:
# Checking the number of mice.
mice_count = merge_df["Mouse ID"].count()
mice_count

1893

In [47]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dup_mice = merge_df[merge_df.duplicated(["Mouse ID", "Timepoint"])]
dup_mice


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [48]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# clean_df = merge_df.drop_duplicates(["Mouse ID", "Timepoint"])
clean_df = merge_df[merge_df["Mouse ID"] != 'g989']
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [49]:
# Checking the number of mice in the clean DataFrame.
# 248 unique mice, 1880 is count of rows in the clean df.
mice_num = clean_df["Mouse ID"].unique()
mice_count = len(mice_num)
mice_count

248

## Summary Statistics

In [50]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
mean = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].mean().map('{:.2f}'.format)
median = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].median().map('{:.2f}'.format)
var = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].var().map('{:.2f}'.format)
stddev = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].std().map('{:.2f}'.format)
sem = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].sem().map('{:.2f}'.format)

# Assemble the resulting series into a single summary dataframe.
summary_df = pd.DataFrame({"Mean": mean, "Median": median, "Variance": var, "Standard Deviation": stddev, "SEM": sem})
summary_df


Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.33
Ceftamin,52.59,51.78,39.29,6.27,0.47
Infubinol,52.88,51.82,43.13,6.57,0.49
Ketapril,55.24,53.7,68.55,8.28,0.6
Naftisol,54.33,52.51,66.17,8.13,0.6
Placebo,54.03,52.29,61.17,7.82,0.58
Propriva,52.32,50.45,43.85,6.62,0.54
Ramicane,40.22,40.67,23.49,4.85,0.32
Stelasyn,54.23,52.43,59.45,7.71,0.57
Zoniferol,53.24,51.82,48.53,6.97,0.52


In [31]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line
# summary_df = summary_df.agg({"Drug Regimen": [mean, median, var, stddev, sem], "Tumor Volume (mm3)": [mean, median, var, stddev, sem]})
# summary_df

## Bar and Pie Charts

In [103]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
grouped_drug = clean_df.groupby("Drug Regimen")
drug_count = pd.DataFrame(grouped_drug["Drug Regimen"].count())
drug_bar = drug_count.plot(kind='bar', color="green", figsize=(7,4))
plt.title("Total # of Measurements for Each Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("# of Measurements")
plt.tight_layout()
plt.show()


<IPython.core.display.Javascript object>

In [139]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
x_axis = np.arange(len(drug_count))
tick_loc = [value for value in x_axis]
plt.figure(figsize=(7,4))
plt.bar(x_axis, drug_count["Drug Regimen"], color='g', alpha=1, align="center", width=0.5)
plt.xticks(tick_loc, list(drug_count.index), rotation='vertical')
plt.xlim(-0.5, len(x_axis)-0.5)
plt.ylim(0, max(drug_count["Drug Regimen"])+10)
plt.title("Total # of Measurements for Each Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("# of Measurements")
drug_reg = ["Drug Regimen"]
plt.legend(drug_reg, loc="best")
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [100]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
gender_group = clean_df.groupby('Sex')
# gender_count = pd.DataFrame(gender_group['Mouse ID'].nunique())
gender_count = pd.DataFrame(gender_group['Sex'].count())
# gender_count.head()

gender_count.plot(kind='pie', figsize=(7,4), y='Sex', title="Distribution of Female versus Male Mice", 
                startangle=160, autopct='%1.1f%%', shadow=True, colors=["pink", "lightblue"], legend=False)

plt.axis('equal')
plt.show()

<IPython.core.display.Javascript object>

In [102]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gender_labels =  list(gender_count.index.values)
gender_values = gender_count['Sex']
colors = ["pink", "lightblue"]

plt.figure(figsize=(7,4))
plt.pie(gender_values, labels=gender_labels, colors=colors, startangle=160, autopct='%1.1f%%',
       shadow=True)

plt.title("Distribution of Female versus Male Mice")
plt.ylabel("Sex")
plt.axis('equal')
plt.show()


<IPython.core.display.Javascript object>

## Quartiles, Outliers and Boxplots

In [111]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# New sub-df with only the necessary drugs in it
four_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
four = clean_df[clean_df["Drug Regimen"].isin(four_list)]
# Group sub-df by Drug and Mouse ID then agg. 
# In new columm, place final tumor vol by using x.iloc[-1] lambda function from stack overflow
final_tum_vol = four.groupby(["Drug Regimen", "Mouse ID"]).agg(Final_Tumor_Vol=("Tumor Volume (mm3)", lambda x: x.iloc[-1]))
final_tum_vol

Unnamed: 0_level_0,Unnamed: 1_level_0,Final_Tumor_Vol
Drug Regimen,Mouse ID,Unnamed: 2_level_1
Capomulin,b128,38.982878
Capomulin,b742,38.939633
Capomulin,f966,30.485985
Capomulin,g288,37.074024
Capomulin,g316,40.159220
...,...,...
Ramicane,s508,30.276232
Ramicane,u196,40.667713
Ramicane,w678,43.166373
Ramicane,y449,44.183451


In [115]:
# Now that new df is made, put Drug Reg and Mouse ID back in as columns
final_df = final_tum_vol.stack(level=0).unstack(level=0)
#final_df.head()

# Calculate the IQR and quantitatively determine if there are any potential outliers. 

# to set up drug IQR formatting
counter = 0

# begin for loop to loop through drugs and do IQR calcs
for x in four_list:
    quartiles = final_df[x].quantile([.25, .5, .75]).round(4)
    lowerq = quartiles[.25].round(4)
    upperq = quartiles[.75].round(4)
    iqr = round((upperq-lowerq),4)
    lower_bound = round(lowerq - (1.5*iqr),4)
    upper_bound = round(upperq + (1.5*iqr),4)
    
    # print data before for loop restarts and add an an extra space at the bottom
    if counter == 0:
        print("")
    print(f"The IQR data for {x} is:")
    print(f"The lower quartile of {x} is: {lowerq}")
    print(f"The upper quartile of {x} is: {upperq}")
    print(f"The interquartile range of {x} is: {iqr}")
    print(f"The the median of {x} is: {quartiles[0.5]}")
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")
    print(f"")
    counter += 1



The IQR data for Capomulin is:
The lower quartile of Capomulin is: 32.3774
The upper quartile of Capomulin is: 40.1592
The interquartile range of Capomulin is: 7.7818
The the median of Capomulin is: 38.1252
Values below 20.7047 could be outliers.
Values above 51.8319 could be outliers.

The IQR data for Ramicane is:
The lower quartile of Ramicane is: 31.5605
The upper quartile of Ramicane is: 40.659
The interquartile range of Ramicane is: 9.0985
The the median of Ramicane is: 36.5617
Values below 17.9128 could be outliers.
Values above 54.3067 could be outliers.

The IQR data for Infubinol is:
The lower quartile of Infubinol is: 54.0486
The upper quartile of Infubinol is: 65.5257
The interquartile range of Infubinol is: 11.4771
The the median of Infubinol is: 60.1652
Values below 36.8329 could be outliers.
Values above 82.7414 could be outliers.

The IQR data for Ceftamin is:
The lower quartile of Ceftamin is: 48.7221
The upper quartile of Ceftamin is: 64.2998
The interquartile range 

In [123]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# Empty list for box plot
box_list = []

# Enter new for loop to append the box plot list with tumor vols for each drug
# Drop NaN values so data is not skewed.
for x in four_list:
    box_list.append(list(final_df[x].dropna()))
    
# format the box plot and style it with color.

flierprops = dict(marker='o', markerfacecolor='y', markersize=5, markeredgecolor='black')

fig, ax1 = plt.subplots(figsize=(7,4))
ax1.set_title("Final Tumor Volume of Each Mouse Across 4 Regimens")
ax1.set_xlabel("Drug Regimens of Interest")
ax1.set_ylabel("Tumor Vol. (mm3)")
ax1.boxplot(box_list, notch=False, flierprops=flierprops)
plt.xticks([1,2,3,4], four_list)

plt.show()

<IPython.core.display.Javascript object>

## Line and Scatter Plots

In [131]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# create new capomulin df and reset index to keep drug regimen column
capomulin_df = clean_df.loc[clean_df["Drug Regimen"] == "Capomulin"]
capomulin_df = capomulin_df.reset_index()

# make df for one mouse under the capomulin regimen
capomulin_mouse = capomulin_df.loc[capomulin_df["Mouse ID"] == "s185"]
capomulin_mouse

Unnamed: 0,index,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,10,s185,Capomulin,Female,3,17,0,45.0,0
1,11,s185,Capomulin,Female,3,17,5,43.878496,0
2,12,s185,Capomulin,Female,3,17,10,37.614948,0
3,13,s185,Capomulin,Female,3,17,15,38.177232,0
4,14,s185,Capomulin,Female,3,17,20,36.866876,0
5,15,s185,Capomulin,Female,3,17,25,33.94994,0
6,16,s185,Capomulin,Female,3,17,30,32.959671,1
7,17,s185,Capomulin,Female,3,17,35,28.328531,1
8,18,s185,Capomulin,Female,3,17,40,25.472143,1
9,19,s185,Capomulin,Female,3,17,45,23.343598,1


In [140]:
# grab tumor vol and timepoint columns to prepare for line plot (make sure to include all rows)
# reset index and drop so only 2 needed columns left for line plot
capomulin_mouse = capomulin_mouse.loc[:, ["Timepoint", "Tumor Volume (mm3)"]]
capomulin_mouse = capomulin_mouse.reset_index(drop=True)
capomulin_mouse

Unnamed: 0,Timepoint,Tumor Volume (mm3)
0,0,45.0
1,5,43.878496
2,10,37.614948
3,15,38.177232
4,20,36.866876
5,25,33.94994
6,30,32.959671
7,35,28.328531
8,40,25.472143
9,45,23.343598


In [169]:
# construct the line plot
x_ax = np.arange(len(capomulin_mouse["Timepoint"]))
tic_locs = [value for value in x_ax]
plt.figure()
plt.plot(x_ax, capomulin_mouse["Tumor Volume (mm3)"], marker="o", color="b", label="Tumor Volume (mm3)")
plt.xticks(tic_locs, list(capomulin_mouse["Timepoint"]))
plt.title("Time vs. Tumor Volume for One Mouse Treated with Capomulin")
plt.xlabel("Time Points")
plt.ylabel("Tumor Volume (mm3)")
plt.xlim(-0.5, len(x_ax)-0.5)
plt.ylim(15, max(capomulin_mouse["Tumor Volume (mm3)"])+2)
plt.legend(loc="best")
plt.show()

<IPython.core.display.Javascript object>

In [204]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# new df for mouse weight
mouse_weight = capomulin_df.loc[:, ["Mouse ID", "Weight (g)", "Tumor Volume (mm3)"]]
mouse_weight.head()

# group by mouse ID and calculate mean tumor vol 
avg_tum_vol = mouse_weight.groupby(["Mouse ID", "Weight (g)"])
avg_tum_vol = avg_tum_vol["Tumor Volume (mm3)"].mean()
avg_tum_vol = pd.DataFrame(avg_tum_vol)

# reset index so Mouse ID and Weight revert back to columns
avg_tum_vol = avg_tum_vol.reset_index()

# rename Tumor Volume column to avg tumor volume to avoid further confusion in next section

avg_tum_vol = avg_tum_vol.rename(columns={"Tumor Volume (mm3)": "Average Tumor Vol. (mm3)"})
avg_tum_vol.head()

# prepare scatter plot
plt.figure(figsize=(7,5))
plt.scatter(avg_tum_vol["Weight (g)"], avg_tum_vol["Average Tumor Vol. (mm3)"],
            marker="o", facecolors="c", edgecolors="k")
plt.title("Avg. Tumor Volume vs. Mouse Weight for Capomulin Regimen")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.xlim(14,26)
plt.ylim(34,47)
mice_id = ["Mice in Capomulin Regimen"]
plt.legend(mice_id, loc="upper left")
plt.show()

<IPython.core.display.Javascript object>

## Correlation and Regression

In [216]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

# use st.pearsonr to find correlation
correlation_coeff = st.pearsonr(avg_tum_vol["Weight (g)"], avg_tum_vol["Average Tumor Vol. (mm3)"])
# print(correlation_coeff)

# set up x values and y values for lin regress line (x=weight, y=avg tum vol from avg_tum_vol df)
mw = avg_tum_vol["Weight (g)"]
atv = avg_tum_vol["Average Tumor Vol. (mm3)"]
(atv_slope, atv_int, atv_r, atv_p, atv_std_err) = linregress(mw, atv)
linregression = atv_slope * mw + atv_int

# Plot the linear model on the scatter plot (place behind points for clean formatting)
m_w = avg_tum_vol.iloc[:,1]
a_t_v = avg_tum_vol.iloc[:,2]
plt.figure(figsize=(7,5))
plt.scatter(m_w,a_t_v, marker="o", facecolors="c", edgecolors="k")
plt.plot(m_w,linregression,"y-", lw="1", zorder=0, label="Linear Regression Model")
plt.title("Avg. Tumor Volume vs. Mouse Weight for Capomulin Regimen")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.xlim(14,26)
plt.ylim(34,47)

# create legend for linear regression model
plt.legend(loc="upper left")
plt.show()

<IPython.core.display.Javascript object>