## Observations and Insights

## Dependencies and starter code

In [4]:
# OBSERVATIONS AND INFERENCES
# The findings are kept in one tuple and printed one per line, in order.
observations = (
    "THESE MIGHT CHANGE WITH NEW APPROACH 1)  The scatterplot and regression of weight vs. avg. tumor volume across the top 4 drug regimens demonstrate that average tumor volume increases as the weight of the mouse increases.",
    " 2)  The line plot for a single mouse in the Capomulin regimen shows a steady decrease in the tumor volume as the regimen progressed (45 days).",
    " 3)  From the statistical output Capomulin and Ramicane appear to be similarly successful in reducing tumor volume.  Not only do they have the lowest values for mean tumor volume but they also have low standard deviations.  They also have similar median average tumor volumes as well as IQR and lower and upper bounds - especially as compared to the other top performing drugs.",
    " 4)  Some mice didn't make it through the entire drug regimen (perhaps they died) this affects final tumor volume data as the effects of the drug regimen on these mice can't totally be understood.",
)
for observation in observations:
    print(observation)

THESE MIGHT CHANGE WITH NEW APPROACH 1)  The scatterplot and regression of weight vs. avg. tumor volume across the top 4 drug regimens demonstrate that average tumor volume increases as the weight of the mouse increases.
 2)  The line plot for a single mouse in the Capomulin regimen shows a steady decrease in the tumor volume as the regimen progressed (45 days).
 3)  From the statistical output Capomulin and Ramicane appear to be similarly successful in reducing tumor volume.  Not only do they have the lowest values for mean tumor volume but they also have low standard deviations.  They also have similar median average tumor volumes as well as IQR and lower and upper bounds - especially as compared to the other top performing drugs.
 4)  Some mice didn't make it through the entire drug regimen (perhaps they died) this affects final tumor volume data as the effects of the drug regimen on these mice can't totally be understood.


In [5]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem
from scipy.stats import linregress

# Study data files.
# NOTE(review): these paths are relative to the working directory; the
# FileNotFoundError recorded under this cell means "data/" was not found
# there on the last run -- confirm where the CSV files live.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results. The paths keep their own names
# so they are not overwritten by the DataFrames loaded from them.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset: every metadata row is matched with
# that mouse's study rows. The join key is named once; listing "Mouse ID"
# twice added nothing.
mouse_merge = pd.merge(mouse_metadata, study_results, how="left", on="Mouse ID")
mouse_merge.head()

FileNotFoundError: [Errno 2] File b'data/Mouse_metadata.csv' does not exist: b'data/Mouse_metadata.csv'

In [None]:
# List every distinct drug regimen that appears in the merged data
regimen_column = mouse_merge['Drug Regimen']
drug_regimens = regimen_column.unique()
drug_regimens

In [None]:
# Summary statistics table: mean, median, variance, standard deviation and
# SEM of the tumor volume, one row per drug regimen.
# The frame is indexed by regimen and then grouped on that index level name.
regimen_df = mouse_merge.set_index('Drug Regimen')
regimen_gpd = regimen_df.groupby('Drug Regimen')

summary_functions = ['mean', 'median', 'var', 'std', 'sem']
tumor_volume_by_regimen = regimen_gpd['Tumor Volume (mm3)']
drug_stats = tumor_volume_by_regimen.agg(summary_functions)
drug_stats

## Bar plots

In [None]:
# Bar chart, drawn through the pandas plotting API, of how many data points
# (rows) each treatment regimen has.
count_per_drug = mouse_merge['Drug Regimen'].value_counts()
count_df = pd.DataFrame(count_per_drug)
count_axes = count_df.plot.bar(figsize=(6, 4), width=0.75)
count_axes.set_title("Number of Data Points per Drug Regimen")
count_axes.set_xlabel("Drug")
count_axes.set_ylabel("Data Points")

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
# value_counts() orders the regimens by descending count, so the bar heights
# and the tick labels are both taken from this one Series. Labelling the bars
# from a separately ordered list of names could attach a count to the wrong
# drug.
index_count = pd.Index(regimen_df.index).value_counts()
x_axis = np.arange(len(index_count))
plt.bar(x_axis, index_count, color='r', alpha=0.5, align="center")
tick_locations = list(x_axis)
plt.xticks(tick_locations, index_count.index, rotation="vertical")
plt.title("Number of Data Points per Drug Regimen")
plt.xlabel("Drug")
plt.ylabel("Data Points")

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
gender_df = mouse_merge[['Drug Regimen','Sex']]
gender_df = gender_df.set_index('Drug Regimen')
gender = gender_df['Sex'].unique()

# NOTE(review): this counts rows of the merged data, so a mouse with several
# rows is counted several times -- confirm whether a per-mouse count was
# intended.
male_female = (gender_df['Sex'].value_counts(dropna=False))
# A Series is plotted as-is; the previous `y=` argument selected nothing and
# has been removed.
male_female.plot(kind="pie", autopct="%1.1f%%")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

colors = ["red", "lightskyblue"]
# The wedge labels come from the index of the counted Series, so each label
# is the value whose count sizes that wedge. A separately built list of names
# is not guaranteed to be in the same order as the counts.
plt.pie(male_female, labels=male_female.index, colors=colors, autopct="%1.1f%%")

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers.
# find tumor volume at MAX Timepoint
# drop unnecessary drug regimens (regimen_df is indexed by 'Drug Regimen',
# so these are index labels)
trim_df = regimen_df.drop(['Placebo','Stelasyn','Zoniferol','Ketapril','Propriva','Naftisol'])

# Order the rows by Timepoint before grouping so that "last" below is the
# measurement taken at each mouse's greatest timepoint, whatever order the
# rows arrived in. mergesort is stable, so rows with equal timepoints keep
# their original relative order.
top_drugs = trim_df.reset_index().sort_values('Timepoint', kind='mergesort')

# One row per mouse: its regimen, its final timepoint and the tumor volume
# recorded at that timepoint. Aggregations are named by string rather than by
# passing the builtin `max`.
last_tumor = top_drugs.groupby(['Mouse ID']).agg({
        'Drug Regimen': "first",
        'Timepoint': "max",
        'Tumor Volume (mm3)': "last"
            })
last_tumor.head()

In [None]:
# Group the per-mouse final measurements by their drug regimen
top_drugs_gpd = last_tumor.groupby(by='Drug Regimen')

In [None]:
# Quartiles, IQR and 1.5*IQR outlier bounds of the final tumor volume,
# computed separately for each drug regimen.
drugs = []
quartiles = []
for regimen_name, volumes in top_drugs_gpd['Tumor Volume (mm3)']:
    drugs.append(regimen_name)
    quartiles.append(volumes.quantile([.25, .5, .75]))

# Derive every statistic from the collected quantile Series, in regimen order.
lowerq = [q[0.25] for q in quartiles]
upperq = [q[0.75] for q in quartiles]
iqr = [upper - lower for lower, upper in zip(lowerq, upperq)]
lower_bound = [lower - (1.5 * spread) for lower, spread in zip(lowerq, iqr)]
upper_bound = [upper + (1.5 * spread) for upper, spread in zip(upperq, iqr)]

quart_stats = {
    "Drug Regimen": drugs,
    "Lower Quartile": lowerq,
    "Upper Quartile": upperq,
    "IQR": iqr,
    "Lower Bound": lower_bound,
    "Upper Bound": upper_bound,
}
quarts_df = pd.DataFrame(quart_stats).set_index('Drug Regimen')
quarts_df.head()

In [None]:
# Attach each regimen's quartile statistics to every mouse's final measurement
top_drugs = last_tumor.reset_index()
print(top_drugs)
outliers_df = top_drugs.merge(quarts_df, on="Drug Regimen")
outliers_df_gpd = outliers_df.groupby(by='Drug Regimen')
outliers_df

In [None]:
# find how many outliers and which one(s)
# A final tumor volume is an outlier when it lies outside its own regimen's
# [Lower Bound, Upper Bound] range. The comparison is row-wise over the whole
# frame, so no loop is needed; the earlier loop iterated over column names
# and recomputed this same filter on every pass.
tumor_volume = outliers_df["Tumor Volume (mm3)"]
below_lower = tumor_volume < outliers_df['Lower Bound']
above_upper = tumor_volume > outliers_df['Upper Bound']
outliers = outliers_df.loc[below_lower | above_upper]
total_outliers = outliers["Mouse ID"].count()
print(f"There are a total of {total_outliers} outliers.")
outliers

In [None]:
# Pull out the tumor volume column of last_tumor for each of the four
# regimens, one Series per regimen
final_volume_groups = top_drugs_gpd['Tumor Volume (mm3)']
CapoVol = final_volume_groups.get_group('Capomulin')
RamiVol = final_volume_groups.get_group('Ramicane')
InfuVol = final_volume_groups.get_group('Infubinol')
CeftVol = final_volume_groups.get_group('Ceftamin')

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
data = [CapoVol, CeftVol, InfuVol, RamiVol]
# Labels are listed in the same order as `data`.
regimen_labels = ['Capomulin', 'Ceftamin', 'Infubinol','Ramicane']

fig2, ax2 = plt.subplots()
ax2.set_title('Multiple Drugs and Final Tumor Volume Data')
ax2.set_ylabel("Tumor Volume (mm3)")
# notch/sym are passed by keyword; 'gD' draws outliers as green diamonds.
box = ax2.boxplot(data, notch=False, sym='gD', patch_artist=True)

# boxplot manages the x ticks itself, so the regimen names are applied after
# the boxes are drawn; setting them beforehand lets boxplot override or
# duplicate them.
ax2.set_xticks([1, 2, 3, 4])
ax2.set_xticklabels(regimen_labels)

# zip stops at the shorter sequence, so only the first four colors are used.
colors = ['blue', 'green', 'purple', 'tan', 'pink', 'red']

for patch, color in zip(box['boxes'], colors):
    patch.set_facecolor(color)

plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
Capomulin_df = mouse_merge.loc[mouse_merge['Drug Regimen'] == "Capomulin"]

# Plot exactly one mouse: take the first Mouse ID among the Capomulin rows and
# select its rows by ID, instead of assuming the first ten rows all belong to
# one mouse. Sorting by Timepoint keeps the line running left to right.
mouse_id = Capomulin_df['Mouse ID'].iloc[0]
one_mouse = Capomulin_df.loc[Capomulin_df['Mouse ID'] == mouse_id].sort_values('Timepoint')

x_axis = one_mouse['Timepoint']
y_axis = one_mouse['Tumor Volume (mm3)']
plt.plot(x_axis, y_axis)
plt.title("Tumor Volume for One Mouse Treated with a Capomulin Regimen")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
# Tick only at this mouse's timepoints rather than at every Capomulin row.
plt.xticks(x_axis)
plt.xlim(0,50)
plt.ylim(20,55)
plt.show()

In [None]:
# find average tumor volume for each mouse in Capomulin regimen
# numeric_only=True restricts the mean to numeric columns; text columns such
# as 'Drug Regimen' and 'Sex' cannot be averaged and make recent pandas
# versions raise. The earlier set_index call discarded its result and has
# been removed.
Capo_avgs = Capomulin_df.groupby(['Mouse ID']).mean(numeric_only=True)

In [None]:
# Scatter plot: per-mouse weight against per-mouse average tumor volume
# (both taken from Capo_avgs) for the Capomulin regimen
weight = Capo_avgs['Weight (g)']
tum_vol = Capo_avgs['Tumor Volume (mm3)']

marker_style = dict(marker="o", facecolors="pink", edgecolors="black", alpha=0.75)
plt.scatter(weight, tum_vol, **marker_style)
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Avg. Tumor Volume (mm3)")
plt.title("Mouse Weight vs. Avg. Tumor Volume for the Capomulin Regimen")
plt.show()

In [None]:
# Pearson correlation between mouse weight and average tumor volume for the
# Capomulin regimen; element 0 of the result is the coefficient
correlation = st.pearsonr(weight, tum_vol)
rounded_coefficient = round(correlation[0], 2)
print(f"The correlation between mouse weight and tumor volume for the Capomulin Regimen is {rounded_coefficient}")

In [None]:
# Least-squares line of average tumor volume on mouse weight, drawn over the
# scatter of the same data and annotated with its equation.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weight, tum_vol)
regress_values = intercept + slope * weight
line_eq = f"y = {round(slope,2)!s}x + {round(intercept,2)!s}"

marker_style = dict(marker="o", facecolors="pink", edgecolors="black", alpha=0.75)
plt.scatter(weight, tum_vol, **marker_style)
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Avg. Tumor Volume (mm3)")

plt.plot(weight, regress_values, "r-")
# The equation is placed at data coordinates (20, 38).
plt.annotate(line_eq, (20, 38), fontsize=15, color="red")
plt.title("Mouse Weight vs. Avg. Tumor Volume for the Capomulin Regimen")
plt.show()