## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [None]:
# Join every study measurement with the metadata of its mouse, matching on 'Mouse ID'
complete_data = study_results.merge(mouse_metadata, on='Mouse ID')

# Show summary statistics of the merged table as a quick sanity check
complete_data.describe()

In [None]:
# Count the distinct mice present in the merged data.
mouseIDs = pd.unique(complete_data['Mouse ID'])
mouseCount = len(mouseIDs)

In [None]:
# Find the mice that have more than one record for the same (Mouse ID, Timepoint) pair.
# keep=False flags every member of a repeated pair, not just the later occurrences.
is_repeated = study_results.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)
dup_df = study_results.loc[is_repeated]

dupIDs = dup_df['Mouse ID'].unique()
dupIDs


In [None]:
# Optional: Get all the data for the duplicate mouse ID.

# Select the rows of every duplicated Mouse ID in a single pass.
# A per-ID loop that rebinds `x` would keep only the last ID's rows and would
# leave `x` undefined when dupIDs is empty; isin() handles both cases.
x = complete_data[complete_data['Mouse ID'].isin(dupIDs)]
x

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Every row belonging to ANY Mouse ID in dupIDs is removed at once; filtering
# one ID per loop iteration would only drop the last ID, because each pass
# starts again from the unfiltered complete_data. With no duplicates the mask
# is all True and clean_data holds every row.
clean_data = complete_data[~complete_data['Mouse ID'].isin(dupIDs)]


In [None]:
# Number of rows (data points) remaining in the clean DataFrame.
dataTotal = clean_data.shape[0]

# Number of distinct mice remaining in the clean DataFrame.
mouseIDs = pd.unique(clean_data['Mouse ID'])
mouseCount = len(mouseIDs)
mouseCount

## Summary Statistics

In [None]:
# Summary statistics of tumor volume for each drug regimen.
# One Series per statistic is computed here: mean, median, variance,
# standard deviation and standard error of the mean (SEM).

# Placeholder; rebound to a DataFrame when the summary table is assembled.
tumorSummary = {}

# Group once and reuse the grouped tumor-volume column for every statistic.
tumor_by_regimen = clean_data.groupby('Drug Regimen')['Tumor Volume (mm3)']

tumorVolMeanByDrug = tumor_by_regimen.mean()
tumorVolMedianByDrug = tumor_by_regimen.median()
tumorVolVarByDrug = tumor_by_regimen.var()
tumorVolSDByDrug = tumor_by_regimen.std()
tumorVolSEMByDrug = tumor_by_regimen.sem()

In [None]:
# Start the summary table from the mean and median Series.
# Both Series are named 'Tumor Volume (mm3)', so the merge disambiguates them
# with the default _x / _y suffixes, which are then given readable names.
tumorSummary = tumorVolMeanByDrug.to_frame().merge(tumorVolMedianByDrug, on='Drug Regimen')
mean_median_labels = {
    'Tumor Volume (mm3)_x': 'Mean Tumor Volume',
    'Tumor Volume (mm3)_y': 'Median Tumor Volume',
}
tumorSummary = tumorSummary.rename(columns=mean_median_labels)

In [None]:
# Append variance, standard deviation and SEM to the summary table, in that order.
for stat_series in (tumorVolVarByDrug, tumorVolSDByDrug, tumorVolSEMByDrug):
    tumorSummary = tumorSummary.merge(stat_series, on='Drug Regimen')

# The variance and std-dev columns collide on name and pick up the _x / _y
# suffixes; the SEM column is merged last and keeps the bare column name.
spread_labels = {
    'Tumor Volume (mm3)_x': 'Variance of Tumor Volume',
    'Tumor Volume (mm3)_y': 'Std Dev Tumor Volume',
    'Tumor Volume (mm3)': 'Std Err Tumor Volume',
}
tumorSummary = tumorSummary.rename(columns=spread_labels)
tumorSummary

In [None]:
# Same per-regimen summary table of tumor volume (mean, median, variance,
# standard deviation, SEM), produced with a single aggregation call.
summary_stats = ['mean', 'median', 'var', 'std', 'sem']
tumorSumm = clean_data.groupby('Drug Regimen')['Tumor Volume (mm3)'].aggregate(summary_stats)
tumorSumm

## Bar and Pie Charts

In [None]:
# Bar chart, drawn through the pandas plotting API, of how many timepoint
# records exist for each drug regimen.
totalTmPt = clean_data.groupby('Drug Regimen')['Timepoint'].count()
tpPandas = totalTmPt.plot(kind='bar', x='Drug Regimen', y='Timepoint', rot=45)
plt.ylabel('Total Number of Measurements')

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.

# Draw with matplotlib's pyplot interface directly rather than the pandas
# plotting wrapper. tpPyplot stays bound to the Axes the bars are drawn on.
fig_tp, tpPyplot = plt.subplots()
plt.bar(totalTmPt.index, totalTmPt.values, width=0.5)
plt.title('Total Number of Measurements in Each Drug Regimen')
plt.xlabel('Drug Regimen')
# Regimen names are long; vertical tick labels keep them from overlapping.
plt.xticks(rotation=90)
plt.ylabel('Total Number of Measurements')

## Pie Chart

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

# Remove every mouse whose ID is in dupIDs from the mouse metadata.
# isin() drops all of them at once; filtering one ID per loop iteration would
# only exclude the last ID and would leave clean_MseData undefined when
# dupIDs is empty.
clean_MseData = mouse_metadata[~mouse_metadata['Mouse ID'].isin(dupIDs)]

# Count the remaining mice of each sex
groupBySex = clean_MseData.groupby('Sex')['Mouse ID'].count()

sxPd = groupBySex.plot(kind="pie", autopct='%1.2f%%')

In [None]:
# Display the per-sex mouse counts that feed the pie charts
groupBySex

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Take the wedge labels from the grouped index so each wedge is always named
# after the group its count came from, instead of relying on a hand-written
# list matching the groupby ordering and values.
labels = list(groupBySex.index)
sxPlt = plt.pie(groupBySex, autopct='%1.2f%%', labels=labels)
plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
# No regimen filter is applied in this cell: the table below covers every mouse in clean_data.

# Start by getting the last (greatest) timepoint for each mouse.
# The result is a Series named 'Timepoint', indexed by 'Mouse ID'.
lastTimePts = clean_data.groupby('Mouse ID')['Timepoint'].max()
lastTimePts

# Merge this grouped Series with the clean dataframe to get the tumor volume at the last timepoint.
# The default inner join keeps only the clean_data rows whose (Mouse ID, Timepoint)
# pair equals that mouse's greatest timepoint.
finalTumorVol = pd.merge(lastTimePts, clean_data, on=['Mouse ID', 'Timepoint'])
finalTumorVol

In [None]:
# Treatments of interest, kept in a list: iterated over by the outlier loop
# and reused later as the box-plot tick labels
drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

In [None]:
# Final tumor volumes per drug, one Series per entry of `drugs` (box-plot input)
tumorVolData = []

# Per-drug quartile statistics, appended in the same order as `drugs`.
# `quartiles` is rebound inside the loop to the current drug's quantile Series.
quartiles = []
lowerQ = []
upperQ = []
iqrs = []
# Receives two Series per drug: the upper outliers, then the lower outliers
outlierPts = []

# For each drug: collect its final tumor volumes, compute quartiles and the
# IQR, and report values lying on or beyond the 1.5*IQR fences.
for tx in drugs:

    # Final tumor volumes of the mice on this regimen
    finalVol = finalTumorVol.loc[finalTumorVol['Drug Regimen'] == tx, 'Tumor Volume (mm3)']

    # Lower/upper quartiles and inter-quartile range
    quartiles = finalVol.quantile([.25, .5, .75])

    LQ = quartiles[.25]
    lowerQ.append(LQ)

    UQ = quartiles[.75]
    upperQ.append(UQ)

    IQR = UQ - LQ
    iqrs.append(IQR)

    # Fences used to flag potential outliers
    lower_bound = LQ - (1.5 * IQR)
    upper_bound = UQ + (1.5 * IQR)

    is_high = finalVol >= upper_bound
    is_low = finalVol <= lower_bound

    outlierU = finalVol.loc[is_high]
    outlierL = finalVol.loc[is_low]
    outlierPts.append(outlierU)
    outlierPts.append(outlierL)

    # Keep this drug's volumes for the box plot
    tumorVolData.append(finalVol)

    # Report using one combined selection so that a drug with outliers on
    # BOTH sides is reported too (separate upper-only / lower-only branches
    # print nothing in that case).
    outliers = finalVol.loc[is_high | is_low]
    if outliers.empty:
        print(f'There are no potential outliers in tumor volume data for {tx}. ')
    else:
        print(f'Potential outliers in the tumor volume data for {tx} are {outliers} ')


## Box plot

In [None]:
# Box plot of the final tumor volume of each mouse, one box per regimen of interest
fig1, ax1 = plt.subplots()
ax1.set(title='Final Tumor Volume (mm3)',
        xlabel='Drug Regimen',
        ylabel='Tumor Volume (mm3)')

# Draw outlier points as blue diamonds
blueDiam = {'markerfacecolor': 'b', 'marker': 'D'}

ax1.boxplot(tumorVolData, flierprops=blueDiam)
# Name each box after its drug (same order as tumorVolData was filled)
ax1.set_xticklabels(drugs)

plt.show()



## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for a single example mouse.
# NOTE(review): the title assumes this mouse is on Capomulin - confirm against the data.
m_ID = 's185'

# All records of the example mouse
mouseEx = clean_data[clean_data['Mouse ID'] == m_ID]

# Draw the line and label the axes
fig2, ax2 = plt.subplots()
ax2.plot(mouseEx['Timepoint'], mouseEx['Tumor Volume (mm3)'])
ax2.set_xlabel('Timepoint')
ax2.set_ylabel('Tumor Volume (mm3)')
ax2.set_title(f'Tumor volume over time for mouse {m_ID} on Capomulin')
plt.show()

## Scatter Plot

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
drugEx = 'Capomulin'

# Pull all data for the example drug
drugEx_df = clean_data.loc[clean_data['Drug Regimen'] == drugEx]

# Per-mouse averages. numeric_only=True restricts the mean to numeric columns:
# the frame also holds text columns (e.g. 'Drug Regimen'), and pandas >= 2.0
# raises TypeError when asked to average those instead of silently dropping them.
tumorVolMeans = drugEx_df.groupby('Mouse ID').mean(numeric_only=True)

y_vals = tumorVolMeans['Tumor Volume (mm3)']
x_vals = tumorVolMeans['Weight (g)']

# Plot scatter plot
plt.scatter(x_vals, y_vals)
plt.xlabel('Weight (g)')
plt.ylabel('Mean Tumor Volume')
plt.title(f'Mean Tumor Volume vs Mouse Weight (g) on {drugEx}')
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen


In [None]:
# Fit a linear regression of mean tumor volume on mouse weight, overlay the
# fitted line on the scatter plot, and print the r-squared value and equation.

slope, intercept, rvalue, pvalue, stderr = st.linregress(x_vals, y_vals)
# Fitted tumor volume at each observed weight
regress_values = intercept + (slope * x_vals)

# Human-readable line equation with coefficients rounded to 2 decimals
eqn = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

plt.scatter(x_vals, y_vals)
plt.plot(x_vals, regress_values, "r-")

# Place the equation on the plot at data coordinates (16, 44)
plt.annotate(eqn, (16, 44), fontsize=15, color="red")
plt.xlabel('Weight (g)')
plt.ylabel('Mean Tumor Volume mm3')
plt.title(f'Mean Tumor Volume vs Mouse Weight (g) on {drugEx}')
print(f"The r-squared is: {rvalue**2}")
print(eqn)
plt.show()