## Observations and Insights 

Three observations on the pymaceutical data:
1. For mice on the Capomulin regimen, there is a reasonable positive correlation between mouse weight and average tumor volume.
2. When comparing Ramicane, Capomulin, Infubinol, and Ceftamin, the mean final tumor volume for Ramicane and Capomulin is considerably lower than for Infubinol and Ceftamin. However, because Ramicane and Capomulin have more measurements than Infubinol and Ceftamin, and Infubinol has one unusual outlier, it would be a good idea to collect more data to confirm this trend.
3. There is a considerable difference between the mean tumor volumes of the Placebo and Capomulin groups over the entire data set; this suggests that the treatment itself has an impact on tumor volume (rather than any other environmental factor).

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two CSV inputs, relative to the working directory
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Print the .info() summary for both frames, metadata first
for frame in (mouse_metadata, study_results):
    frame.info()


In [None]:
# Attach each mouse's metadata to its study measurements; "Mouse ID" is the shared key
data = study_results.merge(mouse_metadata, on="Mouse ID")

# Preview the first three rows of the combined table
data.head(3)

In [None]:
# Count the distinct mice in the combined data.
# Grouping by Mouse ID yields one group per mouse, so the number of groups is
# the number of mice -- not the number of rows in the column.
ID_group = data.groupby(["Mouse ID"])
mouse_count = ID_group.ngroups
print(f"Number of unique mice: {mouse_count}")


In [None]:
# Find mice that have more than one row for the same timepoint.
# keep=False flags every row of a repeated (Mouse ID, Timepoint) pair,
# not only the later copies, so all conflicting rows are listed.
is_repeated = data.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate = data.loc[is_repeated]

# Distinct Mouse IDs among the flagged rows, and how many of them there are
flagged_ids = duplicate['Mouse ID']
duplicate_ID = flagged_ids.unique()
number_of_duplicates = flagged_ids.nunique()

# Report the count, the ID(s), and then show the flagged rows so it is clear what should be removed
print(f"There is {number_of_duplicates} duplicate(s) and the Mouse ID is {duplicate_ID}.The row(s) belonging to this Mouse ID are below.")
duplicate

In [None]:
# Create a clean DataFrame by dropping every row that belongs to a duplicated mouse.
# duplicate_ID holds the Mouse IDs flagged by the duplicate check, so the filter
# follows the data instead of relying on a hard-coded ID.
data_clean = data[~data['Mouse ID'].isin(duplicate_ID)]


In [None]:
# Count the distinct mice remaining in the clean DataFrame.
# One group per Mouse ID, so the number of groups is the number of mice
# (not the number of rows).
ID_group_clean = data_clean.groupby(["Mouse ID"])
mouse_count_clean = ID_group_clean.ngroups
print(f"Number of unique mice: {mouse_count_clean}")

## Summary Statistics

In [None]:
# Summary statistics table: mean, median, standard deviation, variance, and SEM
# of tumor volume, one row per drug regimen

# Tumor volume measurements of the cleaned data, grouped by regimen
data_clean_regimen = data_clean.groupby(["Drug Regimen"])
tumor_volume = data_clean_regimen['Tumor Volume (mm3)']

# One Series per statistic, each indexed by regimen
mean_tumor = tumor_volume.mean()
median_tumor = tumor_volume.median()
stdev_tumor = tumor_volume.std()
var_tumor = tumor_volume.var()
sem_tumor = tumor_volume.sem()

# Place the statistics side by side; the dict keys become the column titles
data_summary = pd.concat(
    {
        'Mean Tumor Vol. (mm3)': mean_tumor,
        'Median Tumor Vol. (mm3)': median_tumor,
        'Tumor Vol. St. Dev.': stdev_tumor,
        'Tumor Vol. Var. (mm3)': var_tumor,
        'Tumor Vol. SEM': sem_tumor,
    },
    axis=1,
)
data_summary



In [None]:
# Same summary statistics table (mean, median, std, var, SEM of tumor volume per regimen),
# this time produced with a single .agg() call on the grouped data
stat_names = ['mean', 'median', 'std', 'var', 'sem']
summary = data_clean.groupby('Drug Regimen').agg({'Tumor Volume (mm3)': stat_names})
summary

## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting) of the total number of measurements taken on each drug regimen

# Every row is one measurement, so counting Mouse ID entries per regimen counts measurements
measurements_by_regimen = data_clean.groupby('Drug Regimen')['Mouse ID'].count()
mouse_count = measurements_by_regimen.to_frame()

# Draw the bars with the DataFrame plot accessor, then label both axes on the returned Axes
mouse_chart = mouse_count.plot.bar(color="b", rot=45, legend=False)
mouse_chart.set(xlabel="Drug Regimen", ylabel="Number of Measurements")

plt.show()


In [None]:
# Bar plot (pyplot) of the total number of measurements taken on each drug regimen
# pyplot takes plain sequences, so pull the labels and heights out of mouse_count
drug_list = mouse_count.index.tolist()
measurements = mouse_count['Mouse ID'].tolist()

# One blue bar per regimen, centred on its tick; tick labels tilted to match the pandas chart
plt.bar(drug_list, measurements, align="center", color='b')
plt.xticks(rotation=45)
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")




In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Count distinct mice per sex. Using nunique() rather than count() matters because
# count() tallies measurement rows, which over-weights mice that have more timepoints.
count = data_clean.groupby('Sex')['Mouse ID'].nunique().to_frame()

# Rename the value column so the pie chart's y label reads "Sex of Mice"
sex_count = count.rename(columns={"Mouse ID": "Sex of Mice"})

# Pie chart of sex_count using pandas df.plot.pie()
pie = sex_count.plot.pie(y='Sex of Mice', autopct='%1.1f%%', shadow=True, startangle=15, legend=False)



In [None]:
# Pie plot (pyplot) showing the distribution of female versus male mice
# pyplot takes plain sequences, so pull the wedge labels and sizes out of sex_count
sex = sex_count.index.tolist()
m_f_count = sex_count['Sex of Mice'].tolist()

# Same wedge settings as the pandas pie; the y label is added to match that chart
plt.pie(m_f_count, labels=sex, startangle=15, shadow=True, autopct="%1.1f%%")
plt.ylabel("Sex of Mice")



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse:
# one row per Mouse ID holding that mouse's maximum Timepoint
max_time = data_clean.groupby('Mouse ID')['Timepoint'].max().reset_index()

# Merge back onto the clean data using both shared columns as keys, so each mouse
# keeps only the measurement taken at its last timepoint. Joining by column name
# (rather than passing Series objects as keys) keeps a single "Mouse ID" column
# and avoids a synthetic key column in the result.
merge_time = max_time.merge(data_clean, on=['Mouse ID', 'Timepoint'], how='left')

# print out merged dataframe to check
merge_time

In [None]:
# Treatments of interest (iterated below and available later for plot labels)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Collects one row of quartile statistics per treatment
tumor_vol = []

# For each treatment, compute the quartiles and IQR of the final tumor volumes and the
# 1.5*IQR bounds beyond which a value counts as a potential outlier
for treatment in treatments:

    # final tumor volumes for mice on this regimen only
    on_treatment = merge_time['Drug Regimen'] == treatment
    sub_df = merge_time.loc[on_treatment, ['Tumor Volume (mm3)']]

    # quartiles -> IQR -> lower/upper bounds
    quartiles = sub_df['Tumor Volume (mm3)'].quantile([0.25,0.5,0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    whisker = 1.5 * iqr
    lower_bound = lowerq - whisker
    upper_bound = upperq + whisker

    # record this treatment's statistics as one table row
    tumor_vol.append({
        'Drug Regimen': treatment,
        'Tumor Volume Lower Q (mm3)': lowerq,
        'Tumor Volume Upper Q (mm3)': upperq,
        'Tumor Volume IQR (mm3)': iqr,
        'Tumor Volume Lower Bound (mm3)': lower_bound,
        'Tumor Volume Upper Bound (mm3)': upper_bound,
    })

# turn the collected rows into a dataframe and display it
iqr_df = pd.DataFrame(tumor_vol)
iqr_df
    

In [None]:
# Determine outliers using upper and lower bounds
# Create dataframe with final tumor volumes and only the treatments of interest

capomulin_1 = merge_time.loc[merge_time['Drug Regimen'] == "Capomulin", ['Tumor Volume (mm3)']]
capomulin_2 = capomulin_1.reset_index(drop=True)
capomulin = capomulin_2.rename(columns={"Tumor Volume (mm3)":"Capomulin"})
ramicane_1 = merge_time.loc[merge_time['Drug Regimen'] == "Ramicane", ['Tumor Volume (mm3)']]
ramicane_2 = ramicane_1.reset_index(drop=True)
ramicane = ramicane_2.rename(columns={"Tumor Volume (mm3)":"Ramicane"})
infubinol_1 = merge_time.loc[merge_time['Drug Regimen'] == "Infubinol", ['Tumor Volume (mm3)']]
infubinol_2 = infubinol_1.reset_index(drop=True)
infubinol = infubinol_2.rename(columns={"Tumor Volume (mm3)":"Infubinol"})
ceftamin_1 = merge_time.loc[merge_time['Drug Regimen'] == "Ceftamin", ['Tumor Volume (mm3)']]
ceftamin_2 = ceftamin_1.reset_index(drop=True)
ceftamin = ceftamin_2.rename(columns={"Tumor Volume (mm3)":"Ceftamin"})

result = pd.concat([capomulin, ramicane, infubinol, ceftamin], axis=1)

# I got stuck here on trying to print any values in this dataframe that are greater than or less than the bounds

In [None]:
# Box plot of each mouse's final tumor volume, one box per regimen of interest, without grid lines
boxplot_columns = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
boxplot = result.boxplot(column=boxplot_columns, grid=False)


## Line and Scatter Plots

In [None]:
# Line plot of tumor volume vs. time point for a single mouse treated with Capomulin
# Restrict the cleaned data to mouse b128 on the Capomulin regimen
on_capomulin = data_clean['Drug Regimen'] == 'Capomulin'
is_b128 = data_clean['Mouse ID'] == 'b128'
capomulin_b128 = data_clean[on_capomulin & is_b128]

# Time on the x axis, tumor volume on the y axis
x_values = capomulin_b128['Timepoint']
y_values = capomulin_b128['Tumor Volume (mm3)']

plt.plot(x_values, y_values, label="Mouse b128", color="red")

# Title, axis labels, and a legend identifying the mouse
plt.title("Tumor Volume vs. Timepoint with Capomulin Treatment")
plt.xlabel("Time Point (days)")
plt.ylabel("Tumor Volume (mm3)")
plt.legend(loc="best")

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# Keep only the Capomulin rows of the cleaned data
capomulin = data_clean[(data_clean['Drug Regimen']=='Capomulin')]

# Group by Mouse ID and average every numeric column.
# numeric_only=True is required because the frame also holds string columns
# (e.g. Drug Regimen, Sex); recent pandas versions raise a TypeError when asked
# to average those instead of silently dropping them.
capomulin_group_avg = capomulin.groupby(['Mouse ID']).mean(numeric_only=True)

# Plot avg tumor volume vs. mouse weight
# x = per-mouse average weight, y = per-mouse average tumor volume
x_values_c = capomulin_group_avg['Weight (g)']
y_values_c = capomulin_group_avg['Tumor Volume (mm3)']

# Scatter plot with title and axis labels
plt.scatter(x_values_c,y_values_c)
plt.title("Avg. Tumor Volume vs. Avg. Mouse Weight with Capomulin Treatment")
plt.xlabel('Avg. Weight (g)')
plt.ylabel('Avg. Tumor Volume (mm3)')
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

# Define x and y data from Capomulin average dataframe 
x_values_c = capomulin_group_avg['Weight (g)']
y_values_c = capomulin_group_avg['Tumor Volume (mm3)']

# Import lineregress from scipy.stats
from scipy.stats import linregress

# Define values using lineregress
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values_c, y_values_c)
regress_values = x_values_c * slope + intercept

# Define line equation
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Plot data, line, and equation of the line (using plt.annotate)
plt.scatter(x_values_c,y_values_c)
plt.plot(x_values_c,regress_values,"r-")
plt.annotate(line_eq,(20,36),fontsize=15,color="red")
plt.title("Avg. Tumor Volume vs. Avg. Mouse Weight with Capomulin Treatment")
plt.xlabel('Avg. Weight (g)')
plt.ylabel('Avg. Tumor Volume (mm3)')
plt.show()

# Calculate the correlation coefficient between avg. tumor volume and avg. mouse weight with capomulin treatment 
# using st.pearsonr() function
correlation = st.pearsonr(x_values_c,y_values_c)
print(f'The correlation between mouse weight and average tumor volume is {round(correlation[0],2)}.')
