## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import os
import numpy as np
from scipy.stats import linregress

# Locations of the two study CSV files, relative to the working directory
mouse_metadata_path = os.path.join("data", "Mouse_metadata.csv")
study_results_path = os.path.join("data", "Study_results.csv")

# Load each CSV into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join the two tables on 'Mouse ID' so rows from either file are kept
data_df = mouse_metadata.merge(study_results, how='outer', on='Mouse ID')

# Preview the first rows of the combined table
data_df.head()

In [None]:
# Number of distinct mouse IDs in the merged data (missing IDs are not counted).
data_df['Mouse ID'].nunique(dropna=True)

In [None]:
# Get the IDs of mice that have more than one record for the same
# (Mouse ID, Timepoint) pair.
# keep=False flags every row of a repeated pair, so the affected IDs are found
# regardless of row order; unique() reduces them to one entry per mouse.
duplicate_mice = data_df.loc[
    data_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),
    'Mouse ID',
].unique()
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Select every row belonging to a mouse that has a repeated
# (Mouse ID, Timepoint) pair -- not only the repeated rows themselves --
# so the full record of the affected mouse can be inspected.
data_df.loc[
    data_df['Mouse ID'].isin(
        data_df.loc[
            data_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),
            'Mouse ID',
        ]
    )
]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with repeated (Mouse ID, Timepoint) records is removed entirely;
# every other mouse keeps all of its timepoints. (Dropping duplicates on
# 'Mouse ID' alone would reduce each mouse to a single row.)
duplicated_ids = data_df.loc[
    data_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False),
    'Mouse ID',
].unique()
no_duplicate_df = data_df.loc[~data_df['Mouse ID'].isin(duplicated_ids)].copy()
no_duplicate_df.head()

In [None]:
# Number of distinct mouse IDs left in the cleaned DataFrame
# (missing IDs are not counted).
no_duplicate_df['Mouse ID'].nunique(dropna=True)

## Summary Statistics

In [None]:
# Summary statistics table (mean, median, variance, standard deviation, SEM)
# of the tumor volume for each drug regimen.

# Group the data once per drug regimen and pick out the tumor volume column
summary_df = data_df.groupby('Drug Regimen')
tumor_volume = summary_df['Tumor Volume (mm3)']

# One series per statistic, each indexed by drug regimen
mean, median, var, std, sem = (
    tumor_volume.mean(),
    tumor_volume.median(),
    tumor_volume.var(),
    tumor_volume.std(),
    tumor_volume.sem(),
)

# Pair each series with its column heading and assemble the summary table
column_names = ('Mean', 'Median', 'Variance', 'Standard Deviation', 'SEM')
result_df = pd.DataFrame(dict(zip(column_names, (mean, median, var, std, sem))))

result_df.head()


In [None]:
# Same summary statistics of tumor volume per regimen, produced with a single
# aggregation call on the existing per-regimen grouping.
tumor_statistics = ['mean', 'median', 'var', 'std', 'sem']
summary_agg_df = summary_df.aggregate({'Tumor Volume (mm3)': tumor_statistics})
summary_agg_df.head()

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
# Each row of data_df is one recorded timepoint, so counting rows per
# 'Drug Regimen' gives the number of timepoints per regimen.
bar_plot = data_df['Drug Regimen'].value_counts().plot.bar(width=0.7)

bar_plot.set_xlabel('Drug Regimen')
bar_plot.set_ylabel('Number of Timepoints')
bar_plot.set_title('Number of Timepoints for each Drug Regimen')

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# Each row of data_df is one recorded timepoint, so counting rows per
# 'Drug Regimen' gives the number of timepoints per regimen.
count = data_df['Drug Regimen'].value_counts()

# One bar per regimen, placed at consecutive integer positions
x_axis = np.arange(len(count))
plt.bar(x_axis, count, width=0.7)

# Label each bar with its regimen name; rotate so long names do not overlap
tick_location = list(x_axis)
plt.xticks(tick_location, count.index.values, rotation=90)

plt.xlabel('Drug Regimen')
plt.ylabel('Number of Timepoints')
plt.title('Number of Timepoints for each Drug Regimen')

plt.show()


In [None]:
# Pie chart of the female/male split, drawn through the pandas plotting API.
# Row counts per value of the 'Sex' column
gender_data = data_df['Sex'].value_counts()

pie_plot = gender_data.plot(
    kind='pie',
    autopct='%1.1f%%',
    startangle=90,
    title='Distribution by Sex',
)

# Clear the y-axis label on the pie axes
pie_plot.set_ylabel('')


In [None]:
# Pie chart of the female/male split, drawn directly with pyplot.
# Row counts per value of the 'Sex' column
gender_data = data_df['Sex'].value_counts()

# Wedge labels come from the index of the counts; percentages get one decimal
pie_options = {
    'labels': gender_data.index.values,
    'autopct': '%1.1f%%',
    'startangle': 90,
}
plt.pie(gender_data, **pie_options)
plt.title('Distribution by Sex')

plt.show()


## Quartiles, Outliers and Boxplots

In [None]:
# Prepare the data needed to find each mouse's final tumor volume for the
# four treatment regimens of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Greatest timepoint recorded for every mouse, as a two-column table
# ('Mouse ID', 'max_timepoint') sorted by that timepoint
max_timepoint_df = (
    data_df.groupby('Mouse ID')['Timepoint']
    .max()
    .sort_values()
    .rename('max_timepoint')
    .reset_index()
)

# Attach each mouse's last timepoint to every one of its rows, so the final
# measurement can be selected where 'Timepoint' equals 'max_timepoint'
merged_df = data_df.merge(max_timepoint_df, on='Mouse ID')
merged_df

In [None]:
# Treatments to analyse; the list drives the loop below (and the plot labels later)
treatments = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes, one Series per treatment, collected for plotting
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:
    # Rows for this regimen, restricted to each mouse's last recorded timepoint
    temp_df = merged_df.loc[merged_df['Drug Regimen'] == treatment]
    final_volume_df = temp_df.loc[temp_df['Timepoint'] == temp_df['max_timepoint']]
    vol = final_volume_df['Tumor Volume (mm3)']
    tumor_vol_data.append(vol)

    quartiles = vol.quantile([.25, .5, .75])
    lower_quartile = quartiles[0.25]
    upper_quartile = quartiles[0.75]
    inter_quartile_range = upper_quartile - lower_quartile
    print(f'IQR for {treatment}: {inter_quartile_range}')

    # Outlier fences: 1.5 * IQR below the lower quartile and 1.5 * IQR
    # ABOVE the upper quartile (the upper fence adds, it does not subtract)
    lower_bound = lower_quartile - (1.5 * inter_quartile_range)
    upper_bound = upper_quartile + (1.5 * inter_quartile_range)

    print(f'Lower Bound for {treatment}: {lower_bound}')
    print(f'Upper Bound for {treatment}: {upper_bound}')

    # Count the final volumes lying on or beyond either fence
    outliers_count = vol.loc[(vol >= upper_bound) | (vol <= lower_bound)].count()
    print(f'Number of {treatment} outliers: {outliers_count}')
    

In [None]:
# Box plot of the final tumor volume of each mouse for the four regimens of interest

# Draw outlier points as large red circles with a black edge
outlier_marker = {
    'marker': 'o',
    'markerfacecolor': 'r',
    'markersize': 8,
    'markeredgecolor': 'black',
}
plt.boxplot(tumor_vol_data, flierprops=outlier_marker)

# Title, y-axis label, and one tick label per box (boxes sit at positions 1..4)
plt.title('Final Tumor Volume by Drug')
plt.ylabel('Final Tumor Volume (mm3)')
box_positions = [1, 2, 3, 4]
box_labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
plt.xticks(box_positions, box_labels)

plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for mouse 's185'
# (titled below as a Capomulin-treated mouse)

# All rows recorded for that one mouse
is_selected_mouse = data_df['Mouse ID'] == 's185'
mouse = data_df[is_selected_mouse]

days = mouse['Timepoint']
tumor_volume_mm3 = mouse['Tumor Volume (mm3)']
plt.plot(days, tumor_volume_mm3, marker='o')

# Axis labels and title
plt.title("Capomulin Treatment of Mouse s185")
plt.xlabel("Time (days)")
plt.ylabel("Tumor Volume (mm3)")

plt.show()

In [None]:
# Scatter plot of average tumor volume against mouse weight for the Capomulin regimen

# Keep only the Capomulin rows
capomulin_df = data_df[data_df['Drug Regimen'] == 'Capomulin']

# Mean tumor volume per mouse, as a ('Mouse ID', 'avg_tumor_vol') table
avg_vol_df = (
    capomulin_df.groupby('Mouse ID')['Tumor Volume (mm3)']
    .mean()
    .sort_values()
    .rename('avg_tumor_vol')
    .reset_index()
)

# Attach the per-mouse average to every Capomulin row, then reduce to the
# distinct (weight, average volume) pairs
avg_vol_df = capomulin_df.merge(avg_vol_df, on='Mouse ID')
final_avg_vol_df = avg_vol_df.loc[:, ['Weight (g)', 'avg_tumor_vol']].drop_duplicates()

x, y = final_avg_vol_df['Weight (g)'], final_avg_vol_df['avg_tumor_vol']

# Draw the scatter plot
plt.scatter(x, y)

# Title and axis labels
plt.title('Average Tumor Volume by Weight')
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")

plt.show()

## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight (x)
# versus average tumor volume (y) on the Capomulin regimen

# Pearson correlation; the coefficient is the first element of the result
correlation = st.pearsonr(x, y)
rounded_coefficient = round(correlation[0], 2)

# Report the coefficient
print(f"""The correlation between weight and average tumor volume
on the Capomulin regimen is {rounded_coefficient}.""")

# Fit the least-squares line and evaluate it at every observed weight
(slope, intercept, rvalue, pvalue, stderr) = linregress(x, y)
regress_values = slope * x + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

# Scatter of the observations with the fitted line drawn over it in red
plt.scatter(x, y)
plt.plot(x, regress_values, "r-")

# Write the line equation on the plot at data coordinates (20, 37)
plt.annotate(line_eq, (20, 37), fontsize=15, color="red")

# Title and axis labels
plt.title('Average Tumor Volume by Weight')
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()
