# Pymaceuticals Inc.
---

### Analysis

The following observations can be gathered from this analysis:
- Capomulin and Ramicane are closely comparable in terms of efficacy.
- An even distribution of male and female mice was used in the study.
- There is a strong, positive correlation between mouse weight and average tumor volume in the Capomulin regimen.
- There was only one outlier identified, within the Infubinol regimen. This regimen also had the greatest median final tumor volume.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
# NOTE(review): the two inputs are read from different folders ("Resources/"
# vs "data/") — confirm both paths match the repository layout.
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and display it as the cell output
mouse_metadata = pd.read_csv(mouse_metadata_path)
mouse_metadata

In [None]:
# Read the study results from study_results_path and display them as the cell output
study_results = pd.read_csv(study_results_path)
study_results

In [None]:
# Join the per-mouse metadata with the study measurements on the shared
# 'Mouse ID' column (default inner join), keeping the metadata columns first.
combined_data = mouse_metadata.merge(study_results, on='Mouse ID')

# Preview the first rows of the combined table
combined_data.head()

In [None]:
# Checking the number of mice: count the distinct Mouse ID values in the
# combined data (before any duplicate clean-up).
num_unique_mice = combined_data['Mouse ID'].nunique()
print("Number of mice:", num_unique_mice)

In [None]:
# Each row should be uniquely identified by the (Mouse ID, Timepoint) pair.
# Flag every repeated pair, then collect the distinct Mouse IDs involved
# into a one-column DataFrame.
is_repeated_pair = combined_data.duplicated(subset=["Mouse ID", "Timepoint"])
repeated_ids = combined_data.loc[is_repeated_pair, "Mouse ID"].unique()
duplicate_mice = pd.DataFrame({"Mouse ID": repeated_ids})
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# isin() matches every flagged ID instead of only the first one, so the cell
# also works when zero or several mice have duplicated timepoints.
combined_data.loc[combined_data["Mouse ID"].isin(duplicate_mice["Mouse ID"])]

In [None]:
# Create a clean DataFrame by dropping every row that belongs to a mouse
# whose ID was flagged as having duplicated (Mouse ID, Timepoint) records.
is_duplicate_mouse = combined_data["Mouse ID"].isin(duplicate_mice["Mouse ID"])

# drop=True discards the old row labels rather than keeping them as an
# extra "index" column in the cleaned data.
combined_clean = combined_data.loc[~is_duplicate_mouse].reset_index(drop=True)
combined_clean.head()

In [None]:
# Count the distinct mice remaining after the clean-up and show the result
# as a one-row table.
clean_mouse_ids = combined_clean["Mouse ID"].unique()
pd.DataFrame({"Total Subjects": [len(clean_mouse_ids)]})

## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM, each computed with
# groupby and then assembled into a single summary DataFrame.

# Group once and reuse the grouping for every statistic.
tumor_by_regimen = combined_clean.groupby("Drug Regimen")[["Tumor Volume (mm3)"]]

mean_regimen = tumor_by_regimen.mean()
med_regimen = tumor_by_regimen.median()
var_regimen = tumor_by_regimen.var()
stdev_regimen = tumor_by_regimen.std()
SEM_regimen = tumor_by_regimen.sem()

# All five frames share the same "Drug Regimen" index, so they can be placed
# side by side and labelled in one step.
summary_stats_regimen = pd.concat(
    [mean_regimen, med_regimen, var_regimen, stdev_regimen, SEM_regimen],
    axis=1,
)
summary_stats_regimen.columns = [
    "Mean Tumor Volume",
    "Median Tumor Volume",
    "Tumor Volume Variance",
    "Tumor Volume Std. Dev.",
    "Tumor Volume Std. Err.",
]
summary_stats_regimen

In [None]:
# Alternative (only one method is required in the solution): produce the same
# mean / median / variance / standard deviation / SEM table of tumor volume
# per regimen with a single aggregation call.
summary_functions = ['mean', 'median', 'var', 'std', 'sem']
aggregation_summary = (
    combined_clean
    .groupby(['Drug Regimen'])[['Tumor Volume (mm3)']]
    .agg(summary_functions)
)
aggregation_summary

## Bar and Pie Charts

In [None]:
# Count the total number of rows (Mouse ID/Timepoints) for each drug regimen.
# The bar plots in the following cells are drawn from this Series.
mice_count = combined_clean["Drug Regimen"].value_counts()
mice_count

In [None]:
# Bar plot of the per-regimen row counts, drawn with the pandas plotting API.
mice_count_pandasplot = mice_count.plot.bar(
    title="Observed Mouse Timepoints per Drug Regimen",
    legend=False,
)
mice_count_pandasplot.set_ylabel("# of Observed Mouse Timepoints")
plt.show()

In [None]:
# Same bar plot of rows (Mouse ID/Timepoints) per drug regimen, drawn directly with pyplot.
plt.figure(figsize=(8, 6))

regimen_names, row_counts = mice_count.index, mice_count.values
plt.bar(regimen_names, row_counts)

# Rotate the regimen names so they do not overlap
plt.xticks(rotation=90)
plt.xlabel('Drug Regimen')
plt.ylabel("# of Observed Mouse Timepoints")
plt.title("Observed Mouse Timepoints per Drug Regimen")
plt.show()

In [None]:
# One row per mouse (first occurrence of each Mouse ID) with its sex.
unique_mice_gender = combined_clean[['Mouse ID', 'Sex']].drop_duplicates(subset='Mouse ID')

In [None]:
# Pie chart (pandas plotting API) of the distribution of unique female versus
# male mice used in the study.

# Number of unique mice of each sex
gender_counts = unique_mice_gender['Sex'].value_counts()

# Draw the pie with percentage labels
gender_counts.plot.pie(autopct='%1.1f%%', startangle=360)
plt.title('Distribution of Sex', loc='center', pad=-20, rotation=360, fontsize=12)
plt.ylabel('count')
plt.show()

In [None]:
# Generate a pie chart, using pyplot, showing the distribution of unique female versus male mice used in the study

# Get the unique mice with their gender
gender_counts_pylot = unique_mice_gender['Sex'].value_counts()

# Make the pie chart from the counts computed in this cell, so the cell does
# not depend on a variable left over from the pandas version above.
plt.pie(
    gender_counts_pylot,
    labels=gender_counts_pylot.index,
    autopct='%1.1f%%',
    startangle=360,
)
plt.ylabel('count')
plt.title('Distribution of Sex', loc='center', pad=-20, rotation=360, fontsize=12)
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Split the cleaned data into one frame per treatment regimen of interest:
# Capomulin, Ramicane, Infubinol, and Ceftamin. The final tumor volumes are
# derived from these subsets in the following cells.
regimen_column = combined_clean["Drug Regimen"]

Capomulin = combined_clean[regimen_column == "Capomulin"]
Ramicane = combined_clean[regimen_column == "Ramicane"]
Infubinol = combined_clean[regimen_column == "Infubinol"]
Ceftamin = combined_clean[regimen_column == "Ceftamin"]

In [None]:
# Last (greatest) timepoint recorded for each Capomulin mouse
Capomulin_final = Capomulin.groupby('Mouse ID')['Timepoint'].max()
Capomulin_vol = Capomulin_final.to_frame()

# Look up the full row at that last timepoint in the cleaned data to get the
# tumor volume at the end of each mouse's treatment
Capomulin_merge = Capomulin_vol.merge(
    combined_clean,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
Capomulin_merge.head()

In [None]:
# Last (greatest) timepoint recorded for each Ramicane mouse
Ramicane_final = Ramicane.groupby('Mouse ID')['Timepoint'].max()
Ramicane_vol = Ramicane_final.to_frame()

# Look up the full row at that last timepoint in the cleaned data to get the
# tumor volume at the end of each mouse's treatment
Ramicane_merge = Ramicane_vol.merge(
    combined_clean,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
Ramicane_merge.head()

In [None]:
# Last (greatest) timepoint recorded for each Infubinol mouse
Infubinol_final = Infubinol.groupby('Mouse ID')['Timepoint'].max()
Infubinol_vol = Infubinol_final.to_frame()

# Look up the full row at that last timepoint in the cleaned data to get the
# tumor volume at the end of each mouse's treatment
Infubinol_merge = Infubinol_vol.merge(
    combined_clean,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
Infubinol_merge.head()

In [None]:
# Last (greatest) timepoint recorded for each Ceftamin mouse
Ceftamin_final = Ceftamin.groupby('Mouse ID')['Timepoint'].max()
Ceftamin_vol = Ceftamin_final.to_frame()

# Look up the full row at that last timepoint in the cleaned data to get the
# tumor volume at the end of each mouse's treatment
Ceftamin_merge = Ceftamin_vol.merge(
    combined_clean,
    on=["Mouse ID", "Timepoint"],
    how="left",
)
Ceftamin_merge.head()

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Build the table the loop reads: one row per mouse, taken at that mouse's
# last (greatest) timepoint, with the tumor volume column renamed to
# "Final Tumor Vol". It is built here because no earlier cell defines it.
last_timepoints = combined_clean.groupby("Mouse ID")["Timepoint"].max().reset_index()
max_times = last_timepoints.merge(
    combined_clean,
    on=["Mouse ID", "Timepoint"],
    how="left",
).rename(columns={"Tumor Volume (mm3)": "Final Tumor Vol"})

# Create empty list to fill with tumor vol data (for plotting)
final_tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in regimens:
    # Locate the rows which contain mice on this drug and get the tumor volumes
    max_drug = max_times.loc[max_times["Drug Regimen"] == drug, :]
    quartiles = max_drug["Final Tumor Vol"].quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Keep this regimen's final volumes for the box plot
    final_tumor_vol.append(max_drug["Final Tumor Vol"])

    # Determine outliers using upper and lower bounds (1.5 * IQR beyond the quartiles)
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    print(f"The lower quartile of Final Tumor Vol for {drug} is: {lowerq}")
    print(f"The upper quartile of Final Tumor Vol for {drug} is: {upperq}")
    print(f"The interquartile range of Final Tumor Vol for {drug} is: {iqr}")
    print(f"The median of Final Tumor Vol for {drug} is: {quartiles[0.5]}")

    is_outlier = (max_drug['Final Tumor Vol'] < lower_bound) | (max_drug['Final Tumor Vol'] > upper_bound)
    outlier_vol = max_drug.loc[is_outlier]

    if outlier_vol.empty:
        print(f"\n{drug} has no potential outliers\n")
    else:
        print(f"{drug}'s potential outliers:")
        print(outlier_vol.to_string(index=False))
    print("-------------------------------------")

In [None]:
# Box plot of the final tumor volume distribution for each treatment group.
# Potential outliers (fliers) are filled in red.
outlier_style = {'markerfacecolor': 'red'}

fig, ax = plt.subplots()
ax.boxplot(final_tumor_vol, flierprops=outlier_style)
ax.set(title='Final Tumor Volumes by Treatment', ylabel='Tumor Volume (mm3)')

# Boxes are drawn at positions 1..N; label each position with its regimen
tick_positions = range(1, len(regimens) + 1)
ax.set_xticks(tick_positions)
ax.set_xticklabels(regimens)

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
mouse_id = "l509"  # the mouse to plot; used for both the row filter and the title
progression_line = Capomulin.loc[Capomulin["Mouse ID"] == mouse_id, :]
x_axis = progression_line["Timepoint"]
y_axis = progression_line["Tumor Volume (mm3)"]

# Draw on the axes created here, and show the figure explicitly like the
# other plotting cells do.
fig1, ax1 = plt.subplots()
ax1.plot(x_axis, y_axis, linewidth=2, markersize=15)
ax1.set_title(f'Capomulin treatment of mouse {mouse_id}')
ax1.set_xlabel('Timepoint (days)')
ax1.set_ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Scatter plot of mouse weight vs. the average observed tumor volume for the
# entire Capomulin regimen.

# Per-mouse mean of weight and tumor volume across all of that mouse's rows
capomulin_rows = combined_clean[combined_clean["Drug Regimen"] == "Capomulin"]
vol_by_weight = (
    capomulin_rows
    .groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]]
    .mean()
)

plt.scatter(vol_by_weight["Weight (g)"], vol_by_weight["Tumor Volume (mm3)"])
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Capomulin Tumor Volume by Weight")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight vs. average tumor volume (Capomulin regimen).
weights = vol_by_weight["Weight (g)"]
avg_volumes = vol_by_weight["Tumor Volume (mm3)"]

# Correlation coefficient: element 0 of the pearsonr result is r
correlation = st.pearsonr(weights, avg_volumes)
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation[0],2)}")

# Linear regression, via the scipy.stats module already imported as `st`
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weights, avg_volumes)

# Fitted tumor volume for each observed weight, plus the equation text
# used to annotate the plot in the next cell
regress_values = weights * slope + intercept
line_eq = f"y = {round(slope, 2)} x + {round(intercept, 2)}"

In [None]:
# Plot the linear regression line on top of the scatter plot
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
plt.scatter(vol_by_weight["Weight (g)"], vol_by_weight["Tumor Volume (mm3)"])
plt.plot(vol_by_weight["Weight (g)"], regress_values, color='red')

# Write the fitted equation on the plot at data coordinates (20, 36)
plt.annotate(line_eq, (20, 36), fontsize=14)
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Capomulin Tumor Volume by Weight")
print(f"The r-squared is: {round(rvalue**2,3)}")
plt.show()