## Observations and Insights 

Markdown and LaTeX: $\alpha^2$

# Pymaceuticals

    After merging the **mouse_metadata** dataset with the **study_results** dataset, we found that one mouse, **g989**, had duplicate records (repeated Mouse ID / Timepoint combinations). All data for mouse **g989** were dropped from the dataset. The original merged dataset contained 249 mice; after removing the duplicated mouse, the mouse count is 248.
    

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress


# Locations of the two input CSV files, relative to the notebook.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the study measurements.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the metadata onto the study results so every measurement row
# carries the attributes of its mouse.
merge_df = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the combined table.
merge_df.head()

In [None]:
# Count the distinct mouse IDs present in the merged data.
unique_mouse_ids = merge_df["Mouse ID"].unique()
num_mice = len(unique_mouse_ids)
print(f"Number of Mice: {num_mice}")

In [None]:
# mice_df = len(mice_data)
# mice_df
# print(f"Number of Mice: {mice_df}")

In [None]:
# Find the mice whose (Mouse ID, Timepoint) pair appears more than once.
# `duplicated` flags the repeat occurrences; the IDs on those rows are the
# mice with duplicate data.
repeated_rows = merge_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = merge_df.loc[repeated_rows, "Mouse ID"].unique()
duplicate_mice



In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# Use the IDs detected in `duplicate_mice` rather than a hard-coded mouse ID,
# so this cell stays correct if the input data changes.
duplicate_mice_id = merge_df.loc[merge_df["Mouse ID"].isin(duplicate_mice)]
duplicate_mice_id

In [None]:
# Build the cleaned DataFrame: keep only the rows whose Mouse ID is NOT one
# of the duplicated mice.
is_duplicate_mouse = merge_df["Mouse ID"].isin(duplicate_mice)
clean_df = merge_df[~is_duplicate_mouse]
clean_df.head()

In [None]:
# Count the distinct mouse IDs remaining after the duplicate mouse is removed.
remaining_mouse_ids = clean_df["Mouse ID"].unique()
num_mice_clean_data = len(remaining_mouse_ids)
print(f"Number of Mice from a Cleaned DataFrame: {num_mice_clean_data}")

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard
# deviation, and SEM of the tumor volume for each drug regimen.

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# frame first would also try to reduce the string columns, which raises a
# TypeError on pandas >= 2.0 and wastes work on older versions.
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

average = tumor_by_regimen.mean()
middle = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
stan_dev = tumor_by_regimen.std()
semi = tumor_by_regimen.sem()  # standard error of the mean

# Assemble the resulting series into a single summary DataFrame
# (one row per drug regimen).
stat_table = pd.DataFrame({
    "Tumor Volume Mean": average,
    "Tumor Volume Median": middle,
    "Tumor Volume Variance": variance,
    "Tumor Volume Standard Deviation": stan_dev,
    "Tumor Volume Semi": semi,
})

stat_table

In [None]:
# Same summary statistics (mean, median, variance, standard deviation, SEM of
# the tumor volume per drug regimen), produced with a single `agg` call.
summary_funcs = ["mean", "median", "var", "std", "sem"]
stat_table_new = clean_df.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": summary_funcs}
)
stat_table_new

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each
# drug regimen using pandas.

# value_counts() counts rows, i.e. measurements, per regimen.
total_num = clean_df["Drug Regimen"].value_counts()
total_num.plot(kind="bar")
plt.title("Total Number of Measurements Taken on Each Drug")
# The bars are measurement counts, so label the axis accordingly
# (not "unique mice").
plt.ylabel("Number of Measurements")
plt.show()

In [None]:
# Generate a bar plot showing the total number of measurements taken on each
# drug regimen using pyplot.
plt.bar(total_num.index.values, total_num.values)
plt.title("Total Number of Measurements Taken on Each Drug")
# Regimen names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
# `total_num` holds row (measurement) counts per regimen, so the axis is a
# measurement count rather than a count of unique mice.
plt.ylabel("Number of Measurements")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice
# using pandas.

# clean_df holds several rows per mouse (one per recorded timepoint), so
# reduce it to one row per mouse before counting; otherwise the chart shows
# the share of measurements rather than the share of mice.
gender_range = clean_df.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

gender_range.plot(kind="pie", autopct="%1.1f%%")
plt.title("Male vs Female")
plt.ylabel("Sex")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice
# using pyplot, from the counts already stored in `gender_range`.
sex_labels = gender_range.index.values
sex_counts = gender_range.values

plt.pie(sex_counts, labels=sex_labels, autopct="%1.1f%%")
plt.title("Male vs Female")
plt.ylabel("Sex")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Goal of this section: the final tumor volume of each mouse across four
# treatment regimens (Capomulin, Ramicane, Infubinol, Ceftamin).
# Plan: find the last (greatest) timepoint for each mouse, then merge that
# back onto the original data to read the tumor volume at that timepoint.

# Exploratory view: all measurements ordered from latest to earliest timepoint.
# NOTE: `timepoint_df` is reassigned by later cells.
timepoint_df = clean_df.sort_values("Timepoint", ascending=False)
timepoint_df


In [None]:
# Exploratory view: one row per mouse, keeping the first row encountered for
# each Mouse ID in clean_df (pandas' default for drop_duplicates).
# NOTE: `timepoint_df` is reassigned by the groupby cell that follows.
timepoint_df = clean_df.drop_duplicates(subset="Mouse ID")
timepoint_df

In [None]:
# Last (greatest) timepoint recorded for each mouse, as a two-column
# DataFrame with "Mouse ID" and "Timepoint".
last_timepoint = clean_df.groupby("Mouse ID")["Timepoint"].max()
timepoint_df = last_timepoint.reset_index()
timepoint_df

In [None]:
# Attach the full measurement row to each (Mouse ID, last Timepoint) pair, so
# every mouse ends up with its tumor volume at its final timepoint.
new_merged_data = pd.merge(
    timepoint_df,
    clean_df,
    how="left",
    on=["Mouse ID", "Timepoint"],
)
new_merged_data

In [None]:
# Treatments of interest (used for the loop here and later as plot labels).
four_drugs_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Final tumor volumes per treatment, in the same order as four_drugs_list
# (consumed by the box plot).
tumor_volume_list = []

# Calculate the IQR and quantitatively determine if there are any potential
# outliers for each treatment.
for drug in four_drugs_list:

    # Final tumor volumes of the mice on this drug.
    final_tumor_volume = new_merged_data.loc[
        new_merged_data["Drug Regimen"] == drug, "Tumor Volume (mm3)"
    ]
    tumor_volume_list.append(final_tumor_volume)

    # Quartiles and interquartile range.
    quartiles = final_tumor_volume.quantile([0.25, 0.5, 0.75])
    lower_quartile = quartiles[0.25]
    upper_quartile = quartiles[0.75]
    iqr = upper_quartile - lower_quartile

    # Values beyond 1.5 * IQR from the quartiles count as potential outliers.
    lower_bound = lower_quartile - (1.5 * iqr)
    upper_bound = upper_quartile + (1.5 * iqr)
    outliers = final_tumor_volume.loc[
        (final_tumor_volume > upper_bound) | (final_tumor_volume < lower_bound)
    ]

    # Report the COUNT the message promises (previously the whole Series was
    # interpolated into the sentence), then list the values, if any.
    print("______________________________________________________________")
    print(f"Number of potential outliers for {drug} is: {len(outliers)}.")
    if not outliers.empty:
        print(f"Potential outlier tumor volumes (mm3): {outliers.tolist()}")
    print(" ")

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens
# of interest, one box per entry of four_drugs_list.
fig1, ax1 = plt.subplots()
ax1.boxplot(tumor_volume_list, labels=four_drugs_list)
ax1.set_ylabel("Tumor Volume")
ax1.set_title("Final Tumor Volume")
plt.show()

## Line and Scatter Plots

In [None]:
#capomulin_mouse.head()

In [None]:
# Line plot of tumor volume vs. time point for one mouse treated with
# Capomulin (mouse "b128").
on_capomulin = clean_df["Drug Regimen"] == "Capomulin"
capomulin_mouse = clean_df.loc[on_capomulin]
mouse = capomulin_mouse.loc[capomulin_mouse["Mouse ID"] == "b128"]

plt.plot(
    mouse["Timepoint"],
    mouse["Tumor Volume (mm3)"],
    color="red",
    label="Tumor Volume over time",
)
plt.title("Tumor Volume over time using Capomulin")
plt.xlabel("Time")
plt.ylabel("Tumor Volume (mm3)")
# Fixed axis ranges for this chart.
plt.xlim(-2, 50)
plt.ylim(20, 60)
plt.legend()
plt.show()

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the
# Capomulin regimen.

# Per-mouse averages of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns would otherwise raise a
# TypeError.
average_tumor_volume = capomulin_mouse.groupby("Mouse ID").mean(numeric_only=True)

# These two series are reused by the correlation and regression cells.
x_axis = average_tumor_volume["Weight (g)"]
y_axis = average_tumor_volume["Tumor Volume (mm3)"]

plt.title("Weight versus Tumor Volume using Capomulin")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.ylim(30, 50)
plt.xlim(10, 30)

plt.scatter(
    x_axis,
    y_axis,
    marker="o",
    facecolors="blue",
    edgecolors="red",
    alpha=0.75,
    label="Weight versus Tumor Volume",
)
plt.legend()
plt.show()

## Correlation and Regression

In [None]:
# Pearson correlation between mouse weight and average tumor volume for the
# Capomulin regimen (x_axis / y_axis come from the scatter-plot cell).
correlation = st.pearsonr(x_axis, y_axis)
pearson_r = correlation[0]  # first element is the correlation coefficient
rounded_correlation = round(pearson_r, 4)
print(f"The correlation between weight and size is: {rounded_correlation}")

In [None]:
# (slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis, y_axis)
# regress_values = x_axis * slope + intercept
# line_equation = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
# plt.title("Weight per Tumor Volume using Capomulin")

In [None]:
# Linear regression of average tumor volume on mouse weight for the Capomulin
# regimen. The fit uses x_axis / y_axis (per-mouse Capomulin averages from the
# scatter-plot cell); the previously defined x_values / y_values were never
# used and pulled from all regimens, so they have been removed.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis, y_axis)

# Fitted tumor volume for each observed weight.
regress_values = x_axis * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.title("Weight per Tumor Volume using Capomulin")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")

# Observed points, fitted line, and the line's equation.
plt.scatter(x_axis, y_axis)
plt.plot(x_axis, regress_values, "r-")
plt.annotate(line_eq, (20, 34), fontsize=10, color="red")

plt.ylim(30, 50)
plt.xlim(10, 30)
plt.show()