## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np

# Locations of the two CSV inputs, relative to the notebook's working directory.
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata table and the study results table.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on "Mouse ID". The outer join keeps IDs that appear in
# only one of the two files instead of silently dropping them.
merge_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")

# Preview the combined table.
merge_df

In [None]:
# Checking the number of mice.
# merge_df can hold several rows for the same mouse (one per study-result row),
# so count distinct IDs; .count() would return the number of rows instead.
number_mice = merge_df["Mouse ID"].nunique()
number_mice

In [None]:
# Find the rows that repeat an earlier (Mouse ID, Timepoint) pair.
# duplicated() leaves the first occurrence of each pair unflagged, so only the
# later repeats are returned here.
is_repeated_pair = merge_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = merge_df[is_repeated_pair]
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# duplicated() on "Mouse ID" alone flags every repeat row of every mouse, which
# is not what is wanted. Instead, collect the IDs that have a repeated
# (Mouse ID, Timepoint) pair and then pull every row belonging to those IDs.
duplicate_ids = merge_df.loc[
    merge_df.duplicated(["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
all_duplicate_mice = merge_df[merge_df["Mouse ID"].isin(duplicate_ids)]
all_duplicate_mice

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# drop_duplicates("Mouse ID") would keep a single row per mouse and discard
# every other measurement. The goal is the opposite: keep all rows, except
# those of any mouse whose (Mouse ID, Timepoint) pairs are not unique.
ids_to_drop = merge_df.loc[
    merge_df.duplicated(["Mouse ID", "Timepoint"]), "Mouse ID"
].unique()
cleaned_df = merge_df[~merge_df["Mouse ID"].isin(ids_to_drop)]
cleaned_df

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct IDs so the result is a number of mice, not a number of rows.
cleaned_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen: mean, median, variance,
# standard deviation and SEM.
#
# Step 1: group the measurements by regimen. The individual statistics are
# computed from this grouping and then assembled into one summary DataFrame.
#
# NOTE(review): the grouping is built from merge_df, i.e. the table before any
# duplicate handling — confirm whether cleaned_df was intended here.
grouped_drugs = merge_df.groupby(by=["Drug Regimen"])

In [None]:
# Select the tumor-volume column once, then derive each statistic from it.
# Every result is a Series indexed by drug regimen.
tumor_by_regimen = grouped_drugs["Tumor Volume (mm3)"]

average_tumor = tumor_by_regimen.mean()
median_tumor = tumor_by_regimen.median()
var_tumor = tumor_by_regimen.var()
std_tumor = tumor_by_regimen.std()
sem_tumor = tumor_by_regimen.sem()


In [None]:
# Assemble the per-regimen Series into one table, one column per statistic.
summary_columns = {
    "Mean": average_tumor,
    "Median": median_tumor,
    "Variance": var_tumor,
    "Standard Deviation": std_tumor,
    "SEM": sem_tumor,
}
summary_stats_df = pd.DataFrame(summary_columns)
summary_stats_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Bar plot (pandas plotting API) of how many measurements were recorded for
# each drug regimen.

# One entry per regimen, ordered from most to fewest rows.
count_drugs = merge_df["Drug Regimen"].value_counts()

drug_chart = count_drugs.plot.bar(title="Measurements by Drug Regimen")

# Axis labels are set on the Axes object returned by the plot call.
drug_chart.set_xlabel("Drug Regimen")
drug_chart.set_ylabel("Measurements")

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
drug_count = merge_df["Drug Regimen"].value_counts()

# Regimen names in the same order as the bar heights. Taking them from the
# value_counts() index guarantees each tick label matches its bar.
drug_list = drug_count.index.tolist()

# One bar position (and one tick) per regimen.
num_drugs = np.arange(len(drug_count))
tick_locations = list(num_drugs)

plt.bar(num_drugs, drug_count, color='b', alpha=0.5, align="center")
plt.xticks(tick_locations, drug_list, rotation="vertical")

# Add a title and axis labels
plt.title("Measurements by Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Measurements")

In [None]:
# Pie chart (pandas plotting API) of the "Sex" column of merge_df.
# value_counts() tallies rows per category; autopct prints each share as a
# percentage with one decimal place.
gender_counts = merge_df["Sex"].value_counts()

pandas_pie = gender_counts.plot.pie(autopct="%1.1f%%")


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
sizes = merge_df["Sex"].value_counts()

# value_counts() orders categories by frequency, so take the labels from its
# index rather than hard-coding them. A fixed ["Male", "Female"] list would be
# attached to the wrong wedges whenever "Female" is the larger group.
labels = sizes.index.tolist()

colors = ["blue", "orange"]

# No wedge is pulled out of the pie.
explode = (0, 0)

plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%", shadow=True)
plt.title("Gender Distribution")

## Quartiles, Outliers and Boxplots

In [None]:
# First step towards the final tumor volume of each mouse for four treatment
# regimens (Capomulin, Ramicane, Infubinol, Ceftamin): attach every mouse's
# last timepoint to each of its rows.

# Greatest timepoint recorded for each mouse, as a one-column DataFrame
# indexed by "Mouse ID".
tumorvolume_groups = merge_df.groupby(by=["Mouse ID"])
max_tumor_df = tumorvolume_groups["Timepoint"].max().to_frame()

# Merge the per-mouse maximum back onto the full table. Both inputs carry a
# "Timepoint" column, so the merge suffixes them: "Timepoint_x" is the row's
# own timepoint from merge_df and "Timepoint_y" is the per-mouse maximum.
merge2_df = pd.merge(merge_df, max_tumor_df, on="Mouse ID")

# Give the per-mouse maximum a descriptive name ("Timepoint_x" is left as is).
merge2_df = merge2_df.rename(columns={"Timepoint_y": "Last Timepoint"})
merge2_df

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
volume_data = []

# Keep only the row holding each mouse's final measurement. In merge2_df the
# row's own timepoint is the column "Timepoint_x" (the merge suffixed it), and
# "Last Timepoint" is that mouse's greatest timepoint.
last_timepoint_df = merge2_df[merge2_df["Timepoint_x"] == merge2_df["Last Timepoint"]]

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:

    # Locate the rows which contain mice on this drug and get the tumor volumes
    on_treatment = last_timepoint_df["Drug Regimen"] == treatment
    volumes = last_timepoint_df.loc[on_treatment, "Tumor Volume (mm3)"]

    # Add this regimen's final volumes to the plotting data
    volume_data.append(volumes)

    # Quartiles are computed per regimen, on the Series of final volumes
    quartiles = volumes.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    lower_bound = lowerq - 1.5 * iqr
    upper_bound = upperq + 1.5 * iqr
    outliers = volumes[(volumes < lower_bound) | (volumes > upper_bound)]

    print("{}: IQR={:.2f}, bounds=({:.2f}, {:.2f}), potential outliers={}".format(
        treatment, iqr, lower_bound, upper_bound, outliers.tolist()))
In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volumes for Each Mouse by Drug Regimen")
ax1.set_ylabel("Final Tumor Volume")
ax1.boxplot(volume_data)

# volume_data holds one entry per regimen, in the order of `treatments`. Name
# the boxes after their regimen instead of the default 1..n tick labels.
ax1.set_xticklabels(treatments)
ax1.set_xlabel("Drug Regimen")
plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume against timepoint for a single mouse.
# NOTE(review): mouse "s185" is assumed to be on the Capomulin regimen — confirm
# against the data.
is_chosen_mouse = merge_df["Mouse ID"] == "s185"
mouse_choice = merge_df.loc[is_chosen_mouse]

# Timepoints go on the x-axis, tumor volumes on the y-axis.
x_axis = mouse_choice["Timepoint"]
y_axis = mouse_choice["Tumor Volume (mm3)"]

plt.plot(x_axis, y_axis)
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Tumor Volume (mm3) over Time for Mouse s185")

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
capomulin_regimen = merge_df[merge_df["Drug Regimen"] == "Capomulin"]

# Average per mouse, so that every point on the plot is one mouse. Taking the
# mean of the whole column yields a single number, which cannot be plotted
# against a full column of weights.
# NOTE(review): weight is presumably constant for a given mouse, in which case
# the per-mouse mean simply returns that weight — confirm.
capomulin_per_mouse = capomulin_regimen.groupby("Mouse ID")[
    ["Weight (g)", "Tumor Volume (mm3)"]
].mean()

x_axis_capo = capomulin_per_mouse["Weight (g)"]
y_axis_capo = capomulin_per_mouse["Tumor Volume (mm3)"]

plt.scatter(x_axis_capo, y_axis_capo, marker="o", facecolors="red", edgecolors="black")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Average Tumor Volume vs. Mouse Weight (Capomulin)")

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
