## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
study_results = pd.read_csv(study_results_path)
mouse_metadata = pd.read_csv(mouse_metadata_path)

# Left-join on "Mouse ID": every row of study_results is kept and gains
# the columns of the matching row in mouse_metadata
merge_df = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
merge_df.head()

In [None]:
# Count the distinct mice in the merged data.
# nunique() counts each "Mouse ID" value once and ignores missing values,
# so a single call replaces the repeated len(...unique()) computation.
num_mice = merge_df["Mouse ID"].nunique()
print(f"Number of Mice: {num_mice}")

In [None]:
# mice_df = len(mice_data)
# mice_df
# print(f"Number of Mice: {mice_df}")

In [None]:
# Find the mice that have more than one row for the same timepoint.
# duplicated() flags every row whose ("Mouse ID", "Timepoint") pair repeats
# an earlier row (by default the first occurrence is not flagged).
repeat_mask = merge_df.duplicated(subset=["Mouse ID", "Timepoint"])

# Reduce the flagged rows to the distinct mouse IDs they belong to
duplicate_mice = merge_df.loc[repeat_mask, "Mouse ID"].unique()
duplicate_mice



In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Select through duplicate_mice rather than a hard-coded ID so this cell
# stays correct if the input data (and therefore the duplicated IDs) changes.
duplicate_mice_id = merge_df.loc[merge_df["Mouse ID"].isin(duplicate_mice)]
duplicate_mice_id

In [None]:
# Create a clean DataFrame by dropping every row of the duplicated mice.
# isin() builds a boolean mask of rows belonging to duplicate_mice; `~`
# inverts it so only rows whose "Mouse ID" is NOT in that list are kept.
clean_df = merge_df[~merge_df["Mouse ID"].isin(duplicate_mice)]
clean_df.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# nunique() counts each "Mouse ID" value once and ignores missing values,
# so a single call replaces the repeated len(...unique()) computation.
num_mice_clean_data = clean_df["Mouse ID"].nunique()
print(f"Number of Mice from a Cleaned DataFrame: {num_mice_clean_data}")

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Assemble the resulting series into a single summary dataframe.

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# frame first would also try to reduce the non-numeric columns, which raises
# TypeError on pandas >= 2.0 (numeric_only now defaults to False). Grouping
# once and reusing the object also avoids rebuilding the groups five times.
tumor_by_regimen = clean_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

average = tumor_by_regimen.mean()
middle = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
stan_dev = tumor_by_regimen.std()
semi = tumor_by_regimen.sem()  # standard error of the mean

# Each Series is indexed by "Drug Regimen", so they align row-by-row.
# NOTE: the "Tumor Volume Semi" column holds the SEM (standard error of the
# mean); the label is kept unchanged so existing references still work.
stat_table = pd.DataFrame({
    "Tumor Volume Mean": average,
    "Tumor Volume Median": middle,
    "Tumor Volume Variance": variance,
    "Tumor Volume Standard Deviation": stan_dev,
    "Tumor Volume Semi": semi,
})

stat_table

In [None]:
# Same summary statistics as the table above, produced with a single
# groupby().agg() call instead of one call per statistic.

# Aggregations to apply to the tumor volume of every drug regimen
summary_stats = ["mean", "median", "var", "std", "sem"]

# Passing a {column: [functions]} mapping yields two-level column labels:
# the column name on top and one sub-column per statistic underneath.
stat_table_new = clean_df.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": summary_stats}
)
stat_table_new

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.

# value_counts() tallies the ROWS per regimen, i.e. the number of
# measurements, not the number of distinct mice, so the y axis is
# labelled accordingly.
total_num = clean_df["Drug Regimen"].value_counts()
total_num.plot(kind="bar")
plt.title("Total Number of Measurements Taken on Each Drug")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")
plt.show()

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.

# total_num holds the per-regimen row counts (measurements, not distinct
# mice): the index supplies the bar labels and the values the bar heights.
plt.bar(total_num.index.values, total_num.values)
plt.title("Total Number of Measurements Taken on Each Drug")
plt.xticks(rotation=90)  # regimen names overlap when drawn horizontally
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")
plt.show()

In [None]:
# Pie chart of the female/male split, drawn through the pandas plotting API.

# Number of rows in clean_df for each value of the "Sex" column
gender_range = clean_df["Sex"].value_counts()

# One wedge per value, annotated with its percentage to one decimal place
gender_range.plot.pie(autopct="%1.1f%%")
plt.title("Male vs Female")
plt.ylabel("Sex")
plt.show()

In [None]:
# Pie chart of the female/male split, drawn directly with pyplot.

# gender_range holds the per-sex row counts: its index supplies the wedge
# labels and its values the wedge sizes.
wedge_labels = gender_range.index.values
wedge_sizes = gender_range.values

# Annotate each wedge with its percentage to one decimal place
plt.pie(wedge_sizes, labels=wedge_labels, autopct="%1.1f%%")
plt.title("Male vs Female")
plt.ylabel("Sex")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
