## Observations and Insights 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Locations of the two study CSV files (relative to the working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the mouse metadata first, then the study results, each into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared 'Mouse ID' column
# (merge's default inner join keeps IDs present in both tables)
mouse_merge_df = mouse_metadata.merge(study_results, on='Mouse ID')

# Preview the first rows of the combined table
mouse_merge_df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/Mouse_metadata.csv'

In [None]:
# Number of distinct mouse IDs in the merged data:
# drop repeated IDs, then count what is left
unique_mice_df = len(mouse_merge_df['Mouse ID'].drop_duplicates())
unique_mice_df


In [None]:
# Show the rows whose (Mouse ID, Timepoint) pair repeats an earlier row;
# keep='first' leaves the first occurrence of each pair unmarked.
mouse_merge_df[
    mouse_merge_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep='first')
]


In [None]:
# Optional: show every row recorded for mouse ID 'g989'.
mouse_merge_df[mouse_merge_df['Mouse ID'].eq('g989')]


In [None]:
# Build the clean DataFrame: keep every row except those belonging to mouse 'g989'.
cleaned_mouse_df = mouse_merge_df[~mouse_merge_df['Mouse ID'].isin(['g989'])]
cleaned_mouse_df.head()


In [None]:
# Number of distinct mouse IDs left in the clean DataFrame
# (dropna=False counts a missing ID as its own value, matching len(unique())).
cleaned_unique_mice_df = cleaned_mouse_df['Mouse ID'].nunique(dropna=False)
cleaned_unique_mice_df


## Summary Statistics

In [None]:
# Summary statistics of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM (standard error of the mean).

# Group the clean data by regimen once; grouped_mouse_df is reused by the
# aggregation cell that follows.
grouped_mouse_df = cleaned_mouse_df.groupby(['Drug Regimen'])

# Select the tumor-volume column a single time so each statistic below
# reads from the same per-regimen grouping.
tumor_volume_by_regimen = grouped_mouse_df['Tumor Volume (mm3)']

tumor_mean = tumor_volume_by_regimen.mean()
tumor_median = tumor_volume_by_regimen.median()
tumor_var = tumor_volume_by_regimen.var()
std_dev = tumor_volume_by_regimen.std()
tumor_sem = tumor_volume_by_regimen.sem()

# Assemble the resulting series into a single summary dataframe
# (one row per regimen, one column per statistic).
summary_df = pd.DataFrame({
    'Mean Tumor Volume': tumor_mean,
    'Median Tumor Volume': tumor_median,
    'Tumor Variance': tumor_var,
    'Std Dev': std_dev,
    'Tumor SEM': tumor_sem,
})
summary_df

In [None]:
# Summary statistics table (mean, median, variance, standard deviation, SEM of the
# tumor volume per regimen), rebuilt from the series computed in the previous cell.
summary_df = pd.DataFrame(dict(zip(
    ['Mean Tumor Volume', 'Median Tumor Volume', 'Tumor Variance', 'Std Dev', 'Tumor SEM'],
    [tumor_mean, tumor_median, tumor_var, std_dev, tumor_sem],
)))

# The same five statistics produced in a single aggregation call on the grouped data.
agg_grouped_mouse_df = grouped_mouse_df.aggregate(
    {'Tumor Volume (mm3)': ['mean', 'median', 'var', 'std', 'sem']}
)
agg_grouped_mouse_df


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.

# Count the distinct mouse IDs within each regimen; the result is a Series
# indexed by regimen name with one count per regimen.
mice_drug_df = cleaned_mouse_df.groupby(['Drug Regimen'])
unique_mice_drug = mice_drug_df['Mouse ID'].nunique()

# Draw the counts with the pandas plotting API; the Series index supplies the bar labels.
mice_bar_ax = unique_mice_drug.plot(kind='bar')
mice_bar_ax.set_xlabel('Drug Regimen')
mice_bar_ax.set_ylabel('Number of Mice Tested')

plt.show()




In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.

# Count distinct mouse IDs per regimen directly from the clean data. Taking both
# the labels and the heights from the same Series keeps them aligned and avoids
# a hand-typed regimen list drifting from the names in the data.
mice_per_regimen = cleaned_mouse_df.groupby('Drug Regimen')['Mouse ID'].nunique()
x_axis = mice_per_regimen.index.tolist()
y_axis = mice_per_regimen.tolist()

plt.bar(x_axis, y_axis)
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Mice Tested')
# Regimen names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)

plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
###


# Start by getting the last (greatest) timepoint for each mouse

## quartiles = 
# lowerq = quartiles[0.25]
# upperq = quartiles[0.75]
# iqr = upperq - lowerq


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for the for loop (and later for plot labels)

## treatments = ['Drug Regimen']



# Create empty list to fill with tumor vol data (for plotting)

## tumor_vol_data = []


# Calculate the IQR and quantitatively determine if there are any potential outliers. 


## iqr = upperq - lowerq


    # Locate the rows which contain mice on each drug and get the tumor volumes .loc
    
    
    # Add the subset's tumor volume series to the empty tumor vol list
    
    
    # Determine outliers using upper and lower bounds 
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# one boxplot with all 4 regimens

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin





In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

## plt.scatter(who_data.iloc[:,1],who_data.iloc[:,8])
## plt.xlabel('Income Per Capita')
## plt.ylabel('Average Alcohol Consumed Per Person Per Year (L)')
## plt.show()



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

## income = who_data.iloc[:,1]
## alcohol = who_data.iloc[:,8]
## correlation = st.pearsonr(income,alcohol)
## print(f"The correlation between both factors is {round(correlation[0],2)}")




## x_values = housing_data['RM']
## y_values = housing_data['MEDV']
## (slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
## regress_values = x_values * slope + intercept
## line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
## plt.scatter(x_values,y_values)
## plt.plot(x_values,regress_values,"r-")
## plt.annotate(line_eq,(6,10),fontsize=15,color="red")
## plt.xlabel('Rooms in House')
## plt.ylabel('Median House Prices ($1000)')
## plt.show()