## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Locations of the two input CSVs.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on a machine that has the files at exactly these locations.
mouse_metadata_path = '/Users/jacobcortez/Documents/GitHub/DS_Repos/HW_Repo/mapplotlib_challenge/Mouse_metadata.csv'
study_results_path = '/Users/jacobcortez/Documents/GitHub/DS_Repos/HW_Repo/mapplotlib_challenge/Study_results.csv'

# Load both CSVs into DataFrames (mouse metadata first, then study results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [None]:
# Preview the mouse metadata DataFrame read from mouse_metadata_path.
# As the last expression of a notebook cell, the frame is rendered as output.
mouse_metadata

In [None]:
# Preview the study results DataFrame read from study_results_path.
# As the last expression of a notebook cell, the frame is rendered as output.
study_results

In [None]:
# Combine the two tables into a single dataset, matching rows on "Mouse ID".
# how='right' keeps every row of study_results (the right-hand table), even
# when no matching metadata row exists for that mouse.
merge_df = mouse_metadata.merge(study_results, how='right', on="Mouse ID")

In [None]:
# Display the combined metadata + study results table for preview
merge_df

In [None]:
# Check for incomplete rows: count() reports the number of non-null entries
# per column, so columns with differing counts contain missing values.
merge_df.count()

In [None]:
# Inspect the dtype pandas assigned to each column of the combined table
merge_df.dtypes

In [None]:
# Flag rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
# The result is a boolean Series; the first occurrence of each pair is False.
duplicate_key_columns = ['Mouse ID', 'Timepoint']
merge_df.duplicated(subset=duplicate_key_columns)

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# A mouse is treated as a duplicate when any of its (Mouse ID, Timepoint)
# pairs occurs more than once. Every row belonging to such a mouse is shown,
# not only the repeated measurements, so the conflicting readings can be
# compared side by side.
duplicate_mouse_ids = merge_df.loc[
    merge_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()
merge_duplicates_df = merge_df.loc[merge_df['Mouse ID'].isin(duplicate_mouse_ids)]
merge_duplicates_df


In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse with repeated (Mouse ID, Timepoint) pairs has conflicting readings,
# and there is no way to tell which one is correct. All of its rows are
# therefore removed, instead of keeping whichever reading happens to come
# first (which is what drop_duplicates on the pair would do).
duplicate_mouse_ids = merge_df.loc[
    merge_df.duplicated(subset=['Mouse ID', 'Timepoint']), 'Mouse ID'
].unique()
merge_df2 = merge_df.loc[~merge_df['Mouse ID'].isin(duplicate_mouse_ids)]
merge_df2

In [None]:
# Count distinct values in the clean DataFrame: the 'Mouse ID' entry is the
# number of mice, the 'Timepoint' entry the number of distinct timepoints.
mice_count_nunique2 = merge_df2.loc[:, ['Mouse ID', 'Timepoint']].nunique()
mice_count_nunique2

## Summary Statistics

In [None]:
# Group the clean data by drug regimen.
# Despite the _df suffix this is a GroupBy object, reused by later cells.
summary_stats_df = merge_df2.groupby(by='Drug Regimen')

# Descriptive statistics for every regimen
summary_stats_df.describe()

In [None]:
# Per-regimen summary statistics of the tumor volume, one Series per statistic:
# mean, median, variance, standard deviation and standard error of the mean.
tumor_volume_by_regimen = summary_stats_df['Tumor Volume (mm3)']

tumor_volume_mean = tumor_volume_by_regimen.mean()
tumor_volume_median = tumor_volume_by_regimen.median()
tumor_volume_variance = tumor_volume_by_regimen.var()
tumor_volume_std = tumor_volume_by_regimen.std()
tumor_volume_sem = tumor_volume_by_regimen.sem()

In [None]:
# Combine the individual statistic Series into one summary table;
# each dictionary key becomes a column label.
tumor_volume_stats_df = pd.DataFrame(
    {
        'Mean': tumor_volume_mean,
        'Median': tumor_volume_median,
        'Variance': tumor_volume_variance,
        'Standard Deviation': tumor_volume_std,
        'SEM': tumor_volume_sem,
    }
)

In [None]:
# Display the summary statistics table (mean, median, variance, standard
# deviation and SEM of the tumor volume) with one row per drug regimen
tumor_volume_stats_df

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line.
# agg() with a list of statistic names yields one column per statistic and one
# row per drug regimen, computed directly from the clean measurement data.
merge_df2.groupby('Drug Regimen')['Tumor Volume (mm3)'].agg(['mean', 'median', 'var', 'std', 'sem'])

## Bar and Pie Charts

In [None]:
# Display the clean DataFrame that the charts below are built from
merge_df2

In [None]:
# Bar plot (pandas plotting API) of the number of recorded timepoints per drug
# regimen, i.e. the number of rows each regimen has in the clean data.
regimen_row_counts = merge_df2['Drug Regimen'].value_counts()
DR_bar_plot = regimen_row_counts.plot.bar(width=.5, zorder=7, color='r', align='center')

# Label the axes object returned by pandas; the grid gets a lower z-order than
# the bars so it is drawn behind them.
DR_bar_plot.set_title('Total Timepoints For All Mice Tested For Each Drug')
DR_bar_plot.set_xlabel('Drug Regimen')
DR_bar_plot.set_ylabel('Timepoints')
DR_bar_plot.grid(zorder=0)


In [None]:
# Bar plot (pyplot API) of the number of recorded timepoints per drug regimen.

# Rows per regimen, ordered from most to fewest by value_counts()
timepoints_counts = merge_df2['Drug Regimen'].value_counts()

# One bar position per regimen
x_axis = np.arange(len(timepoints_counts))
plt.bar(x_axis, timepoints_counts, width=0.5, zorder=3, color='r')

# Put the regimen names under the bars, written vertically
tick_locations = list(x_axis)
plt.xticks(tick_locations, timepoints_counts.index.values, rotation='vertical')

# Titles and axis labels
plt.title('Total Timepoints For All Mice Tested For Each Drug')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Timepoints')

# The grid's z-order is lower than the bars', so it is drawn behind them
plt.grid(zorder=0)

plt.show()


In [None]:
# Pie chart (pandas plotting API) of how the rows of the clean data split by sex.
# Counts are per measurement row, not per distinct mouse.
sex_df = merge_df2['Sex'].value_counts()
sex_pie_plot = sex_df.plot(kind='pie', autopct="%1.1f%%", title='Distribution by Sex')

# Clear the y-axis label that pandas adds to the pie by default
sex_pie_plot.set_ylabel('')

In [None]:
# Pie chart (pyplot API) of how the rows of the clean data split by sex.
sex_df = merge_df2['Sex'].value_counts()
sex_labels = sex_df.index.values

plt.pie(sex_df, labels=sex_labels, autopct="%1.1f%%")
plt.title('Distribution by Sex')

plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Last (greatest) timepoint recorded for each mouse, as a two-column frame
# ('Mouse ID', 'Max_Timepoint') ordered by that timepoint. Built in a single
# pass; the groupby is not computed twice.
max_tp_df = (
    merge_df2.groupby('Mouse ID')['Timepoint']
    .max()
    .sort_values()
    .reset_index()
    .rename(columns={'Timepoint': 'Max_Timepoint'})
)

# Attach each mouse's last timepoint to every one of its rows, so that the
# row holding the final tumor volume can be selected with
# Timepoint == Max_Timepoint.
merge_df3 = pd.merge(merge_df2, max_tp_df, on="Mouse ID")
merge_df3


In [None]:
# For each of four regimens, collect the final tumor volume of every mouse,
# report the interquartile range and the 1.5*IQR bounds, and count the
# potential outliers lying outside those bounds.

# Treatments to analyse (also the order of the collected Series)
drugs = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# One Series of final tumor volumes per treatment, filled by the loop below
drug_values = []

for drug in drugs:

    # Rows for this drug only
    drugs_df = merge_df3.loc[merge_df3['Drug Regimen'] == drug]

    # Keep each mouse's last measurement (its row at the maximum timepoint)
    drugs_final_volume_df = drugs_df.loc[drugs_df['Timepoint'] == drugs_df['Max_Timepoint']]

    # Final tumor volumes for this drug, stored for later plotting
    drugs_values_df = drugs_final_volume_df['Tumor Volume (mm3)']
    drug_values.append(drugs_values_df)

    # Interquartile range of the final volumes
    quartile = drugs_values_df.quantile([.25, .75])
    lowerq = quartile[.25]
    upperq = quartile[.75]
    iqr = upperq - lowerq
    print(f'IQR for {drug}: {iqr} ')

    # Values more than 1.5 * IQR beyond the quartiles are potential outliers
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    print(f'Lower Bound for {drug}: {lower_bound}')
    print(f'Upper Bound for {drug}: {upper_bound}')

    # Count the final volumes that fall outside the bounds
    drug_outliers_df = drugs_values_df.loc[
        (drugs_values_df < lower_bound) | (drugs_values_df > upper_bound)
    ]
    print(f'Number of {drug} outliers: {drug_outliers_df.count()}')

In [None]:
# Box plot of the final tumor volumes, one box per regimen of interest.
# Outlier points (fliers) are drawn as large red circles with a black edge.
flierprops = {
    'marker': 'o',
    'markerfacecolor': 'r',
    'markersize': 8,
    'markeredgecolor': 'black',
}
plt.boxplot(drug_values, flierprops=flierprops)

plt.title('Final Tumor Volume by Drug')
plt.ylabel('Final Tumor Volume (mm3)')

# Boxes sit at x positions 1..4; label them with the regimen names
box_labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']
plt.xticks(list(range(1, len(box_labels) + 1)), box_labels)

plt.show()

## Line and Scatter Plots

In [None]:
# Line plot of tumor volume over time for one mouse (ID 's185'), which the
# plot title identifies as Capomulin-treated.
is_selected_mouse = merge_df2['Mouse ID'] == 's185'
mouse_capomulin = merge_df2.loc[is_selected_mouse]

timepoints = mouse_capomulin['Timepoint']
tumor_volumes = mouse_capomulin['Tumor Volume (mm3)']
plt.plot(timepoints, tumor_volumes, marker='o')

plt.title("Capomulin Treatment of Mouse s185")
plt.xlabel("Time (days)")
plt.ylabel("Tumor Volume (mm3)")

plt.show()


In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Measurements taken under Capomulin only
capomulin_df = merge_df2.loc[merge_df2['Drug Regimen'] == 'Capomulin']

# Mean tumor volume per mouse, as a ('Mouse ID', 'avg_tumor_vol') frame
mean_volume_per_mouse = capomulin_df.groupby('Mouse ID')['Tumor Volume (mm3)'].mean().sort_values()
capomulin_avgvol_df = mean_volume_per_mouse.rename('avg_tumor_vol').reset_index()

# Attach the per-mouse mean to every Capomulin row, then collapse to the
# distinct (weight, average volume) pairs.
# Note that capomulin_avgvol_df is rebound to the merged frame here.
capomulin_avgvol_df = pd.merge(capomulin_df, capomulin_avgvol_df, on='Mouse ID')
capomulin_final_avgvol_df = capomulin_avgvol_df[['Weight (g)', 'avg_tumor_vol']].drop_duplicates()

x = capomulin_final_avgvol_df['Weight (g)']
y = capomulin_final_avgvol_df['avg_tumor_vol']

# Draw the scatter plot from the collapsed frame
plt.scatter(x, y)
plt.title('Average Tumor Volume by Weight')
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# Reuse the (weight, average tumor volume) table built in the scatter-plot
# cell. It must not be rebuilt by merging capomulin_df with
# capomulin_avgvol_df again: that frame already holds the merged result, so a
# second merge suffixes the overlapping columns (_x/_y) and the
# 'Weight (g)' lookup raises KeyError.
x = capomulin_final_avgvol_df['Weight (g)']
y = capomulin_final_avgvol_df['avg_tumor_vol']

# Pearson correlation; element 0 of the result is the coefficient
correlation = st.pearsonr(x, y)
print(f'The correlation between weight and average tumor volume with Capomulin regimen is {round(correlation[0],2)}.')

# Least-squares fit of average tumor volume on weight.
# linregress is reached through the scipy.stats alias `st`, the only form in
# which the notebook imports it.
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(x, y)
regress_values = x * slope + intercept
line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))

# Scatter of the data with the fitted line and its equation drawn in red
plt.scatter(x, y)
plt.plot(x, regress_values, 'r-')
plt.annotate(line_eq, (20, 37), fontsize=15, color='red')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')
plt.title('Average Tumor Volume by Weight')

plt.show()

