## Observations and Insights 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np 

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mice_data_df=pd.merge(mouse_metadata, study_results, on='Mouse ID', how='outer')

# Display the data table for preview
mice_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
# Checking the number of mice.
number_of_mice = len(mice_data_df)
number_of_mice

1893

In [4]:
number_of_mice = mice_data_df['Mouse ID'].unique()
len(number_of_mice)

249

In [5]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
group_mice = mice_data_df.groupby('Drug Regimen')
duplicate_mice_df = group_mice[['Timepoint']].count()
duplicate_mice_df

Unnamed: 0_level_0,Timepoint
Drug Regimen,Unnamed: 1_level_1
Capomulin,230
Ceftamin,178
Infubinol,178
Ketapril,188
Naftisol,186
Placebo,181
Propriva,161
Ramicane,228
Stelasyn,181
Zoniferol,182


In [6]:
# Optional: Get all the data for the duplicate mouse ID. 



In [7]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.  
mice_data_df.sort_values("Mouse ID", inplace = True) 
clean_data_df = mice_data_df.drop_duplicates(subset = 'Mouse ID', keep = 'last') 
clean_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
454,a203,Infubinol,Female,20,23,0,45.000000,0
464,a251,Infubinol,Female,21,25,0,45.000000,0
474,a262,Placebo,Female,17,29,0,45.000000,0
493,a275,Ceftamin,Female,20,28,45,62.999356,3
495,a366,Stelasyn,Female,16,29,5,48.951368,0
...,...,...,...,...,...,...,...,...
1862,z435,Propriva,Female,12,26,10,48.710661,0
76,z578,Ramicane,Male,11,16,5,42.188787,0
1867,z581,Infubinol,Female,24,25,20,51.807944,2
1877,z795,Naftisol,Female,13,29,20,53.280657,1


In [8]:
# Checking the number of mice in the clean DataFrame.
number_of_mice = len(clean_data_df)

## Summary Statistics

In [17]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
#Making a table with only Tumor Volume and Drug Regimen 
tumor_volume = clean_data_df[['Tumor Volume (mm3)','Drug Regimen']]
regimen_df = tumor_volume.groupby('Drug Regimen')

#Taking the mean of tumor volume 
tumor_mean = regimen_df.mean()

#Taking the median of tumor volume 
tumor_median = regimen_df.median()

#Merging into one summary table 
sum_data_df = pd.merge(tumor_mean, tumor_median, on='Drug Regimen', how='outer')

sum_data_df = sum_data_df.rename(columns={"Tumor Volume (mm3)_x" : "Mean", "Tumor Volume (mm3)_y" : "Median"})

#Taking the Variance of tumor volume 
tumor_var = regimen_df.var()

#Taking the Standard Deviation of tumor volume 
tumor_std = regimen_df.std()

#Merging into summary table
sum_data_df = pd.merge(sum_data_df, tumor_var, on='Drug Regimen', how='outer')
sum_data_df = pd.merge(sum_data_df, tumor_std, on='Drug Regimen', how='outer')

#Renaming the Columns 
sum_data_df = sum_data_df.rename(columns={"Tumor Volume (mm3)_x" : "Variance", "Tumor Volume (mm3)_y" : "Standard Deviation"})

#Taking the SEM of tumor volume 
tumor_sem = regimen_df.sem()

#Merging into one summary table 
sum_data_df = pd.merge(sum_data_df, tumor_sem, on='Drug Regimen', how='outer')
sum_data_df

#Final Summary Table 
#final_sum_data_df = sum_data_df.reset_index()
#final_sum_data_df


Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,Tumor Volume (mm3)
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,42.212714,45.0,26.464559,5.144372,1.028874
Ceftamin,52.769195,50.70128,52.760513,7.263643,1.452729
Infubinol,51.243068,50.336553,33.8533,5.818359,1.163672
Ketapril,54.302915,50.659581,88.146614,9.388643,1.877729
Naftisol,52.12872,50.002314,56.703087,7.530145,1.506029
Placebo,51.010353,49.000125,42.91105,6.550653,1.310131
Propriva,50.549618,48.710661,38.379616,6.195128,1.239026
Ramicane,41.031254,41.714619,18.414583,4.291222,0.858244
Stelasyn,52.49355,51.522841,50.882939,7.133228,1.456064
Zoniferol,52.7291,49.934725,61.644001,7.851369,1.570274


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
mice_per_df = clean_data_df.groupby('Drug Regimen')
number_of_mice_df = mice_per_df.count()
number_of_mice_df = number_of_mice_df['Mouse ID']

#Merging count with sum data and eliminating the index
mice_df = pd.merge(sum_data_df, number_of_mice_df, on = 'Drug Regimen') 
mice_df = mice_df.reset_index()

mice_df = mice_df.rename(columns={'Mouse ID':'Number of Mice'})

x_values = mice_df['Drug Regimen']

y_values = mice_df['Number of Mice']

#Making Bar Plot 
mice_df.plot(x = 'Drug Regimen', y = 'Number of Mice', kind='bar', title= 'Mice per Treatment')


In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.
x_axis = np.arange(0, len(x_values))

tick_locations = []

for x in x_axis:
    tick_locations.append(x)

y_axis = list(y_values)

plt.title('Rat Count Per Drug')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Rats')

plt.bar(x_axis, y_axis, facecolor = 'indigo', alpha= 0.25, align= 'center')
plt.xticks(tick_locations, x_values, rotation = 'vertical')
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
sex_group_df = mice_data_df.groupby('Sex')
sex_group_count = sex_group_df.count()
sex_group_count
#sex_group_df = sex_group_df.reset_index()
#sex_group_count.plot(x = 'Sex', y = 'Mouse ID', kind='pie')


In [1]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
sex_count = sex_group_df['Mouse ID']
gender = sex_group_df['Sex']
color = ['purple','yellow']
explode = [0.1,0]

plt.pie(sex_count, explode=explode, labels=gender, colors=color, autopct = '%1.1f%%')

plt.axis('equal')

NameError: name 'sex_group_df' is not defined

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
capomulin_df = mice_data_df.loc[(mice_data_df['Drug Regimen'] == 'Capomulin')]
b128_df = capomulin_df.loc[(capomulin_df['Mouse ID'] == 'b128')]
time = np.arange(0, 50, 5)

#Finding the x and y values 
timepoint = b128_df[['Timepoint']]
tumor_volume = b128_df[['Tumor Volume (mm3)']]

#Making the scatter plot
tumor_volume, = plt.plot(time, tumor_volume, marker="o", color = 'orange', label = 'Tumor Volume (mm3)')

#Adding lables and titles 
plt.title('Tumor Volume Over Time')
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')

#Setting Limits
plt.xlim(0,45)
plt.ylim(0,70)

#Setting a legend 
plt.legend(handles=[tumor_volume], loc='best')

#Adding a grid
plt.grid()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
#Finding the average of the tumor volume over time 
mouse_id_df = capomulin_df.groupby('Mouse ID')
average_tumor_df = mouse_id_df.mean()

#Finding the x and y variables 
average_tumor = average_tumor_df['Tumor Volume (mm3)']
weight = average_tumor_df['Weight (g)']

#Making the scatter plot
plt.scatter(weight, average_tumor, marker = 'o', facecolors = 'purple', edgecolors = 'yellow')

#Giving names to labels
plt.title('Weight v Tumor Size')
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')

#Setting Limits
plt.xlim(10,28)
plt.ylim(0,50)
plt.grid()

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
(slope, intercept,  rvalue, pvalue, stderr) = sts.linregress(weight, average_tumor)
lin_reg = ('y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2)))
lin_reg

In [None]:
correlation = np.corrcoef(average_tumor, weight, rowvar=True)
correlation_coefficient = correlation[0][1]
correlation_coefficient 

In [None]:
#Making the scatter plot
plt.scatter(weight, average_tumor, marker = 'o', facecolors = 'purple', edgecolors = 'yellow')

#Giving names to labels
plt.title('Weight v Tumor Size')
plt.xlabel('Weight (g)')
plt.ylabel('Tumor Volume (mm3)')

#Setting Limits
plt.xlim(10,28)
plt.ylim(0,50)

x_values = weight
regress_values = x_values * slope + intercept 
plt.plot(x_values, regress_values, 'r-')
plt.grid()