## Observations and Insights 

### 1.	There is a positive correlation between the mouse weight and the average tumor volume.

### 2.	The Drug Regimen “Capomulin” has the maximum mice number followed by the drug Regimen “Ramicane”.  The Drug Regimen “Propriva” has the smallest number of mice. There is a total number of 249 mice. 


### 3.	There are more male mice than female. There is 51% of male mice and there is 49% of female mice. 


In [127]:
%matplotlib notebook

In [128]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [129]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined = pd.merge(mouse_metadata, study_results, on=('Mouse ID'))

# Display the data table for preview
combined.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [130]:
# Checking the number of mice.
mice = len(combined['Mouse ID'].value_counts())
print(mice)

249


In [131]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice = combined.loc[combined.duplicated(subset=['Mouse ID', 'Timepoint',]),'Mouse ID'].unique()
print(duplicate_mice)

['g989']


In [132]:
# Optional: Get all the data for the duplicate mouse ID. 
duplicate_mice_data = combined.loc[combined['Mouse ID'] == 'g989']
duplicate_mice_data.head(20)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [133]:
combined.rename(columns = {'Mouse ID':'Mouse_ID'}, inplace = True)

In [134]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean = combined[combined.Mouse_ID != 'g989']
clean.head()

Unnamed: 0,Mouse_ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [135]:
clean.to_csv("clean.csv")

In [136]:
# Checking the number of mice in the clean DataFrame.
mice_number = len(clean['Mouse_ID'].value_counts())
print(mice_number)

248


## Summary Statistics

In [137]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

mean = clean.groupby(['Drug Regimen']).mean()['Tumor Volume (mm3)']
mean_df = mean.to_frame()
mean_df.rename(columns = {'Tumor Volume (mm3)':'Mean'}, inplace = True)


median = clean.groupby(['Drug Regimen']).median()['Tumor Volume (mm3)']
median_df = median.to_frame()
median_df.rename(columns = {'Tumor Volume (mm3)':'Median'}, inplace = True)


variance = clean.groupby(['Drug Regimen']).var()['Tumor Volume (mm3)']
variance_df = variance.to_frame()
variance_df.rename(columns = {'Tumor Volume (mm3)':'Variance'}, inplace = True)

standard_deviation = clean.groupby(['Drug Regimen']).std()['Tumor Volume (mm3)']
standard_deviation_df = standard_deviation.to_frame()
standard_deviation_df.rename(columns = {'Tumor Volume (mm3)':'Standard Deviation'}, inplace = True)

SEM = clean.groupby(['Drug Regimen']).sem()['Tumor Volume (mm3)']
SEM_df = SEM.to_frame()
SEM_df.rename(columns = {'Tumor Volume (mm3)':'SEM'}, inplace = True)

# Merging DataFrames
merge1 = pd.merge(mean_df, median_df, on="Drug Regimen")
merge2 = pd.merge(merge1, variance_df, on="Drug Regimen")
merge3 = pd.merge(merge2, standard_deviation_df, on="Drug Regimen")
summary_table = pd.merge(merge3, SEM_df, on="Drug Regimen")
summary_table.head()

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466


In [138]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line

clean.groupby('Drug Regimen').aggregate(['mean', 'median', 'var', 'std', 'sem'])['Tumor Volume (mm3)']

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [139]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.

# Grouping the data by DataFrame by Drug Regimen and calculating the total number of mice using the count function
# Creating a DataFrame using the results
# Using DataFrame.plot() in order to create a bar chart of the data
# Setting a title for the chart
# Setting labels
mice_drug_regimen = clean.groupby("Drug Regimen").count()["Mouse_ID"]
mice_drug_regimen.plot(kind="bar", figsize = (10,5))
plt.title("Total number of Mice per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.plt.show()

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Number of Mice')

In [140]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.

# Renaming the Drug Regimen column in order to use it later in the "unique" function
# Getting the unique values in the Drug Regimen column
# Converting the data into a list
# Printing the list
clean.rename(columns = {'Drug Regimen':'Drug_Regimen'}, inplace = True)
drug_regimen = clean.Drug_Regimen.unique()
drug_regimen_list = drug_regimen.tolist()
drug_regimen_list

mice_drug_regimen_list = mice_drug_regimen.tolist()
mice_drug_regimen_list
fig1, ax1 = plt.subplots(figsize=(10, 5))
x_axis = np.arange(len(mice_drug_regimen))
plt.bar(x_axis, mice_drug_regimen_list, color='b', alpha=0.8, align='center')
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, drug_regimen_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


<IPython.core.display.Javascript object>

([<matplotlib.axis.XTick at 0x2ed005fcd60>,
  <matplotlib.axis.XTick at 0x2ed005fcd30>,
  <matplotlib.axis.XTick at 0x2ed005d1940>,
  <matplotlib.axis.XTick at 0x2ed01da1f70>,
  <matplotlib.axis.XTick at 0x2ed00fc44c0>,
  <matplotlib.axis.XTick at 0x2ed00fc49d0>,
  <matplotlib.axis.XTick at 0x2ed00fc4ee0>,
  <matplotlib.axis.XTick at 0x2ed00fca430>,
  <matplotlib.axis.XTick at 0x2ed00fc41f0>,
  <matplotlib.axis.XTick at 0x2ed00fca5e0>],
 [Text(0, 0, 'Ramicane'),
  Text(1, 0, 'Capomulin'),
  Text(2, 0, 'Infubinol'),
  Text(3, 0, 'Placebo'),
  Text(4, 0, 'Ceftamin'),
  Text(5, 0, 'Stelasyn'),
  Text(6, 0, 'Zoniferol'),
  Text(7, 0, 'Ketapril'),
  Text(8, 0, 'Propriva'),
  Text(9, 0, 'Naftisol')])

In [141]:
gender = clean.Sex.unique()
gender_list = gender.tolist()
gender_list

['Male', 'Female']

In [142]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Calculating the total number of mice
# Calculating the total number of mice by gender
# Calculating the percentage of the total number of mice by gender
# Putting results into a DataFrame
# Renaming the "Mouse_ID" column

total_mice = clean['Mouse_ID'].count()
mice_sex = clean.groupby("Sex").count()["Mouse_ID"]
mice_sex_per = (mice_sex / total_mice) * 100
mice_sex_per = mice_sex_per.to_frame()
mice_sex_per.rename(columns = {'Mouse_ID':'Total_Mice_Gender %'}, inplace = True)
mice_sex_per.head()

# Creating Pie chart 
colors = ['yellow','green']
explode = (0.1,0)
plot = mice_sex_per.plot.pie(y='Total_Mice_Gender %',figsize=(10,5), colors = colors, startangle=140, explode = explode, shadow = True, autopct="%1.1f%%")
plt.axis("equal")
plt.title("Distribution of Female vs Male mice")
plt.show()

<IPython.core.display.Javascript object>

In [143]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Getting the unique values in the "Sex" column
# Converting the results into a list to be used when charting
gender = clean.Sex.unique()
gender_list = gender.tolist()
gender_list


# Calculating the total number of mice
# Calculating the total number of mice by gender
# Calculating the percentage of the total number of mice by gender
# Putting results into a DataFrame
# Renaming the "Mouse_ID" column
# Converting the new column to a list to be used when charting
total_mice = clean['Mouse_ID'].count()
mice_sex = clean.groupby("Sex").count()["Mouse_ID"]
mice_sex_per = (mice_sex / total_mice) * 100
mice_sex_per2 = mice_sex_per.to_frame()
mice_sex_per2.rename(columns = {'Mouse_ID':'Total_Mice_Gender'}, inplace = True)
mice_sex_list = mice_sex_per2['Total_Mice_Gender'].tolist()

# Telling matplotlib to create a pie chart based upon the above data
colors = ['yellow','green']
explode = (0.1,0)
fig1, ax1 = plt.subplots(figsize=(10, 5))
plt.pie(mice_sex_per, explode=explode, labels = gender_list, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
plt.axis("equal")
plt.title("Distribution of Female vs Male mice")
plt.show()

<IPython.core.display.Javascript object>

## Quartiles, Outliers and Boxplots

In [144]:
clean.head()

Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [145]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
# Start by getting the last (greatest) timepoint for each mouse
# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

# Finding the greatest value in the Timepoint column using the max function

Capomulin = clean.loc[clean["Drug_Regimen"] == "Capomulin",:]
Ramicane = clean.loc[clean["Drug_Regimen"] == "Ramicane", :]
Infubinol = clean.loc[clean["Drug_Regimen"] == "Infubinol", :]
Ceftamin = clean.loc[clean["Drug_Regimen"] == "Ceftamin", :]

# Capomulin
final_Capomulin = Capomulin.groupby('Mouse_ID').max()['Timepoint']
Capomulin_volume = pd.DataFrame(final_Capomulin)
Capomulin_merged = pd.merge(Capomulin_volume, clean, on=("Mouse_ID","Timepoint"),how="left")
Capomulin_merged.head()

Unnamed: 0,Mouse_ID,Timepoint,Drug_Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,b128,45,Capomulin,Female,9,22,38.982878,2
1,b742,45,Capomulin,Male,7,21,38.939633,0
2,f966,20,Capomulin,Male,16,17,30.485985,0
3,g288,45,Capomulin,Male,3,19,37.074024,1
4,g316,45,Capomulin,Female,22,22,40.15922,2


In [146]:
# Capomulin quartiles and IQR
# Determining outliers using upper and lower bounds
Capomulin_tum = Capomulin_merged["Tumor Volume (mm3)"]

quartiles = Capomulin_tum.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


print(f"The lower quartile of Capomulin tumors {lowerq}")
print(f"The upper quartile of Capomulin tumors {upperq}")
print(f"The interquartile range of Capomulin tumors {iqr}")
print(f"The median of Capomulin tumors: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

The lower quartile of Capomulin tumors 32.37735684
The upper quartile of Capomulin tumors 40.1592203
The interquartile range of Capomulin tumors 7.781863460000004
The median of Capomulin tumors: 38.125164399999996 
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [147]:
# Ramicane
final_Ramicane = Ramicane.groupby('Mouse_ID').max()['Timepoint']
Ramicane_volume = pd.DataFrame(final_Ramicane)
Ramicane_merged = pd.merge(Ramicane_volume, clean, on=("Mouse_ID","Timepoint"),how="left")
Ramicane_merged.head()

Unnamed: 0,Mouse_ID,Timepoint,Drug_Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,a411,45,Ramicane,Male,3,22,38.407618,1
1,a444,45,Ramicane,Female,10,25,43.047543,0
2,a520,45,Ramicane,Male,13,21,38.810366,1
3,a644,45,Ramicane,Female,7,17,32.978522,1
4,c458,30,Ramicane,Female,23,20,38.342008,2


In [148]:
# Ramicane quartiles and IQR
# Determining outliers using upper and lower bounds
Ramicane_tum = Ramicane_merged["Tumor Volume (mm3)"]

quartiles = Ramicane_tum.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


print(f"The lower quartile of Ramicane tumors is: {lowerq}")
print(f"The upper quartile of Ramicane tumors is: {upperq}")
print(f"The interquartile range of Ramicane tumors is: {iqr}")
print(f"The median of Ramicane tumors is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

The lower quartile of Ramicane tumors is: 31.56046955
The upper quartile of Ramicane tumors is: 40.65900627
The interquartile range of Ramicane tumors is: 9.098536719999998
The median of Ramicane tumors is: 36.56165229 
Values below 17.912664470000003 could be outliers.
Values above 54.30681135 could be outliers.


In [149]:
# Infubinol quartiles and IQR
# Determining outliers using upper and lower bounds
final_Infubinol = Infubinol.groupby('Mouse_ID').max()['Timepoint']
Infubinol_volume = pd.DataFrame(final_Infubinol)
Infubinol_merged = pd.merge(Infubinol_volume, clean, on=("Mouse_ID","Timepoint"),how="left")
Infubinol_merged.head()

Unnamed: 0,Mouse_ID,Timepoint,Drug_Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,a203,45,Infubinol,Female,20,23,67.973419,2
1,a251,45,Infubinol,Female,21,25,65.525743,1
2,a577,30,Infubinol,Female,6,25,57.031862,2
3,a685,45,Infubinol,Male,8,30,66.083066,3
4,c139,45,Infubinol,Male,11,28,72.226731,2


In [150]:
# Infubinol quartiles and IQR
# Determine outliers using upper and lower bounds
Infubinol_tum = Infubinol_merged["Tumor Volume (mm3)"]

quartiles = Infubinol_tum.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq


print(f"The lower quartile of Infubinol tumors is {lowerq}")
print(f"The upper quartile of Infubinol tumors is {upperq}")
print(f"The interquartile range of Infubinol tumors is {iqr}")
print(f"The median of Infubinol tumors is {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)


print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

The lower quartile of Infubinol tumors is 54.04860769
The upper quartile of Infubinol tumors is 65.52574285
The interquartile range of Infubinol tumors is 11.477135160000003
The median of Infubinol tumors is 60.16518046 
Values below 36.83290494999999 could be outliers.
Values above 82.74144559000001 could be outliers.


In [151]:
# Ceftamin
final_Ceftamin = Ceftamin.groupby('Mouse_ID').max()['Timepoint']
Ceftamin_volume = pd.DataFrame(final_Ceftamin)
Ceftamin_merged = pd.merge(Ceftamin_volume, clean, on=("Mouse_ID","Timepoint"),how="left")
Ceftamin_merged.head()

Unnamed: 0,Mouse_ID,Timepoint,Drug_Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,a275,45,Ceftamin,Female,20,28,62.999356,3
1,b447,0,Ceftamin,Male,2,30,45.0,0
2,b487,25,Ceftamin,Female,6,28,56.057749,1
3,b759,30,Ceftamin,Female,12,25,55.742829,1
4,f436,15,Ceftamin,Female,3,25,48.722078,2


In [152]:
# Ceftamin quartiles and IQR
# Determining outliers using upper and lower bounds

Ceftamin_tum = Ceftamin_merged["Tumor Volume (mm3)"]

quartiles = Ceftamin_tum.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of treatment is {lowerq}")
print(f"The upper quartile of temperatures is {upperq}")
print(f"The interquartile range of temperatures is {iqr}")
print(f"The the median of temperatures is {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

The lower quartile of treatment is 48.72207785
The upper quartile of temperatures is 64.29983003
The interquartile range of temperatures is 15.577752179999997
The the median of temperatures is 59.85195552 
Values below 25.355449580000002 could be outliers.
Values above 87.66645829999999 could be outliers.


In [153]:
# Generating a box plot of the final tumor volume of each mouse across four regimens of interest
box_plot = [Capomulin_tum, Ramicane_tum, Infubinol_tum, Ceftamin_tum]
Regimen= ['Capomulin', 'Ramicane', 'Infubinol','Ceftamin']

fig1, ax1 = plt.subplots(figsize=(15, 10))
ax1.set_title('Final Tumor Volume',fontsize = 25)
ax1.set_ylabel('Final Tumor Volume (mm3)',fontsize = 14)
ax1.set_xlabel('Drug_Regimen',fontsize = 14)
ax1.boxplot(box_plot, labels=Regimen, widths = 0.4, patch_artist=True,vert=True)

plt.ylim(10, 80)

plt.show()

<IPython.core.display.Javascript object>

In [154]:
# Saving image
plt.savefig("Images/box_plot.png", bbox_inches = "tight")

## Line and Scatter Plots

In [155]:
# Identifying a Capomulin "Mouse_ID"
Capomulin['Mouse_ID']

10      s185
11      s185
12      s185
13      s185
14      s185
        ... 
440     i557
1452    r157
1453    r157
1454    r157
1455    r157
Name: Mouse_ID, Length: 230, dtype: object

In [156]:
# Generating a line plot of tumor volume vs. time point for a mouse treated with Capomulin
line_df = Capomulin.loc[Capomulin["Mouse_ID"] == "s185",:]
line_df.head()

Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
10,s185,Capomulin,Female,3,17,0,45.0,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0


In [157]:
# Plotting
x_axis = line_df["Timepoint"]
tumor_size = line_df["Tumor Volume (mm3)"]

fig1, ax1 = plt.subplots(figsize=(15, 10))
plt.title('Capomulin treatment of mouse s185',fontsize =25)
plt.plot(x_axis, tumor_size,linewidth=2, markersize=15,marker="o",color="blue", label="Fahreneit")
plt.xlabel('Timepoint (Days)',fontsize =14)
plt.ylabel('Tumor Volume (mm3)',fontsize =14)

plt.show()

<IPython.core.display.Javascript object>

In [158]:
# Saving image
plt.savefig("Images/line_graph.png", bbox_inches = "tight")

In [159]:
# Generating a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

fig1, ax1 = plt.subplots(figsize=(15, 10))
average_vol_capomulin = Capomulin.groupby(['Mouse_ID']).mean()

marker_size=15
plt.scatter(average_vol_capomulin['Weight (g)'],average_vol_capomulin['Tumor Volume (mm3)'],s=175, color="blue")
plt.title('Mouse Weight vs. Average Tumor Volume',fontsize =25)
plt.xlabel('Weight (g)',fontsize =14)
plt.ylabel('Averag Tumor Volume (mm3)',fontsize =14)

plt.show()

<IPython.core.display.Javascript object>

In [160]:
plt.savefig("Images/scatterplot.png", bbox_inches = "tight")

## Correlation and Regression

In [161]:
# Calculatng the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

correlation_coefficient = round(st.pearsonr(average_vol_capomulin['Weight (g)'],average_vol_capomulin['Tumor Volume (mm3)'])[0],2)
print(f"The correlation between mouse weight and average tumor volume is {correlation_coefficient}")

The correlation between mouse weight and average tumor volume is 0.84


In [162]:
# Plotting the linear regression model on top of the previous scatter plot.

x_values = average_vol_capomulin['Weight (g)']
y_values = average_vol_capomulin['Tumor Volume (mm3)']

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regression_values = x_values * slope + intercept

print(f"Slope:{slope}")
print(f"Intercept:{intercept}")
print(f"rvalue (Correlation coefficient) is {rvalue}")
print(f"pandas (Correlation coefficient)is {correlation_coefficient}")
print(f"stderr is {stderr}")

line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

print(line_eq)

Slope:0.9544396890241045
Intercept:21.552160532685015
rvalue (Correlation coefficient) is 0.8419363424694718
pandas (Correlation coefficient)is 0.84
stderr is 0.12754359033201337
y = 0.95x + 21.55


In [163]:
# Adding the linear regression equation and line to plot

fig1, ax1 = plt.subplots(figsize= (15, 10))
plt.scatter(x_values, y_values, s = 175, color="blue")
plt.plot(x_values, regression_values, "r-")
plt.title('Regression Plot of Mouse Weight vs. Average Tumor Volume',fontsize =20)
plt.xlabel('Weight(g)',fontsize =14)
plt.ylabel('Average Tumor Volume (mm3)',fontsize =14)
ax1.annotate(line_eq, xy= (20, 40), xycoords ='data', xytext = (0.8, 0.95), textcoords = 'axes fraction', horizontalalignment = 'right', verticalalignment = 'top',fontsize = 30, color = "red")

print(f"The r-squared is: {rvalue**2}")

plt.show()

<IPython.core.display.Javascript object>

The r-squared is: 0.7088568047708717


In [164]:
# Saving image
plt.savefig("Images/linear_regression.png", bbox_inches = "tight")