## Observations and Insights

## Dependencies and starter code

In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from functools import reduce
# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset

mouse_study_df = pd.DataFrame.merge(mouse_metadata,study_results,how = 'right',on = 'Mouse ID',)
mouse_study_df.head(3)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1


In [2]:
#make columns workable
mouse_study_df.columns = mouse_study_df.columns.str.strip().str.replace('(', '').str.replace(')', '')
mouse_study_df.head(3)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight g,Timepoint,Tumor Volume mm3,Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1


## Summary statistics

In [18]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen

tumor_avg=mouse_study_df.groupby('Drug Regimen').mean()
mean=tumor_avg['Tumor Volume mm3']
mean_df=pd.DataFrame(mean)
mean_df.rename(columns={'Tumor Volume mm3':'Tumor Volume Mean'},inplace=True)
mean_df.head(1)

Unnamed: 0_level_0,Tumor Volume Mean
Drug Regimen,Unnamed: 1_level_1
Capomulin,40.675741


In [15]:
tumor_median=mouse_study_df.groupby('Drug Regimen').median()
median=tumor_median['Tumor Volume mm3']
median_df=pd.DataFrame(median)
median_df.rename(columns={'Tumor Volume mm3':'Tumor Volume Median'},inplace=True)
median_df.head(1)

Unnamed: 0_level_0,Tumor Volume Median
Drug Regimen,Unnamed: 1_level_1
Capomulin,41.557809


In [19]:
tumor_std=mouse_study_df.groupby('Drug Regimen').std()
std=tumor_std["Tumor Volume mm3"]
std_df=pd.DataFrame(std)
std_df.rename(columns={'Tumor Volume mm3':'Tumor Volume STD'},inplace=True)
std_df.head(1)

Unnamed: 0_level_0,Tumor Volume STD
Drug Regimen,Unnamed: 1_level_1
Capomulin,4.994774


In [20]:
tumor_var=mouse_study_df.groupby('Drug Regimen').var()
var=tumor_var["Tumor Volume mm3"]
var_df=pd.DataFrame(var)
var_df.rename(columns={'Tumor Volume mm3':'Tumor Volume Variance'},inplace=True)
var_df.head(1)

Unnamed: 0_level_0,Tumor Volume Variance
Drug Regimen,Unnamed: 1_level_1
Capomulin,24.947764


In [21]:
tumor_sem=mouse_study_df.groupby('Drug Regimen').sem()
sem=tumor_sem['Tumor Volume mm3']
sem_df=pd.DataFrame(sem)
sem_df.rename(columns={'Tumor Volume mm3':'Tumor Volume SEM'},inplace=True)
sem_df.head(1)

Unnamed: 0_level_0,Tumor Volume SEM
Drug Regimen,Unnamed: 1_level_1
Capomulin,0.329346


In [22]:
# Generate a summary statistics table of mean, median, variance, standard deviation, 
# and SEM of the tumor volume for each regimen
data_frames=[mean_df,median_df,std_df,var_df,sem_df]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Drug Regimen'],
                                            how='outer'), data_frames)
df_merged

Unnamed: 0_level_0,Tumor Volume Mean,Tumor Volume Median,Tumor Volume STD,Tumor Volume Variance,Tumor Volume SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,4.994774,24.947764,0.329346
Ceftamin,52.591172,51.776157,6.268188,39.290177,0.469821
Infubinol,52.884795,51.820584,6.567243,43.128684,0.492236
Ketapril,55.235638,53.698743,8.279709,68.553577,0.60386
Naftisol,54.331565,52.509285,8.134708,66.173479,0.596466
Placebo,54.033581,52.288934,7.821003,61.168083,0.581331
Propriva,52.322552,50.854632,6.50777,42.35107,0.512884
Ramicane,40.216745,40.673236,4.846308,23.486704,0.320955
Stelasyn,54.233149,52.431737,7.710419,59.450562,0.573111
Zoniferol,53.236507,51.818479,6.966589,48.533355,0.516398


## Bar plots

In [54]:
data_points=mouse_study_df.groupby(["Drug Regimen"])["Drug Regimen"].count().reset_index(name="count")

In [56]:
data_points.plot

AttributeError: module 'pandas' has no attribute 'plot'

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas
plt.bar(x_axis, users, color='r', alpha=0.5, align="center")

# Tell matplotlib where we would like to place each of our x axis headers
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, ["Java", "C++", "Python", "Ruby", "Clojure"])

# Sets the x limits of the current chart
plt.xlim(-0.75, len(x_axis)-0.25)

# Sets the y limits of the current chart
plt.ylim(0, max(users)+5000)

# Give our chart some labels and a tile
plt.title("Popularity of Programming Languages")
plt.xlabel("Programming Language")
plt.ylabel("Number of People Using Programming Languages")

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
plt.bar(x_axis, users, color='r', alpha=0.5, align="center")

# Tell matplotlib where we would like to place each of our x axis headers
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, ["Java", "C++", "Python", "Ruby", "Clojure"])

# Sets the x limits of the current chart
plt.xlim(-0.75, len(x_axis)-0.25)

# Sets the y limits of the current chart
plt.ylim(0, max(users)+5000)

# Give our chart some labels and a tile
plt.title("Popularity of Programming Languages")
plt.xlabel("Programming Language")
plt.ylabel("Number of People Using Programming Languages")

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen