## Observations and Insights 

In [12]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files
mouse_metadata_path = "../Resources/Mouse_metadata.csv"
study_results_path = "../Resources/Study_results.csv"

# Load the mouse metadata and the study results with the same explicit encoding
mouse_metadata, study_results = (
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared "Mouse ID" column into a single dataset
merge_df = mouse_metadata.merge(study_results, on="Mouse ID")
merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [18]:
# Number of distinct mice: size of the array of unique "Mouse ID" values.
merge_df["Mouse ID"].unique().size

249

In [17]:
# Preview the "Mouse ID" column of the merged data
merge_df.loc[:, "Mouse ID"]

0       k403
1       k403
2       k403
3       k403
4       k403
        ... 
1888    z969
1889    z969
1890    z969
1891    z969
1892    z969
Name: Mouse ID, Length: 1893, dtype: object

In [26]:
# Keep only the two columns needed for the duplicate check
narrow_mice = merge_df[["Mouse ID", "Timepoint"]]
narrow_mice

Unnamed: 0,Mouse ID,Timepoint
0,k403,0
1,k403,5
2,k403,10
3,k403,15
4,k403,20
...,...,...
1888,z969,25
1889,z969,30
1890,z969,35
1891,z969,40


In [37]:
# Rows whose "Mouse ID" has already appeared earlier in narrow_mice.
# Only "Mouse ID" is compared here (Timepoint is not part of this check), so
# every row after a mouse's first one is kept.
duplicate_mice = narrow_mice.loc[narrow_mice.duplicated(subset=["Mouse ID"])]

# How many such repeated rows there are
duplicate_mice["Mouse ID"].count()

1644

In [39]:
# Display the data table for preview: the rows of narrow_mice flagged as
# repeats of an earlier "Mouse ID" (only the Mouse ID and Timepoint columns).
duplicate_mice

Unnamed: 0,Mouse ID,Timepoint
1,k403,5
2,k403,10
3,k403,15
4,k403,20
5,k403,25
...,...,...
1888,z969,25
1889,z969,30
1890,z969,35
1891,z969,40


In [61]:
# Print duplicate_mice as plain text; the printed form ends with a
# "[rows x columns]" shape summary.
print(duplicate_mice)

     Mouse ID  Timepoint
1        k403          5
2        k403         10
3        k403         15
4        k403         20
5        k403         25
...       ...        ...
1888     z969         25
1889     z969         30
1890     z969         35
1891     z969         40
1892     z969         45

[1644 rows x 2 columns]


In [81]:
# Optional: Get all the data for the duplicate mouse ID. 

In [82]:
# Find the duplicate mouse: any Mouse ID that has more than one row recorded
# for the same Timepoint.
#
# The check runs on the full merged data rather than on `duplicate_mice`:
# `duplicate_mice` no longer contains each mouse's first row, so a duplicated
# pair that includes that first row would go unnoticed there.
duplicated_rows = merge_df.duplicated(subset=["Mouse ID", "Timepoint"])

# .unique() collapses the flagged rows to one entry per offending mouse
problem_mouse = merge_df.loc[duplicated_rows, "Mouse ID"].unique()
problem_mouse

array(['g989'], dtype=object)

In [93]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#
# Rows are removed with a boolean mask because DataFrame.drop works on index
# labels, not on values stored in the "Mouse ID" column. The filter starts from
# the full merged data so every column (regimen, tumor volume, ...) is kept,
# and no in-place call is assigned (in-place methods return None).
is_problem_mouse = merge_df["Mouse ID"].isin(problem_mouse)

# Drop all rows with missing information as part of the same step
clean_df = merge_df.loc[~is_problem_mouse].dropna(how="any")

clean_df.head()

In [None]:
# Scratch notes on removing rows by condition (reference only; nothing in this
# cell executes, so it cannot raise or overwrite clean_df).
#
# Show every row whose ID is repeated, grouped together:
#   duplicate_mice[duplicate_mice["Mouse ID"].duplicated(keep=False)].sort_values("Mouse ID")
#   df[df.ID.duplicated(keep=False)].sort_values("ID")
#
# Get names of indexes for which column Age has value 30,
# then delete these row indexes from the DataFrame:
#   indexNames = dfObj[dfObj['Age'] == 30].index
#   dfObj.drop(indexNames, inplace=True)
#
# Same idea, returning a new frame instead of mutating:
#   df = df.drop(df[df.score < 50].index)
#
# NOTE: drop(..., inplace=True) returns None, so its result must never be
# assigned (e.g. `clean_df = frame.drop(..., inplace=True)` sets clean_df to None).

In [None]:
# Checking the number of mice in the clean DataFrame.
#df.count()


# Verify clean up
#num_players = clean_df.count()[0]
#num_players
#clean_mice = clean_df.count()[0]
#clean_mice

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.



In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
#
# The names must match the "Drug Regimen" values exactly (case-sensitive), so
# they are listed once and matched with isin() instead of repeated comparisons.
regimens_of_interest = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
best_treatments = clean_df.loc[clean_df["Drug Regimen"].isin(regimens_of_interest)]

# Start by getting the last (greatest) timepoint for each mouse
mouse_group = best_treatments.groupby("Mouse ID")
mouse_df = pd.DataFrame({
    "Last Timepoint": mouse_group["Timepoint"].max()
}).reset_index()  # turn the "Mouse ID" group index into a regular column to merge on

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# Matching on (Mouse ID, Last Timepoint) == (Mouse ID, Timepoint) keeps exactly
# one measurement row per mouse: the one taken at its final timepoint.
final_volume = pd.merge(
    mouse_df,
    clean_df,
    left_on=["Mouse ID", "Last Timepoint"],
    right_on=["Mouse ID", "Timepoint"],
    how="left",
)[["Mouse ID", "Last Timepoint", "Drug Regimen", "Tumor Volume (mm3)"]]
final_volume

In [None]:
# Put treatments into a list for the for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting);
# after the loop it holds one Series of final tumor volumes per treatment,
# in the same order as `treatments`.
tumor_vol_data = []

# One summary record per treatment, collected here and turned into a table at the end
outlier_records = []

for treatment in treatments:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    subset_df = final_volume.loc[
        final_volume["Drug Regimen"] == treatment, "Tumor Volume (mm3)"
    ]

    # add subset
    tumor_vol_data.append(subset_df)

    # Calculate the IQR for this treatment only (quartiles differ per regimen)
    quartiles = subset_df.quantile([.25, .50, .75])
    lowerq = quartiles.loc[.25]
    upperq = quartiles.loc[.75]
    iqr = upperq - lowerq

    # Determine outliers using upper and lower bounds (1.5 * IQR rule)
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    outliers = subset_df[(subset_df < lower_bound) | (subset_df > upper_bound)]

    outlier_records.append({
        "Drug Regimen": treatment,
        "Lower Quartile": lowerq,
        "Upper Quartile": upperq,
        "IQR": iqr,
        "Lower Bound": lower_bound,
        "Upper Bound": upper_bound,
        "Potential Outliers": outliers.tolist(),
    })

# Build the summary table once from the collected records
outlier_summary = pd.DataFrame(outlier_records)
outlier_summary
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
# Split the final volumes by regimen so each regimen gets its own box.
regimen_names = []
regimen_volumes = []
for regimen_name, volumes in final_volume.groupby("Drug Regimen")["Tumor Volume (mm3)"]:
    regimen_names.append(regimen_name)
    regimen_volumes.append(volumes.to_numpy())

figl, axl = plt.subplots()
axl.set_title("Final Tumor Volumes")
axl.set_xlabel("Drug Regimen")
axl.set_ylabel("Final Tumor Volume (mm3)")
axl.boxplot(regimen_volumes)
# Boxes are drawn at positions 1..N in list order, so the labels line up one-to-one
axl.set_xticklabels(regimen_names)
plt.show()
            


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
