## Pymaceuticals Inc.

### Drugs vs tumor

In [None]:
# Dependencies
import pandas as pd
import scipy.stats as scs
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Relative locations of the two CSV inputs read by the cells below
mouse_data_file, study_result_data_file = (
    'Resources/Mouse_metadata.csv',
    'Resources/Study_results.csv',
)

In [None]:
# Load the CSV at `mouse_data_file` into a DataFrame and preview its first rows
mouse_original_df = pd.read_csv(mouse_data_file)
mouse_original_df.head()


In [None]:
# Load the CSV at `study_result_data_file` into a DataFrame and preview its first rows
study_res_original_df = pd.read_csv(study_result_data_file)
study_res_original_df.head()

In [None]:
# Combine the study results with the mouse metadata on 'Mouse ID'.
# An inner join keeps only the mice that are present in both tables.
summary_df = pd.merge(
    study_res_original_df,
    mouse_original_df,
    how='inner',
    on='Mouse ID',
)
summary_df

In [None]:
# Rows whose (Mouse ID, Timepoint) pair repeats an earlier row
is_repeated_measurement = summary_df.duplicated(subset=['Mouse ID', 'Timepoint'])
duplicated_row_df = summary_df.loc[is_repeated_measurement]
duplicated_row_df

In [None]:
# Every mouse that has at least one repeated (Mouse ID, Timepoint) row.
# Using the full array (not just its first element) removes all affected
# mice and also works when no duplicates were found.
duplicate_mouse_ids = duplicated_row_df['Mouse ID'].unique()

# Delete all rows belonging to those mice, in place
rows_to_drop = summary_df.index[summary_df['Mouse ID'].isin(duplicate_mouse_ids)]
summary_df.drop(rows_to_drop, inplace=True)

# Number of distinct mice remaining after the clean-up
len(summary_df['Mouse ID'].unique())

#### Summary Statistics

Generate a summary statistics table consisting of the mean, median, variance, standard deviation, and SEM of the tumor volume for each drug regimen.

In [None]:
# Group all measurements by drug regimen; every statistic below is computed
# from the same grouped column, so all results share one regimen index.
regime = summary_df.groupby('Drug Regimen')
tumor_volume_by_regime = regime['Tumor Volume (mm3)']

# Mean and median tumor volume per regimen
mean_tumor_volume = tumor_volume_by_regime.mean()
median_tumor_volume = tumor_volume_by_regime.median()

# Sample variance, sample standard deviation and standard error of the mean.
# These are taken from the groupby rather than appended to plain lists in
# order of first appearance: plain lists are attached to the summary table
# by position, which would pair each value with the wrong regimen label.
variance = tumor_volume_by_regime.var()
st_dv = tumor_volume_by_regime.std()
std_err = tumor_volume_by_regime.sem()

# Regimen names in order of first appearance; reused by the bar-chart cells
drugs = summary_df['Drug Regimen'].unique()

# Create a summary df (one row per regimen, aligned on the regimen index)
base_statistics_df = pd.DataFrame({"Mean Tumor Volume": mean_tumor_volume
                                  , "Median Tumor Volume": median_tumor_volume
                                  , "Tumor Volume Variance": variance
                                  , 'Tumor Volum Std. Dev.': st_dv
                                  , "Tumor Volum Std. Err": std_err})
base_statistics_df

In [None]:
# Bar chart via pandas.plot
# Count the 'Mouse ID' entries recorded for each regimen, in `drugs` order.
# NOTE(review): this counts rows, while the y-axis label below says
# "Unique Mice" - confirm which one is intended.
mouse_per_drugs = [
    summary_df[summary_df['Drug Regimen'] == drug]['Mouse ID'].count()
    for drug in drugs
]

# One row per regimen, ordered from the largest count to the smallest
mouse_vs_drugs_df = pd.DataFrame(
    {'Drugs': drugs, 'Mouse per Drug': mouse_per_drugs},
    index=drugs,
).sort_values(by='Mouse per Drug', ascending=False)

bar_plot = mouse_vs_drugs_df.plot.bar()
bar_plot.set_xlabel('Drug Regimen')
bar_plot.set_ylabel('Number of Unique Mice Tested')
bar_plot.set_title('Mice vs Drug regimen')
plt.show()

In [None]:
# Matplotlib Bar Chart
# Same data as the pandas bar chart, drawn through the pyplot interface
regimen_names = mouse_vs_drugs_df['Drugs']
counts_per_regimen = mouse_vs_drugs_df['Mouse per Drug']

plt.bar(regimen_names, counts_per_regimen)
plt.xticks(rotation=90, horizontalalignment="center")
plt.title('Mice vs Drug regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Unique Mice Tested')
plt.show()

In [None]:
# Pandas Pie plot mouse gender
# Wedge sizes are the number of rows in summary_df for each 'Sex' value
sex_counts = summary_df['Sex'].value_counts()
pie_plot = sex_counts.plot.pie(y="Sex", autopct='%1.1f%%', startangle=0)
plt.show()

In [None]:
# Matplotlib pie chart of the 'Sex' column
# Number of rows in summary_df for each 'Sex' value
sex = summary_df['Sex'].value_counts()

# Take the labels from the counts themselves: value_counts() decides the
# wedge order, so a hard-coded label list could be attached to the wrong wedge.
pie_labels = sex.index.tolist()

plt.pie(sex, labels=pie_labels, autopct="%1.1f%%")
plt.title("Chart of Mouse Gender")
plt.show()

#### Quartiles, Outliers and Boxplots

Calculate the final tumor volume of each mouse across four of the most promising treatment regimens: Capomulin, Ramicane, Infubinol, and Ceftamin. Calculate the quartiles and IQR and quantitatively determine if there are any potential outliers across all four treatment regimens.

In [None]:
# Keep only the columns needed to work out each mouse's final tumor volume
final_volume_columns = ['Mouse ID', 'Timepoint', 'Tumor Volume (mm3)', 'Drug Regimen']
final_tumor_volume_df = summary_df[final_volume_columns]
final_tumor_volume_df.head()

In [None]:
# The four regimens analysed in this section
treatment_regimen = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Order rows by mouse and then by timepoint, then keep only the rows whose
# regimen is one of the four listed above
final_tumor_volume_df = final_tumor_volume_df.sort_values(by=['Mouse ID', 'Timepoint'])
in_selected_regimen = final_tumor_volume_df['Drug Regimen'].isin(treatment_regimen)
final_tumor_volume_df = final_tumor_volume_df[in_selected_regimen]
final_tumor_volume_df.head(20)

In [None]:
## Find each mouse's tumor volume at its last recorded timepoint

# Greatest timepoint per mouse. reset_index() must be assigned (it does not
# work in place) so that 'Mouse ID' becomes a regular column and the merge
# below matches column to column.
max_timepoint_df = (
    final_tumor_volume_df.groupby('Mouse ID')['Timepoint']
    .max()
    .reset_index()
)

# Keep only the measurement rows taken at each mouse's last timepoint
max_tumor_volume_df = final_tumor_volume_df.merge(max_timepoint_df, on=['Mouse ID', 'Timepoint'])
max_tumor_volume_df

In [None]:
# Find the IQR for each treatment and report values outside the 1.5 * IQR fences
for drug in treatment_regimen:
    # Tumor volumes from max_tumor_volume_df for this regimen only
    tumor_volume_by_drug = max_tumor_volume_df.loc[
        max_tumor_volume_df['Drug Regimen'] == drug, 'Tumor Volume (mm3)'
    ]

    # First and third quartiles and the interquartile range
    q1 = tumor_volume_by_drug.quantile(0.25)
    q3 = tumor_volume_by_drug.quantile(0.75)
    iqr = q3 - q1

    # Fences: anything below q1_bound or above q3_bound is flagged
    q1_bound = q1 - 1.5 * iqr
    q3_bound = q3 + 1.5 * iqr

    # Boolean mask instead of a manual append loop; tolist() yields plain
    # Python floats so the printed list is easy to read
    is_outlier = (tumor_volume_by_drug < q1_bound) | (tumor_volume_by_drug > q3_bound)
    potential_outliers = tumor_volume_by_drug[is_outlier].tolist()

    print(f"{drug}'s potential outliers: {potential_outliers}")
 
    

In [None]:
# Box plot of the tumor volumes in max_tumor_volume_df, one box per regimen.
# Outlier markers are drawn as circles filled in red.
flierprops = {'marker': 'o', 'markerfacecolor': 'red'}
max_tumor_volume_df.boxplot(
    column=['Tumor Volume (mm3)'],
    by='Drug Regimen',
    flierprops=flierprops,
)
plt.title('')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

In [None]:
# Select a mouse that was treated with Capomulin; its tumor volume over time
# is drawn as a line plot in the next cell.

# All Capomulin measurements, restricted to the columns needed for plotting
capomulin_columns = ['Mouse ID', 'Drug Regimen', 'Timepoint', 'Tumor Volume (mm3)']
is_capomulin = summary_df['Drug Regimen'] == 'Capomulin'
treated_mouses = summary_df.loc[is_capomulin, capomulin_columns]

# Measurements for the single mouse 'l509'
is_selected_mouse = treated_mouses['Mouse ID'] == 'l509'
treated_mice = treated_mouses[is_selected_mouse]
treated_mice


In [None]:
# Line plot of tumor volume against timepoint for the selected Capomulin mouse
plt.plot(treated_mice['Timepoint'], treated_mice['Tumor Volume (mm3)'])
plt.title('Capomulin treatment of mice l509')
plt.xlabel('Timepoint')
# Axis label spelled to match the 'Tumor Volume (mm3)' column it shows
plt.ylabel('Tumor Volume (mm3)')
# Render explicitly, as every other plotting cell does
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the
# Capomulin treatment regimen.

# Work from summary_df directly: it already carries 'Weight (g)' on every row,
# so no merge is needed. This avoids a many-to-many join on 'Mouse ID' and
# keeps the cell safe to re-run, because no upstream frame is overwritten.
capomulin_df = summary_df[summary_df['Drug Regimen'] == 'Capomulin']

# Per-mouse mean of weight and tumor volume, both indexed by 'Mouse ID'
per_mouse_mean = capomulin_df.groupby('Mouse ID')[['Weight (g)', 'Tumor Volume (mm3)']].mean()
weight = per_mouse_mean['Weight (g)']
avg_tumor_volume = per_mouse_mean['Tumor Volume (mm3)']

plt.scatter(weight, avg_tumor_volume)
plt.title('Mouse Weight vs Tumor Volume')
plt.xlabel('Weight (g)')
plt.ylabel('Avg Tumor Volume (mm3)')
plt.show()



In [None]:
# Correlation coefficient between mouse weight and average tumor volume for
# the Capomulin treatment. The linear regression model and its plot follow
# in the next cell.

# Pearson correlation; the first element of the result is the coefficient
correlation = scs.pearsonr(weight, avg_tumor_volume)
pearson_coefficient = correlation[0]
print(f'The correlation between mouse weight and the average tumor volume is {pearson_coefficient}')


In [None]:
# Linear Regression
# Fit avg_tumor_volume as a linear function of weight
mt_slope, mt_int, mt_r, mt_p, mt_std_err = scs.linregress(weight, avg_tumor_volume)

# Fitted tumor volume for each observed weight
mt_fit = mt_slope * weight + mt_int

# Scatter of the observations with the fitted line drawn on top in red
plt.scatter(weight, avg_tumor_volume)
plt.plot(weight, mt_fit, "-", color='red')
plt.title('Linear Regression: Mouse Weight vs Tumor Volume')
plt.xlabel('Weight (g)')
plt.ylabel('Avg Tumor Volume (mm3)')
# A single show() is enough; a second call has no figure left to render
plt.show()