# Pymaceuticals Inc.
---

### Analysis

- This analysis compares the effects that different cancer drugs had on tumors in mice, in terms of tumor volume and weight. The drug Capomulin was specifically highlighted to identify whether there was a positive correlation between the use of Capomulin in mice and the reduction of tumor volume and weight. The two given data sets were merged, and the data was cleaned to remove a duplicate mouse. In the data visualizations, Capomulin shows a positive correlation between the use of the drug and the reduction of tumor volume and weight. The only other drug to compare was Ketapril.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Relative paths to the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Left-join the mouse metadata onto the study results by "Mouse ID",
# so every study-results row is kept
df = study_results.merge(mouse_metadata, how="left", on="Mouse ID")

# Preview the first rows of the combined table
df.head()

In [None]:
# Print the (rows, columns) shape of the mouse metadata, then preview its first five rows
metadata_shape = mouse_metadata.shape
print(metadata_shape)
mouse_metadata.head(n=5)

In [None]:
# Print the (rows, columns) shape of the study results, then preview its first five rows
results_shape = study_results.shape
print(results_shape)
study_results.head(n=5)

In [None]:
# Print the (rows, columns) shape of the merged table, then its column/dtype summary
merged_shape = df.shape
print(merged_shape)
df.info()

In [None]:
# Count the distinct Mouse IDs in the merged data (before any cleaning).
mouse_ids = df["Mouse ID"]
mouse_ids.nunique()

In [None]:
# Each (Mouse ID, Timepoint) pair should identify exactly one row.
# Count the rows per pair and show the ten largest counts; a count above 1 marks a duplicate.
rows_per_pair = df.groupby(["Mouse ID", "Timepoint"]).size()
rows_per_pair.sort_values(ascending=False).head(10)

In [None]:
# Optional: show every row recorded for the duplicated mouse ID.
mask = df["Mouse ID"].eq("g989")
df[mask]

In [None]:
# Build the clean DataFrame: keep every row except those of the duplicated mouse,
# then renumber the index from zero.
mask = df["Mouse ID"].ne("g989")
df2 = df[mask].reset_index(drop=True)
df2.shape

In [None]:
# Count the distinct Mouse IDs remaining in the clean DataFrame.
clean_mouse_ids = df2["Mouse ID"]
clean_mouse_ids.nunique()

## Summary Statistics

In [None]:
# Summary statistics of tumor volume per drug regimen:
# mean, median, standard error of the mean, standard deviation and variance.

# Group once and reuse the grouped tumor-volume column for every statistic.
volume_by_regimen = df2.groupby(["Drug Regimen"])["Tumor Volume (mm3)"]

tumor_mean = volume_by_regimen.mean()
tumor_median = volume_by_regimen.median()
tumor_sem = volume_by_regimen.sem()
tumor_std = volume_by_regimen.std()
tumor_var = volume_by_regimen.var()

# One column per statistic; each Series is indexed by drug regimen.
data = {
    "Tumor Average": tumor_mean,
    "Tumor Median": tumor_median,
    "Tumor SEM": tumor_sem,
    "Tumor STD": tumor_std,
    "Tumor Var": tumor_var,
}

# Assemble the Series into one table and turn the regimen index into a regular column.
leaderboard = pd.DataFrame(data).reset_index()
leaderboard

In [None]:
# Alternative: the same per-regimen tumor-volume statistics
# (mean, median, variance, standard deviation, SEM) produced with a single agg() call.

# Map the column to aggregate onto the list of statistics to compute for it.
cols_agg = {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}

by_regimen = df2.groupby(["Drug Regimen"])
leaderboard2 = by_regimen.agg(cols_agg).reset_index()
leaderboard2

## Bar and Pie Charts

In [None]:
# Pandas bar plot: total number of rows (Mouse ID/Timepoints) per drug regimen,
# drawn as horizontal bars sorted by ascending count.
regimen_counts = df2["Drug Regimen"].value_counts().sort_values()
regimen_counts.plot.barh(title="Drug Regimen Counts", color="limegreen")
plt.show()

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
data = df2["Drug Regimen"].value_counts()
x = data.index.values  # regimen names
y = data.values        # number of rows recorded for each regimen

# One bar per regimen
plt.bar(x, y)

# The y axis is a count of rows, so label it as a count rather than a timepoint value.
plt.title("Drug Regimen Counts")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Observed Mouse Timepoints")
plt.xticks(rotation=90)  # draw the regimen names vertically
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
# NOTE(review): value_counts() here counts rows of df2 per sex, not distinct Mouse IDs.
data = df2["Sex"].value_counts()
colors = ['purple', 'gold']
data.plot(kind="pie", autopct='%1.1f%%', colors=colors)

# Render explicitly, as the other plotting cells do, instead of echoing the Axes repr.
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# NOTE(review): value_counts() here counts rows of df2 per sex, not distinct Mouse IDs.
data = df2["Sex"].value_counts()
colors = ['green', 'blue']
plt.pie(data, autopct='%1.1f%%', labels=data.index.values, colors=colors)
plt.ylabel("Sex")

# Render explicitly, as the other plotting cells do, instead of echoing the label's repr.
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
import seaborn as sns

In [None]:
# Box plot of every recorded tumor volume, one box per drug regimen.
sns.boxplot(data=df2, x="Drug Regimen", y="Tumor Volume (mm3)")
plt.xticks(rotation=90)  # draw the regimen names vertically

# Render explicitly, as the other plotting cells do, instead of echoing the xticks() return value.
plt.show()

In [None]:
# Violin plot of tumor volume per drug regimen, coloured by sex with split=True.
sns.violinplot(
    x="Drug Regimen",
    y="Tumor Volume (mm3)",
    hue="Sex",
    split=True,
    data=df2,
)
plt.xticks(rotation=90)  # draw the regimen names vertically
plt.show()

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint


In [None]:
# Greatest timepoint recorded for each mouse
df3 = df2.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Columns to carry into the final-volume table
sub = df2[["Mouse ID", "Timepoint", "Drug Regimen", "Tumor Volume (mm3)", "Sex"]]

# Join on (Mouse ID, Timepoint) so only the row at each mouse's last timepoint is kept
df3 = df3.merge(sub, on=["Mouse ID", "Timepoint"])

# Keep only the four regimens of interest and renumber the index
mask = df3["Drug Regimen"].isin(["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])
df3 = df3[mask].reset_index(drop=True)

df3

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
# df3 (built in an earlier cell) holds one row per mouse at its last timepoint
# for these four regimens.
for treatment in treatments:
    # Locate the rows which contain mice on this drug and get the tumor volumes
    final_volumes = df3.loc[df3["Drug Regimen"] == treatment, "Tumor Volume (mm3)"]

    # add subset
    tumor_vol_data.append(final_volumes)

    # Determine outliers using upper and lower bounds (1.5 * IQR beyond the quartiles)
    lower_q, upper_q = final_volumes.quantile([0.25, 0.75])
    iqr = upper_q - lower_q
    lower_bound = lower_q - 1.5 * iqr
    upper_bound = upper_q + 1.5 * iqr
    is_outlier = (final_volumes < lower_bound) | (final_volumes > upper_bound)
    outliers = final_volumes[is_outlier]

    print(
        f"{treatment}: IQR = {iqr:.2f}, "
        f"bounds = [{lower_bound:.2f}, {upper_bound:.2f}], "
        f"potential outliers = {outliers.tolist()}"
    )


In [None]:
# Box plot of the distribution of final tumor volume for each treatment group.
final_volume_axes = sns.boxplot(x="Drug Regimen", y="Tumor Volume (mm3)", data=df3)
final_volume_axes.set_title("Final Tumor Volume per Drug")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
mouse = "s185"

# Step 1: Select every row recorded for the chosen mouse
data = df2.loc[df2["Mouse ID"] == mouse]

# Step 2: Pull out the x (timepoint) and y (tumor volume) series
time = data["Timepoint"]
y = data["Tumor Volume (mm3)"]

# Step 3: Draw the line with a marker at each measurement
plt.plot(time, y, color="firebrick", linewidth=3, marker="o", markersize=10)

# Step 4: Label the axes, title the plot and add a light grid
plt.xlabel("Time (days)", fontsize=12, fontstyle="italic")
plt.ylabel("Tumor Volume (mm3)", fontsize=12, fontstyle="italic")
plt.title(f"Tumor Volume vs Time for {mouse}", fontsize=16, fontweight="bold")

plt.grid(color="lightgrey", linestyle="--", alpha=0.5)

# Step 5: Display graph (show must be called; the bare name does nothing)
plt.show()

In [None]:
# Scatter plot: mouse weight vs. average observed tumor volume across the Capomulin regimen.

# Rows for Capomulin-treated mice only
data = df2[df2["Drug Regimen"] == "Capomulin"]

# One row per mouse: mean weight and mean tumor volume over all of its rows
df4 = data.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean().reset_index()

x = df4["Weight (g)"]
y = df4["Tumor Volume (mm3)"]

# Canvas
plt.figure(figsize=(10, 6))

# One point per mouse
plt.scatter(x, y, s=250, facecolor="limegreen", edgecolor="purple")

# Labels, title and grid
plt.title("Weight vs Tumor for Capomulin", fontweight="bold", fontsize=16)
plt.xlabel("Weight (g)", fontsize=12, fontstyle="italic")
plt.ylabel("Tumor Volume (mm3)", fontsize=12, fontstyle="italic")
plt.grid(color="lightgrey", linestyle="--", alpha=0.5)

plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model 
# for mouse weight and average observed tumor volume for the entire Capomulin regimen


from scipy.stats import linregress
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



In [None]:
# Pairwise Pearson correlation matrix over the numeric columns of the per-mouse averages.
corrs = df4.corr(method="pearson", numeric_only=True)

corrs

In [None]:
# r-squared for weight vs. average tumor volume, read from the correlation matrix
# computed in the previous cell rather than from a hand-copied, rounded constant.
corrs.loc["Weight (g)", "Tumor Volume (mm3)"] ** 2

In [None]:
# Linear regression of average tumor volume on mouse weight using scipy's linregress.
x_values = df4["Weight (g)"]
y_values = df4["Tumor Volume (mm3)"]

# Fit the line of best fit and evaluate it at each observed weight
slope, intercept, rvalue, pvalue, stderr = linregress(x_values, y_values)
regress_values = slope * x_values + intercept

# Equation text for the annotation, coefficients rounded to two decimals
line_eq = f"y = {round(slope, 2)}x + {round(intercept, 2)}"

# Canvas
plt.figure(figsize=(10, 6))

# Observed points plus the fitted line in red
plt.scatter(x_values, y_values, s=100, facecolor="magenta", edgecolor="black")
plt.plot(x_values, regress_values, "r-")

# Labels, title and grid
plt.title("Weight vs Tumor for Capomulin", fontweight="bold", fontsize=16)
plt.xlabel("Weight (g)", fontsize=12, fontstyle="italic")
plt.ylabel("Tumor Volume (mm3)", fontsize=12, fontstyle="italic")
plt.grid(color="lightgrey", linestyle="--", alpha=0.5)

# Place the equation at data coordinates (22, 36)
plt.annotate(line_eq, (22, 36), fontsize=15, color="magenta")

print(f"The r-squared is: {rvalue**2}")
plt.show()





