# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Relative paths to the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the study results to the mouse metadata on the shared "Mouse ID" key
# (default inner join; the study_results columns come first).
merged_data = study_results.merge(mouse_metadata, on="Mouse ID")

# Preview the first rows of the combined table
merged_data.head()


In [None]:
# Size of the merged table. Note that this is a row count (one row per
# Mouse ID/Timepoint observation), not a count of distinct mice.
total_mice_count = len(merged_data)
print(f"Total rows (data points): {total_mice_count}")

In [None]:
# Each row is expected to be uniquely identified by (Mouse ID, Timepoint).
unique_mice_count = merged_data["Mouse ID"].nunique()
print(f"Number of unique mice: {unique_mice_count}")

# IDs of mice that have a repeated (Mouse ID, Timepoint) pair. Only the
# repeats (second and later occurrences) are needed to collect the IDs.
id_timepoint_key = ["Mouse ID", "Timepoint"]
is_repeat = merged_data.duplicated(subset=id_timepoint_key)
duplicate_mouse_ids = merged_data.loc[is_repeat, "Mouse ID"].unique()

# Show every row involved in a duplicated pair (keep=False marks all occurrences).
in_duplicated_pair = merged_data.duplicated(subset=id_timepoint_key, keep=False)
duplicate_rows = merged_data[in_duplicated_pair]
duplicate_rows



In [4]:
# Build the cleaned table: remove every row belonging to a mouse whose ID
# was flagged as duplicated; if nothing was flagged, work on a copy instead.
no_duplicates_found = duplicate_mouse_ids.size == 0
if no_duplicates_found:
    cleaned_data = merged_data.copy()
else:
    belongs_to_duplicate_mouse = merged_data["Mouse ID"].isin(duplicate_mouse_ids)
    cleaned_data = merged_data[~belongs_to_duplicate_mouse]




In [None]:
# Count the distinct mouse IDs present in the cleaned DataFrame.
remaining_mouse_ids = cleaned_data["Mouse ID"]
updated_unique_mice_count = remaining_mouse_ids.nunique()
print(f"\nNumber of unique mice after cleaning: {updated_unique_mice_count}")

## Summary Statistics

In [None]:
# Summary statistics (mean, median, variance, standard deviation, SEM)
# of the tumor volume for each drug regimen.

# Group once by regimen, then select the tumor volume column a single time
# so every statistic is computed from the same grouped series.
grouped_data = cleaned_data.groupby("Drug Regimen")
tumor_volume_by_regimen = grouped_data["Tumor Volume (mm3)"]

# Assemble the per-regimen series into one summary DataFrame
# (one row per regimen, one column per statistic).
summary_stats = pd.DataFrame(
    {
        "Mean Tumor Volume": tumor_volume_by_regimen.mean(),
        "Median Tumor Volume": tumor_volume_by_regimen.median(),
        "Tumor Volume Variance": tumor_volume_by_regimen.var(),
        "Tumor Volume Std Dev": tumor_volume_by_regimen.std(),
        "Tumor Volume SEM": tumor_volume_by_regimen.sem(),
    }
)

print("\nSummary Statistics Table:")
summary_stats


In [7]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method, produce the same summary statistics in a single statement:
# each name in the list is applied to the tumor volume of every regimen and
# becomes one column of the resulting DataFrame.
summary_stats_agg = cleaned_data.groupby("Drug Regimen")["Tumor Volume (mm3)"].agg(
    ["mean", "median", "var", "std", "sem"]
)
summary_stats_agg


## Bar and Pie Charts

In [None]:
# Bar chart (pandas plotting) of the number of rows (Mouse ID/Timepoints) per drug regimen.

# Tally the observations recorded for each regimen.
rows_per_regimen = cleaned_data["Drug Regimen"].value_counts()

# Draw the bars through the pandas plot accessor and style the axes it returns.
regimen_ax = rows_per_regimen.plot.bar(color="skyblue", figsize=(5, 3))
regimen_ax.set_title("Total Number of Rows (Mouse ID/Timepoints) per Drug Regimen", fontsize=10)
regimen_ax.set_xlabel("Drug Regimen", fontsize=10)
regimen_ax.set_ylabel("Total Rows (Mouse ID/Timepoints)", fontsize=10)
plt.xticks(rotation=45, fontsize=8)
regimen_ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()
plt.show()




In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
# Uses the per-regimen counts in rows_per_regimen computed for the pandas bar chart.

plt.figure(figsize=(5, 3))  # Set figure size
plt.bar(rows_per_regimen.index, rows_per_regimen.values, color="skyblue")
plt.title("Total Number of Rows (Mouse ID/Timepoints) per Drug Regimen", fontsize=10)
plt.xlabel("Drug Regimen", fontsize=12)
plt.ylabel("Total Rows (Mouse ID/Timepoints)", fontsize=10)
plt.xticks(rotation=45, fontsize=12)  # Rotate x-axis labels
plt.grid(axis="y", linestyle="--", alpha=0.7)  # Add gridlines for better readability
plt.tight_layout()  # Adjust layout for better appearance
# Render the figure explicitly, as every other plotting cell does, so the
# chart is also shown when the code runs outside an inline notebook backend.
plt.show()



In [None]:
# Pie chart (pandas plotting) of the distribution of unique female versus male mice.

# Keep one row per mouse so each animal is counted once, then tally by sex.
unique_mice_gender = cleaned_data[["Mouse ID", "Sex"]].drop_duplicates(subset="Mouse ID")
gender_counts = unique_mice_gender["Sex"].value_counts()

# Draw the pie through the pandas plot accessor; colors are assigned in the
# order of gender_counts (largest count first).
sex_pie_ax = gender_counts.plot.pie(
    autopct="%1.1f%%",
    startangle=90,
    colors=["lightblue", "lightpink"],
    figsize=(3, 3),
)

# Title the chart and blank out the y-axis label pandas adds by default.
sex_pie_ax.set_title("Distribution of Unique Female vs. Male Mice in the Study", fontsize=16)
sex_pie_ax.set_ylabel("")
plt.tight_layout()
plt.show()


In [None]:
# Pie chart (pyplot) of the distribution of unique female versus male mice.
# Reuses gender_counts, the per-sex tally of unique mice built for the pandas pie chart.

# Create the figure and its single axes, then draw the slices on it.
sex_pie_fig, sex_pie_axes = plt.subplots(figsize=(3, 3))
sex_pie_axes.pie(
    gender_counts,
    labels=gender_counts.index,         # slice labels taken from the tally's index
    autopct="%1.1f%%",                  # percentage text on each slice
    startangle=90,                      # start the first slice at the top
    colors=["lightblue", "lightpink"],  # one color per slice, in tally order
)

# Title, layout and display
sex_pie_axes.set_title("Distribution of Unique Female vs. Male Mice in the Study", fontsize=16)
sex_pie_fig.tight_layout()
plt.show()




## Quartiles, Outliers and Boxplots

In [None]:
# Final tumor volume of each mouse for four treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin.

# Greatest timepoint recorded for every mouse, as a two-column DataFrame.
final_timepoints = cleaned_data.groupby("Mouse ID", as_index=False)["Timepoint"].max()

# Left-join back onto the cleaned data so each (mouse, last timepoint) pair
# picks up its full row, including the tumor volume at that timepoint.
last_observations = pd.merge(
    final_timepoints, cleaned_data, on=["Mouse ID", "Timepoint"], how="left"
)

# Keep only the four regimens of interest.
selected_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
on_selected_regimen = last_observations["Drug Regimen"].isin(selected_regimens)
final_tumor_data = last_observations[on_selected_regimen]

# Preview the result
final_tumor_data.head()



In [13]:
# Collect the final tumor volumes per treatment (for plotting) and flag
# potential outliers per treatment using the 1.5 * IQR rule.
# The treatments come from selected_regimens, defined together with
# final_tumor_data in the preceding cell.

# One Series of final tumor volumes per regimen, in selected_regimens order
tumor_volumes_list = []

for drug in selected_regimens:
    # Final tumor volumes of the mice treated with this regimen
    tumor_volumes = final_tumor_data.loc[
        final_tumor_data["Drug Regimen"] == drug, "Tumor Volume (mm3)"
    ]
    tumor_volumes_list.append(tumor_volumes)

    # Quartiles and interquartile range for this regimen
    quartiles = tumor_volumes.quantile([0.25, 0.5, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Bounds beyond which a value is considered a potential outlier.
    # These must be computed inside the loop so that every regimen is
    # checked, not only the last one.
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Values outside the bounds for this regimen
    outliers = tumor_volumes[(tumor_volumes < lower_bound) | (tumor_volumes > upper_bound)]
    print(f"{drug}'s potential outliers: {outliers}")






In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.
# tumor_volumes_list holds one Series per entry of selected_regimens, in the same order.

# Create a figure and axis
fig, ax = plt.subplots(figsize=(4, 3))

# Create the box plot; outliers (fliers) are drawn as large red circles
flier_style = {"marker": "o", "markerfacecolor": "red", "markersize": 10, "linestyle": "none"}
ax.boxplot(tumor_volumes_list, patch_artist=True, flierprops=flier_style)

# Name the boxes after their regimen. The tick labels are set on the axes
# instead of through boxplot's `labels=` keyword, which newer Matplotlib
# releases deprecate in favour of `tick_labels=`.
ax.set_xticklabels(selected_regimens)

# Add labels and title (the unit is written as "mm³"; the previous label
# contained a mis-encoded character)
ax.set_title("Tumor Volume Distribution by Drug Regimen")
ax.set_ylabel("Final Tumor Volume (mm³)")
ax.set_xlabel("Drug Regimen")

# Show the plot
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin

# Mouse to plot
mouse_id = "b128"

# Rows of the selected mouse, restricted to the Capomulin regimen
mouse_data = cleaned_data[(cleaned_data["Mouse ID"] == mouse_id) & (cleaned_data["Drug Regimen"] == "Capomulin")]

# Create the line plot
plt.figure(figsize=(4, 3))
plt.plot(mouse_data["Timepoint"], mouse_data["Tumor Volume (mm3)"], marker="o", linestyle="-", color="blue", linewidth=2)

# Add labels and title (the unit is written as "mm³"; the previous label
# contained a mis-encoded character)
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm³)")
plt.title(f"Tumor Volume Over Time for Mouse {mouse_id} (Capomulin)")
plt.grid(True)

# Show the plot
plt.show()


In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen

# Filter data for Capomulin regimen
capomulin_data = cleaned_data[cleaned_data["Drug Regimen"] == "Capomulin"]

# Average tumor volume of each mouse over all of its observations
avg_tumor_vol = capomulin_data.groupby("Mouse ID")["Tumor Volume (mm3)"].mean()

# Weight of each mouse, taken as the mean of its recorded weights so that
# the result is indexed by Mouse ID exactly like avg_tumor_vol
mouse_weights = capomulin_data.groupby("Mouse ID")["Weight (g)"].mean()

# Create the scatter plot
plt.figure(figsize=(4, 3))
plt.scatter(mouse_weights, avg_tumor_vol, color="blue", alpha=0.7, edgecolors="black")

# Add labels and title (the unit is written as "mm³"; the previous label
# contained a mis-encoded character)
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm³)")
plt.title("Mouse Weight vs. Average Tumor Volume (Capomulin Regimen)")
plt.grid(True)

# Show the plot
plt.show()



## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model
# for mouse weight and average observed tumor volume for the entire Capomulin regimen.
# Uses mouse_weights and avg_tumor_vol (both indexed by Mouse ID) from the scatter plot cell.

# Pearson correlation coefficient (first element of the pearsonr result)
correlation = st.pearsonr(mouse_weights, avg_tumor_vol)[0]
print(f"Correlation coefficient between mouse weight and tumor volume: {correlation:.2f}")

# Least-squares fit of average tumor volume against weight
slope, intercept, r_value, p_value, std_err = st.linregress(mouse_weights, avg_tumor_vol)

# Fitted tumor volume at each observed weight
regression_values = slope * mouse_weights + intercept

# Plot the scatter plot again
plt.figure(figsize=(4, 3))
plt.scatter(mouse_weights, avg_tumor_vol, color="blue", alpha=0.7, edgecolors="black", label="Data")

# Overlay the regression line
plt.plot(mouse_weights, regression_values, color="red", linewidth=2, label="Linear Regression")

# Add labels, title, and legend (the unit is written as "mm³"; the
# previous label contained a mis-encoded character)
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm³)")
plt.title("Mouse Weight vs. Average Tumor Volume (Capomulin Regimen)")
plt.legend()
plt.grid(True)

# Show the plot
plt.show()



