In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, normaltest, anderson

# Load the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Split volumes by the "Inside or outside" flag (0 -> inside, 1 -> outside).
# Missing volumes are dropped first: NaN values propagate into the test
# statistics and would make the results below meaningless.
inside_volumes = df.loc[df["Inside or outside"] == 0, "Volume"].dropna()
outside_volumes = df.loc[df["Inside or outside"] == 1, "Volume"].dropna()

# Normality tests, one result per group
shapiro_inside = shapiro(inside_volumes)
shapiro_outside = shapiro(outside_volumes)

dagostino_inside = normaltest(inside_volumes)
dagostino_outside = normaltest(outside_volumes)

anderson_inside = anderson(inside_volumes)
anderson_outside = anderson(outside_volumes)

# Overlay density-normalised histograms with KDE curves for both groups
plt.figure(figsize=(12, 6))
sns.histplot(inside_volumes, kde=True, color="blue", label="Inside", stat="density", bins=30)
sns.histplot(outside_volumes, kde=True, color="red", label="Outside", stat="density", bins=30)
plt.title("Distribution of Volumes for Inside and Outside Cells")
plt.xlabel("Volume")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.savefig("volume_distribution_normality.png")

# Cell output: nested summary of every test (last expression is displayed)
{
    "Shapiro-Wilk Test": {
        "Inside": {"Statistic": shapiro_inside.statistic, "p-value": shapiro_inside.pvalue},
        "Outside": {"Statistic": shapiro_outside.statistic, "p-value": shapiro_outside.pvalue}
    },
    "D'Agostino and Pearson Test": {
        "Inside": {"Statistic": dagostino_inside.statistic, "p-value": dagostino_inside.pvalue},
        "Outside": {"Statistic": dagostino_outside.statistic, "p-value": dagostino_outside.pvalue}
    },
    "Anderson-Darling Test": {
        "Inside": {"Statistic": anderson_inside.statistic, "Critical Values": anderson_inside.critical_values, "Significance Levels": anderson_inside.significance_level},
        "Outside": {"Statistic": anderson_outside.statistic, "Critical Values": anderson_outside.critical_values, "Significance Levels": anderson_outside.significance_level}
    }
}



In [None]:
import pandas as pd
from scipy.stats import shapiro, normaltest, anderson

# Read the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 8.xlsx", engine="openpyxl")

# Sphericity values with missing entries removed
sphericity_data = df['Sphericity'].dropna()

# Run all three normality tests before reporting anything
shapiro_stat, shapiro_p = shapiro(sphericity_data)
dagostino_stat, dagostino_p = normaltest(sphericity_data)
anderson_result = anderson(sphericity_data)

# Report: Shapiro-Wilk
print("Shapiro-Wilk Test:")
print(f"Statistic: {shapiro_stat}, p-value: {shapiro_p}\n")

# Report: D'Agostino and Pearson
print("D’Agostino and Pearson’s Test:")
print(f"Statistic: {dagostino_stat}, p-value: {dagostino_p}\n")

# Report: Anderson-Darling, one line per significance level
print("Anderson-Darling Test:")
print(f"Statistic: {anderson_result.statistic}")
for level, critical in zip(anderson_result.significance_level, anderson_result.critical_values):
    print(f"Significance Level {level}: Critical Value {critical}")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 4.xlsx", engine="openpyxl")

# Rename columns for clarity.
# NOTE(review): this is a positional rename - it assumes the sheet has exactly
# these eight columns in this order; confirm against the workbook.
df.columns = ["Object", "Embryo", "Time point", "Z", "Y", "X", "Volume", "Inside or outside"]

# Map inside/outside codes to readable labels
df["Location"] = df["Inside or outside"].map({0: "Inside", 1: "Outside"})

# Chronological order of the time points on the x-axis
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# Fixed hue order: without it seaborn orders string hue levels by first
# appearance, so the side and colour used for "Inside" could differ from one
# embryo's figure to the next.
location_order = ["Inside", "Outside"]

# Violin plot for all embryos combined
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x="Time point", y="Volume", hue="Location",
               order=time_order, hue_order=location_order, split=True)
plt.title("Cell Volume Distribution by Location Over Time")
plt.ylabel("Volume")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("violin_all_embryos.png")
plt.close()

# One violin plot per embryo, each saved to its own file
embryo_ids = df["Embryo"].unique()
for embryo_id in embryo_ids:
    embryo_df = df[df["Embryo"] == embryo_id]
    plt.figure(figsize=(10, 6))
    sns.violinplot(data=embryo_df, x="Time point", y="Volume", hue="Location",
                   order=time_order, hue_order=location_order, split=True)
    plt.title(f"Cell Volume Distribution for Embryo {embryo_id}")
    plt.ylabel("Volume")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"violin_embryo_{embryo_id}.png")
    plt.close()  # release the figure so the loop does not accumulate open figures

print("Violin plots have been generated and saved for all embryos and each individual embryo.")



In [None]:
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu

# Read the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 4.xlsx", engine="openpyxl")

# Kruskal-Wallis: one array of volumes per time point
groups_by_time = [volumes.values for _, volumes in df.groupby("Time point")["Volume"]]
kruskal_stat, kruskal_p = kruskal(*groups_by_time)

# Mann-Whitney U: inside (0) against outside (1) volumes, two-sided
is_inside = df["Inside or outside"] == 0
is_outside = df["Inside or outside"] == 1
inside_volumes = df.loc[is_inside, "Volume"]
outside_volumes = df.loc[is_outside, "Volume"]
mannwhitney_stat, mannwhitney_p = mannwhitneyu(inside_volumes, outside_volumes, alternative='two-sided')

# Report both tests
print("Kruskal-Wallis Test (Across Time Points):")
print(f"Statistic: {kruskal_stat:.4f}, p-value: {kruskal_p:.4e}")

print("\nMann-Whitney U Test (Inside vs Outside):")
print(f"Statistic: {mannwhitney_stat:.4f}, p-value: {mannwhitney_p:.4e}")



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 4.xlsx", engine="openpyxl")

# Time points in order of first appearance in the sheet
time_points = df["Time point"].unique()

# Compare inside (0) vs outside (1) volumes separately at every time point.
# Each entry is (time point, U statistic, p-value); the statistic and p-value
# are None when the comparison could not be run.
results = []
for tp in time_points:
    subset = df[df["Time point"] == tp]
    inside_volumes = subset.loc[subset["Inside or outside"] == 0, "Volume"]
    outside_volumes = subset.loc[subset["Inside or outside"] == 1, "Volume"]

    # The test is undefined when either group has no cells; record the gap
    # instead of letting scipy fail or return NaN (the per-embryo cells in
    # this notebook guard the same way).
    if inside_volumes.empty or outside_volumes.empty:
        results.append((tp, None, None))
        continue

    stat, p_value = mannwhitneyu(inside_volumes, outside_volumes, alternative='two-sided')
    results.append((tp, stat, p_value))

# Display the results
for tp, stat, p_value in results:
    print(f"Time Point: {tp}")
    if stat is None:
        print("  Skipped: no inside or no outside cells at this time point")
    else:
        print(f"  Mann-Whitney U Statistic: {stat:.4f}")
        print(f"  p-value: {p_value:.4e}")
    print()



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Read the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 7.xlsx", engine="openpyxl")

# One record (dict) is appended per comparison that could be run
results = []

# Distinct embryos and time points, in order of first appearance
embryos = df['Embryo'].unique()
time_points = df['Time point'].unique()

# Pass 1: inside vs outside volumes over all time points of each embryo
for embryo in embryos:
    sub_df = df[df['Embryo'] == embryo]
    inside = sub_df.loc[sub_df['Inside or outside'] == 0, 'Volume']
    outside = sub_df.loc[sub_df['Inside or outside'] == 1, 'Volume']
    if inside.empty or outside.empty:
        continue  # nothing to compare for this embryo
    stat, p = mannwhitneyu(inside, outside, alternative='two-sided')
    results.append({'Embryo': embryo, 'Time point': 'Overall', 'p-value': p})

# Pass 2: the same comparison restricted to a single time point
for embryo in embryos:
    for tp in time_points:
        in_cell = (df['Embryo'] == embryo) & (df['Time point'] == tp)
        sub_df = df[in_cell]
        inside = sub_df.loc[sub_df['Inside or outside'] == 0, 'Volume']
        outside = sub_df.loc[sub_df['Inside or outside'] == 1, 'Volume']
        if inside.empty or outside.empty:
            continue  # this embryo/time point lacks one of the two groups
        stat, p = mannwhitneyu(inside, outside, alternative='two-sided')
        results.append({'Embryo': embryo, 'Time point': tp, 'p-value': p})

# Tabulate and print
results_df = pd.DataFrame(results)
print(results_df)



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 5.xlsx", engine='openpyxl')

# Chronological order of the time points
time_order = ['1hr before cav', 'cavitation', '1hr after cav', '4hr after cav']

# Mean volume per (time point, location): rows are time points in
# chronological order, columns are the inside/outside codes
grouped = (
    df.groupby(['Time point', 'Inside or outside'])['Volume']
    .mean()
    .unstack()
    .reindex(time_order)
)

# One line per location code
plt.figure(figsize=(10, 6))
for location_code, location_label in ((0, 'Inside Cells'), (1, 'Outside Cells')):
    plt.plot(grouped.index, grouped[location_code], marker='o', label=location_label)
plt.xlabel('Time Point')
plt.ylabel('Average Cell Volume')
plt.title('Average Cell Volume Over Time for Inside and Outside Cells')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("cell_volume_over_time.png")
plt.show()



In [None]:
import pandas as pd
import numpy as np
from scipy.stats import kruskal

# Read the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 5.xlsx", engine="openpyxl")

# Strip stray whitespace from the headers, then switch to space-free names
df.columns = [name.strip() for name in df.columns]
df = df.rename(columns={"Time point": "TimePoint", "Volume": "Volume", "Inside or outside": "InsideOutside"})

# Chronological order of the time points
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# Mean volume per time point: all cells, inside cells (0), outside cells (1)
inside_cells = df[df["InsideOutside"] == 0]
outside_cells = df[df["InsideOutside"] == 1]
avg_volumes = df.groupby("TimePoint")["Volume"].mean().reindex(time_order)
avg_volumes_inside = inside_cells.groupby("TimePoint")["Volume"].mean().reindex(time_order)
avg_volumes_outside = outside_cells.groupby("TimePoint")["Volume"].mean().reindex(time_order)

# Percentage change of every entry relative to the first entry
def percentage_change(series):
    """Return each value's change from the first element of ``series``, in percent."""
    baseline = series.iloc[0]
    return (series - baseline) / baseline * 100

# Percentage change of the mean volume relative to the first time point
pct_change_all = percentage_change(avg_volumes)
pct_change_inside = percentage_change(avg_volumes_inside)
pct_change_outside = percentage_change(avg_volumes_outside)

# One array of volumes per time point (chronological order) for each cell set
grouped_all = [df[df["TimePoint"] == tp]["Volume"].values for tp in time_order]
grouped_inside = [df[(df["TimePoint"] == tp) & (df["InsideOutside"] == 0)]["Volume"].values for tp in time_order]
grouped_outside = [df[(df["TimePoint"] == tp) & (df["InsideOutside"] == 1)]["Volume"].values for tp in time_order]

# Kruskal-Wallis test across time points
kw_all = kruskal(*grouped_all)
kw_inside = kruskal(*grouped_inside)
kw_outside = kruskal(*grouped_outside)

# Display results
print("Percentage Change in Average Volume (All Cells):")
print(pct_change_all)
print("\nPercentage Change in Average Volume (Inside Cells):")
print(pct_change_inside)
print("\nPercentage Change in Average Volume (Outside Cells):")
print(pct_change_outside)

# p-values use scientific notation: fixed-point ".4f" would print any
# p-value below 0.00005 as "0.0000".
print("\nKruskal-Wallis Test Results:")
print(f"All Cells: statistic={kw_all.statistic:.4f}, p-value={kw_all.pvalue:.4e}")
print(f"Inside Cells: statistic={kw_inside.statistic:.4f}, p-value={kw_inside.pvalue:.4e}")
print(f"Outside Cells: statistic={kw_outside.statistic:.4f}, p-value={kw_outside.pvalue:.4e}")



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Read the per-cell measurements
df = pd.read_excel(
    "Embryo measurements with time point categories 5.xlsx",
    engine="openpyxl",
)

# Time points in chronological order
time_order = [
    "1hr before cav",
    "cavitation",
    "1hr after cav",
    "4hr after cav",
]

# Function to perform pairwise Mann-Whitney U tests between consecutive time points
def pairwise_tests(group_df, group_name, order=None):
    """Compare "Volume" between each pair of consecutive time points.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Rows to analyse; must provide "Time point" and "Volume" columns.
    group_name : str
        Label stored in the "Group" field of every result record.
    order : sequence of str, optional
        Time point labels in the order they should be compared. Defaults to
        the notebook-level ``time_order``.

    Returns
    -------
    list of dict
        One record per testable transition with the keys "Group",
        "Comparison", "Statistic" and "p-value". A transition is omitted when
        either of its time points has no rows in ``group_df``.
    """
    labels = list(time_order if order is None else order)
    results = []
    for t1, t2 in zip(labels, labels[1:]):
        v1 = group_df.loc[group_df["Time point"] == t1, "Volume"]
        v2 = group_df.loc[group_df["Time point"] == t2, "Volume"]
        # The test is undefined for an empty sample; skip the transition, as
        # the other pairwise helpers in this notebook do.
        if v1.empty or v2.empty:
            continue
        stat, p = mannwhitneyu(v1, v2, alternative='two-sided')
        results.append({
            "Group": group_name,
            "Comparison": f"{t1} vs {t2}",
            "Statistic": stat,
            "p-value": p
        })
    return results

# Subsets by location code: 0 = inside, 1 = outside
inside_df = df[df["Inside or outside"] == 0]
outside_df = df[df["Inside or outside"] == 1]

# Consecutive-time-point tests for every cell set
all_results = pairwise_tests(df, "All Cells")
inside_results = pairwise_tests(inside_df, "Inside Cells")
outside_results = pairwise_tests(outside_df, "Outside Cells")

# Stack the three record lists into one table
results_df = pd.DataFrame([*all_results, *inside_results, *outside_results])

# Cell output
results_df


In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 7.xlsx", engine="openpyxl")

# Filter for inside cells only
inside_df = df[df["Inside or outside"] == 0]

# Time points in chronological order
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# One record per embryo and transition that could be tested
results = []

for embryo_id, group in inside_df.groupby("Embryo"):
    group = group.copy()
    group["Time point"] = pd.Categorical(group["Time point"], categories=time_order, ordered=True)

    # Mean volume per time point. observed=True keeps only the time points
    # this embryo actually has; without it the categorical groupby emits every
    # category (NaN mean for the missing ones), which makes the membership
    # check below always true.
    mean_volumes = group.groupby("Time point", observed=True)["Volume"].mean().sort_index()

    for t1, t2 in zip(time_order, time_order[1:]):
        # Skip transitions where this embryo lacks either time point
        if t1 not in mean_volumes.index or t2 not in mean_volumes.index:
            continue

        # Percentage change of the mean volume across the transition
        pct_change = ((mean_volumes[t2] - mean_volumes[t1]) / mean_volumes[t1]) * 100

        # Two-sided Mann-Whitney U on the raw per-cell volumes
        vols_t1 = group[group["Time point"] == t1]["Volume"]
        vols_t2 = group[group["Time point"] == t2]["Volume"]
        stat, p_value = mannwhitneyu(vols_t1, vols_t2, alternative="two-sided")
        results.append({
            "Embryo": embryo_id,
            "Transition": f"{t1} → {t2}",
            "Inside % Change": round(pct_change, 2),
            "p-value": round(p_value, 5),
            "Significant": p_value < 0.05
        })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
print(results_df)



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the per-cell measurements
df = pd.read_excel("Embryo measurements with time point categories 7.xlsx", engine="openpyxl")

# Filter for outside cells only
outside_df = df[df["Inside or outside"] == 1]

# Time points in chronological order
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# embryo id -> {transition label: percentage change of the mean volume}
percentage_changes = {}

for embryo_id, group in outside_df.groupby("Embryo"):
    group = group.copy()
    group["Time point"] = pd.Categorical(group["Time point"], categories=time_order, ordered=True)

    # Mean volume per time point. observed=True keeps only the time points
    # this embryo actually has; otherwise every category is emitted (NaN for
    # the missing ones), the membership check below never fails and NaN
    # percentage changes leak into the test further down.
    mean_volumes = group.groupby("Time point", observed=True)["Volume"].mean()

    # Percentage change between consecutive time points
    changes = {}
    for t1, t2 in zip(time_order, time_order[1:]):
        if t1 in mean_volumes.index and t2 in mean_volumes.index:
            v1 = mean_volumes[t1]
            v2 = mean_volumes[t2]
            pct_change = ((v2 - v1) / v1) * 100
            changes[f"{t1} → {t2}"] = pct_change

    percentage_changes[embryo_id] = changes

# Prepare data for Mann-Whitney U tests
results = {}
transitions = [f"{time_order[i]} → {time_order[i+1]}" for i in range(len(time_order)-1)]

for transition in transitions:
    values = [changes[transition] for changes in percentage_changes.values() if transition in changes]
    # NOTE(review): the embryos are split into two arbitrary halves, so this
    # p-value only compares "first half of embryos" with "second half". It
    # does not test whether the volume change over the transition is itself
    # significant. Confirm the intended hypothesis before reporting it; the
    # inside-cell analysis tests raw volumes at t1 vs t2 per embryo instead.
    mid = len(values) // 2
    group1 = values[:mid]
    group2 = values[mid:]
    if len(group1) > 0 and len(group2) > 0:
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results[transition] = {"Group1": group1, "Group2": group2, "p-value": p}

# Display results
for transition, res in results.items():
    print(f"Transition: {transition}")
    print(f"  Group1 changes: {res['Group1']}")
    print(f"  Group2 changes: {res['Group2']}")
    print(f"  Mann-Whitney U p-value: {res['p-value']:.4f}")
    print()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn style for every figure produced below
sns.set(style="whitegrid")

# Read the sphericity measurements
df = pd.read_excel(
    "Embryo measurements sphericity 5.xlsx",
    engine="openpyxl",
)

# Time points in order of first appearance; embryo ids sorted ascending
time_points = df["Time point"].unique()
embryos = sorted(df["Embryo"].unique())

# Function to create dot plots with median and IQR
def create_dotplot(data, title, filename):
    """Save a dodged strip plot of sphericity with median/IQR markers.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; must provide the "Time point", "Sphericity" and
        "Inside or outside" (0 or 1) columns.
    title : str
        Figure title.
    filename : str
        Path the figure is written to.

    Notes
    -----
    The x-axis order comes from the notebook-level ``time_points``.
    """
    category_order = list(time_points)
    hue_levels = [0, 1]
    half_dodge = 0.2  # same +/-0.2 offset around each tick as the marker code has always used

    fig, ax = plt.subplots(figsize=(10, 6))
    # order/hue_order are pinned so the axis positions used for the markers
    # below always agree with where seaborn draws the points, even when
    # `data` is a subset that lacks a time point or a location, or lists the
    # time points in a different order.
    sns.stripplot(data=data, x="Time point", y="Sphericity", hue="Inside or outside",
                  order=category_order, hue_order=hue_levels,
                  dodge=True, jitter=True, palette={0: "skyblue", 1: "salmon"}, alpha=0.6,
                  ax=ax)

    # Capture the hue legend entries explicitly. Passing only `labels=` to
    # legend() pairs the labels with whatever artists come first on the axes,
    # which can be the black summary markers added below.
    handles, labels = ax.get_legend_handles_labels()

    # Median (diamond) and inter-quartile range (vertical bar) per group
    for position, tp in enumerate(category_order):
        for io in hue_levels:
            subset = data[(data["Time point"] == tp) & (data["Inside or outside"] == io)]
            if subset.empty:
                continue
            median = subset["Sphericity"].median()
            q1 = subset["Sphericity"].quantile(0.25)
            q3 = subset["Sphericity"].quantile(0.75)
            xpos = position + (-half_dodge if io == 0 else half_dodge)
            ax.plot([xpos], [median], marker="D", color="black")
            ax.vlines(x=xpos, ymin=q1, ymax=q3, color="black", linewidth=2)

    ax.set_title(title)
    ax.set_ylabel("Sphericity")
    location_names = {"0": "Inside", "1": "Outside"}
    ax.legend(handles, [location_names.get(str(label), label) for label in labels],
              title="Cell Location")
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)

# Pooled figure covering every embryo
create_dotplot(df, "Sphericity Dot Plot - All Embryos", "dotplot_all_embryos.png")

# One figure per embryo
for embryo in embryos:
    embryo_data = df[df["Embryo"] == embryo]
    embryo_title = f"Sphericity Dot Plot - Embryo {embryo}"
    embryo_file = f"dotplot_embryo_{embryo}.png"
    create_dotplot(embryo_data, embryo_title, embryo_file)

print("Dot plots with median and IQR annotations have been saved.")



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kruskal, mannwhitneyu

# Load the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 3.xlsx", engine="openpyxl")

# Ellipsoid volume from the three axis lengths:
# V = (4/3) * pi * (x/2) * (y/2) * (z/2)
df["Volume"] = (4/3) * np.pi * (df["X"]/2) * (df["Y"]/2) * (df["Z"]/2)

# Overall comparison across time points using Kruskal-Wallis test
grouped_by_time = [group["Volume"].values for name, group in df.groupby("Time point")]
kruskal_result = kruskal(*grouped_by_time)

# Inside (0) vs outside (1) using Mann-Whitney U. The alternative is spelled
# out so the result matches the other cells in this notebook and does not
# depend on the scipy version's default.
inside_volumes = df[df["Inside or outside"] == 0]["Volume"]
outside_volumes = df[df["Inside or outside"] == 1]["Volume"]
mannwhitney_result = mannwhitneyu(inside_volumes, outside_volumes, alternative='two-sided')

# Visualization: Boxplot of volume across time points
plt.figure(figsize=(12, 6))
sns.boxplot(x="Time point", y="Volume", data=df)
plt.title("Cell Volume Across Time Points")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("volume_across_timepoints.png")
plt.close()

# Visualization: Boxplot of volume for inside vs outside.
# order=[0, 1] pins the box positions so the tick labels set below are
# guaranteed to name the right box.
plt.figure(figsize=(8, 6))
sns.boxplot(x="Inside or outside", y="Volume", data=df, order=[0, 1])
plt.title("Cell Volume: Inside vs Outside")
plt.xticks([0, 1], ["Inside", "Outside"])
plt.tight_layout()
plt.savefig("volume_inside_vs_outside.png")
plt.close()

# Print statistical test results
print("Kruskal-Wallis Test across Time Points:")
print(f"Statistic: {kruskal_result.statistic}, p-value: {kruskal_result.pvalue}\n")

print("Mann-Whitney U Test between Inside vs Outside:")
print(f"Statistic: {mannwhitney_result.statistic}, p-value: {mannwhitney_result.pvalue}")



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu

# Read the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 3.xlsx", engine="openpyxl")

# Strip header whitespace and give the location column a space-free name
df.columns = [name.strip() for name in df.columns]
df = df.rename(columns={"Inside or outside": "Inside_outside"})

# Time points in order of first appearance
time_points = df["Time point"].unique()

# One record per time point where both groups are present
results = []

for tp in time_points:
    subset = df[df["Time point"] == tp]
    inside = subset.loc[subset["Inside_outside"] == 0, "Sphericity"]
    outside = subset.loc[subset["Inside_outside"] == 1, "Sphericity"]

    if inside.empty or outside.empty:
        continue  # nothing to compare at this time point

    stat, p = mannwhitneyu(inside, outside, alternative='two-sided')
    results.append({"Time point": tp, "Statistic": stat, "p-value": p})

# Tabulate
results_df = pd.DataFrame(results)

# Report
print("Mann-Whitney U Test Results for Sphericity (Inside vs Outside) at Each Time Point:")
print(results_df)

# Box plot of sphericity per time point, split by location code
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x="Time point", y="Sphericity", hue="Inside_outside")
plt.title("Sphericity Comparison Between Inside and Outside Cells Across Time Points")
plt.xlabel("Time Point")
plt.ylabel("Sphericity")
plt.legend(title="Inside (0) vs Outside (1)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("sphericity_comparison.png")
plt.show()



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Read the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 5.xlsx", engine="openpyxl")

# Maps a row label to that comparison's result
results = {}

# Distinct embryos and time points, in order of first appearance
embryos = df['Embryo'].unique()
timepoints = df['Time point'].unique()

# Pass 1: inside vs outside sphericity over all time points of each embryo
for embryo_id in embryos:
    sub_df = df[df['Embryo'] == embryo_id]
    inside = sub_df.loc[sub_df['Inside or outside'] == 0, 'Sphericity']
    outside = sub_df.loc[sub_df['Inside or outside'] == 1, 'Sphericity']
    row_label = f'Embryo {embryo_id} Overall'
    if inside.empty or outside.empty:
        results[row_label] = {'U-statistic': None, 'p-value': None, 'Note': 'Insufficient data'}
    else:
        stat, p = mannwhitneyu(inside, outside, alternative='two-sided')
        results[row_label] = {'U-statistic': stat, 'p-value': p}

# Pass 2: the same comparison for every embryo/time point combination
for embryo_id in embryos:
    for tp in timepoints:
        in_cell = (df['Embryo'] == embryo_id) & (df['Time point'] == tp)
        sub_df = df[in_cell]
        inside = sub_df.loc[sub_df['Inside or outside'] == 0, 'Sphericity']
        outside = sub_df.loc[sub_df['Inside or outside'] == 1, 'Sphericity']
        row_label = f'Embryo {embryo_id} - {tp}'
        if inside.empty or outside.empty:
            results[row_label] = {'U-statistic': None, 'p-value': None, 'Note': 'Insufficient data'}
        else:
            stat, p = mannwhitneyu(inside, outside, alternative='two-sided')
            results[row_label] = {'U-statistic': stat, 'p-value': p}

# One row per label; write to CSV and show the first rows
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv("mann_whitney_results.csv")
results_df.head()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 2.xlsx", engine="openpyxl")

# Chronological order of the time points
time_order = ['1hr before cav', 'cavitation', '1hr after cav', '4hr after cav']

# Mean sphericity per (time point, location) in long form
grouped = df.groupby(['Time point', 'Inside or outside'])['Sphericity'].mean().reset_index()

# Wide form: rows are time points (chronological), columns are location codes
pivot_df = grouped.pivot(
    index='Time point',
    columns='Inside or outside',
    values='Sphericity',
).reindex(time_order)

# One line per location code
plt.figure(figsize=(10, 6))
for location_code, location_label in ((0, 'Inside Cells'), (1, 'Outside Cells')):
    plt.plot(pivot_df.index, pivot_df[location_code], marker='o', label=location_label)
plt.xlabel('Time Point')
plt.ylabel('Average Sphericity')
plt.title('Average Sphericity of Inside vs Outside Cells Across Time Points')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("sphericity_line_graph.png")
plt.show()



In [None]:
import pandas as pd

# Read the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 3.xlsx", engine="openpyxl")

# Strip stray whitespace from the headers
df.columns = [name.strip() for name in df.columns]

# Chronological order of the time points
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# Mean sphericity per time point and location code, rows in chronological order
grouped = (
    df.groupby(["Time point", "Inside or outside"])["Sphericity"]
    .mean()
    .unstack()
    .loc[time_order]
)

# Mean sphericity per time point over all cells, same row order
overall = df.groupby("Time point")["Sphericity"].mean().loc[time_order]

# Percentage change between each entry and the one before it
def pct_change(series):
    """Return consecutive changes of ``series`` in percent, with NaN entries dropped."""
    fractional = series.pct_change()
    return 100 * fractional.dropna()

# Consecutive percentage changes for inside (0), outside (1) and all cells
inside_pct = pct_change(grouped[0])
outside_pct = pct_change(grouped[1])
overall_pct = pct_change(overall)

# Summary table: one column per cell set, rounded to two decimals
percent_columns = {
    "Inside (%)": inside_pct,
    "Outside (%)": outside_pct,
    "Overall (%)": overall_pct,
}
summary = pd.DataFrame({name: values.round(2) for name, values in percent_columns.items()})

print("Verified Percentage Changes in Average Sphericity Between Time Points:")
print(summary)



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Read the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 3.xlsx", engine="openpyxl")

# Strip header whitespace, then switch to space-free column names
df.columns = [name.strip() for name in df.columns]
df = df.rename(columns={"Time point": "TimePoint", "Sphericity": "Sphericity", "Inside or outside": "InsideOutside"})

# Time points in chronological order
time_order = [
    "1hr before cav",
    "cavitation",
    "1hr after cav",
    "4hr after cav",
]

# Two-sided Mann-Whitney U tests on sphericity between consecutive time points
def compare_distributions(group_df, label):
    """Test "Sphericity" between each pair of consecutive entries of ``time_order``.

    ``group_df`` must provide "TimePoint" and "Sphericity" columns; ``label``
    is stored in the "Group" field of every record. Returns a list of dicts,
    one per transition for which both time points have rows.
    """
    results = []
    for tp1, tp2 in zip(time_order[:-1], time_order[1:]):
        s1 = group_df.loc[group_df["TimePoint"] == tp1, "Sphericity"]
        s2 = group_df.loc[group_df["TimePoint"] == tp2, "Sphericity"]
        if s1.empty or s2.empty:
            continue  # transition cannot be tested for this group
        stat, p = mannwhitneyu(s1, s2, alternative='two-sided')
        verdict = "Yes" if p < 0.05 else "No"
        results.append({
            "Group": label,
            "Time Transition": f"{tp1} → {tp2}",
            "U Statistic": stat,
            "p-value": p,
            "Significant": verdict
        })
    return results

# Location subsets: 0 = inside, 1 = outside
inside_cells = df[df["InsideOutside"] == 0]
outside_cells = df[df["InsideOutside"] == 1]

# Transition tests for all cells, inside cells and outside cells
overall_results = compare_distributions(df, "Overall")
inside_results = compare_distributions(inside_cells, "Inside")
outside_results = compare_distributions(outside_cells, "Outside")

# Stack the three record lists into one table
all_results = pd.DataFrame([*overall_results, *inside_results, *outside_results])

# Persist without the index column
all_results.to_csv("sphericity_distribution_comparison.csv", index=False)

# Cell output
all_results



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 5.xlsx", engine="openpyxl")

# Rename columns for easier access.
# NOTE(review): positional rename - assumes the sheet has exactly these eight
# columns in this order; confirm against the workbook.
df.columns = ["Object", "Embryo", "Time point", "Z", "Y", "X", "Sphericity", "Inside or outside"]

# Time points in chronological order; only the first and last are compared
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# One record per embryo, plus one summary record carrying the test result
results = []

for embryo_id in df["Embryo"].unique():
    embryo_data = df[df["Embryo"] == embryo_id]

    # Percentage change of the mean sphericity from the first to the last
    # time point, per location (None when either time point has no cells)
    pct_changes = {"inside": None, "outside": None}
    for code, name in ((0, "inside"), (1, "outside")):
        group_data = embryo_data[embryo_data["Inside or outside"] == code]
        first_tp = group_data[group_data["Time point"] == time_order[0]]["Sphericity"]
        last_tp = group_data[group_data["Time point"] == time_order[-1]]["Sphericity"]

        if not first_tp.empty and not last_tp.empty:
            first_mean = first_tp.mean()
            last_mean = last_tp.mean()
            pct_changes[name] = ((last_mean - first_mean) / first_mean) * 100

    # No per-embryo test: each embryo contributes a single value per group,
    # and a Mann-Whitney U test on one value against one value always
    # returns p = 1.0, whatever the data.
    results.append({
        "Embryo": embryo_id,
        "Inside % Change": pct_changes["inside"],
        "Outside % Change": pct_changes["outside"],
        "U Statistic": None,
        "p-value": None
    })

# A single test across embryos: inside changes vs outside changes
inside_changes = [r["Inside % Change"] for r in results if pd.notna(r["Inside % Change"])]
outside_changes = [r["Outside % Change"] for r in results if pd.notna(r["Outside % Change"])]
if inside_changes and outside_changes:
    stat, p_value = mannwhitneyu(inside_changes, outside_changes, alternative="two-sided")
    results.append({
        "Embryo": "All embryos (inside vs outside)",
        "Inside % Change": None,
        "Outside % Change": None,
        "U Statistic": stat,
        "p-value": p_value
    })

# Convert results to DataFrame and save
results_df = pd.DataFrame(results)
results_df.to_csv("sphericity_percentage_change_mannwhitney.csv", index=False)

print("Mann-Whitney U test results saved to 'sphericity_percentage_change_mannwhitney.csv'.")



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the sphericity measurements
df = pd.read_excel("Embryo measurements sphericity 5.xlsx", engine="openpyxl")

# Rename columns for easier access.
# NOTE(review): positional rename - assumes the sheet has exactly these eight
# columns in this order; confirm against the workbook.
df.columns = ["Object", "Embryo", "Time point", "Z", "Y", "X", "Sphericity", "Inside or outside"]

# Time points in chronological order
timepoints = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# Records for transitions whose test is significant at the 0.05 level
significant_results = []

for embryo_id in sorted(df["Embryo"].unique()):
    embryo_data = df[df["Embryo"] == embryo_id]

    for tp1, tp2 in zip(timepoints, timepoints[1:]):
        data_tp1 = embryo_data[embryo_data["Time point"] == tp1]
        data_tp2 = embryo_data[embryo_data["Time point"] == tp2]

        # Percentage change of the mean sphericity per location code
        # (0 = inside, 1 = outside). The dict is rebuilt for every transition
        # so that a group with no cells reports None instead of raising
        # NameError or reusing the value left over from an earlier iteration.
        changes = {0: None, 1: None}
        for group in (0, 1):
            s1 = data_tp1[data_tp1["Inside or outside"] == group]["Sphericity"]
            s2 = data_tp2[data_tp2["Inside or outside"] == group]["Sphericity"]
            if not s1.empty and not s2.empty:
                mean1 = s1.mean()
                mean2 = s2.mean()
                changes[group] = ((mean2 - mean1) / mean1) * 100

        # Two-sided Mann-Whitney U on inside vs outside sphericity at the
        # later time point (tp2) - not on the percentage changes themselves.
        inside_values = data_tp2[data_tp2["Inside or outside"] == 0]["Sphericity"].values
        outside_values = data_tp2[data_tp2["Inside or outside"] == 1]["Sphericity"].values
        if len(inside_values) > 0 and len(outside_values) > 0:
            stat, p = mannwhitneyu(inside_values, outside_values, alternative='two-sided')
            if p < 0.05:
                significant_results.append({
                    "Embryo": embryo_id,
                    "Transition": f"{tp1} → {tp2}",
                    "Inside % Change": None if changes[0] is None else round(changes[0], 2),
                    "Outside % Change": None if changes[1] is None else round(changes[1], 2),
                    "p-value": round(p, 5)
                })

# Convert results to DataFrame and display
significant_df = pd.DataFrame(significant_results)
print(significant_df)



In [None]:
import pandas as pd

# Read the per-cell division-stage spreadsheet
df = pd.read_excel("Division stage of each cell 1.xlsx", engine="openpyxl")

# Keep the columns at positions 1-4 and give them descriptive names
# (Embryo, Time point, Inside or outside, Division stage)
df = df.iloc[:, [1, 2, 3, 4]].set_axis(
    ["Embryo", "Time point", "Inside or outside", "Division stage"], axis=1
)

# Discard every row that belongs to embryo 4
df = df[df["Embryo"] != 4]

# Extract the first run of digits from each stage label (e.g. '5th' -> 5.0)
stage_digits = df["Division stage"].str.extract(r'(\d+)')
df["Division stage"] = stage_digits.astype(float)

# Mean division stage per (time point, location); locations are spread into columns
stage_by_group = df.groupby(["Time point", "Inside or outside"])["Division stage"]
result = stage_by_group.mean().unstack()

print("Average division stage by time point and cell location (0=inside, 1=outside):", result, sep="\n")



In [None]:
import pandas as pd
import numpy as np
from scipy.stats import percentileofscore

# Read the division-stage table
df = pd.read_excel("Division stage of each cell.xlsx", engine="openpyxl")

# Preprocessing: exclude embryo 5 and discard rows without a division stage
df = df[df['Embryo'] != 5].dropna(subset=['Division stage'])

# Turn each stage label into an int by extracting its first run of digits
stage_digits = df['Division stage'].str.extract(r'(\d+)')
df['Division stage'] = stage_digits.astype(int)

# Distinct time points present in the cleaned table
time_points = df['Time point'].unique()

# Function to compute observed percentage
def compute_percentage(data):
    """Percentage of inside cells at an earlier stage than some outside cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Inside or outside' (0 selects inside cells,
        1 selects outside cells) and 'Division stage' (numeric).

    Returns
    -------
    float
        ``100 * count / n_inside`` where ``count`` is the number of inside
        cells whose division stage is strictly lower than the stage of at
        least one outside cell. ``np.nan`` when there are no inside cells;
        ``0.0`` when there are inside cells but no outside cells.
    """
    inside_stage = data.loc[data['Inside or outside'] == 0, 'Division stage']
    outside_stage = data.loc[data['Inside or outside'] == 1, 'Division stage']
    if len(inside_stage) == 0:
        return np.nan
    # "x is below at least one outside stage" is the same as "x is below the
    # largest outside stage", so a single vectorized comparison replaces the
    # per-cell Python loop. With no outside cells max() is NaN and every
    # comparison is False, which yields 0 as before.
    count = int((inside_stage < outside_stage.max()).sum())
    return 100 * count / len(inside_stage)

# Function to perform permutation test
def permutation_test(data, n_permutations=1000, seed=None):
    """One-sided permutation test of the statistic from ``compute_percentage``.

    The 'Inside or outside' labels are shuffled ``n_permutations`` times on a
    copy of ``data`` (the caller's frame is not modified) and the statistic is
    recomputed for every shuffle.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame accepted by ``compute_percentage``.
    n_permutations : int, default 1000
        Number of label shuffles.
    seed : int or None, default None
        When given, shuffles come from ``np.random.default_rng(seed)`` so the
        result is reproducible. When None, the global ``np.random`` state is
        used, exactly as before.

    Returns
    -------
    (observed, p_value) : tuple of float
        ``observed`` is the statistic on the unshuffled data and ``p_value``
        the fraction of shuffles whose statistic is >= ``observed``. Both are
        NaN when the statistic is undefined (no inside cells).
    """
    observed = compute_percentage(data)
    if np.isnan(observed):
        # Every `stat >= NaN` comparison is False, which used to be reported
        # as p = 0.0 (i.e. "highly significant"). Report "undefined" instead.
        return observed, np.nan

    rng = np.random if seed is None else np.random.default_rng(seed)
    combined = data.copy()
    n_at_least_observed = 0
    for _ in range(n_permutations):
        combined['Inside or outside'] = rng.permutation(combined['Inside or outside'].values)
        if compute_percentage(combined) >= observed:
            n_at_least_observed += 1
    # NOTE(review): this estimator can return exactly 0; the commonly
    # recommended (count + 1) / (n_permutations + 1) form avoids that but
    # would change reported numbers, so it is left as is.
    p_value = n_at_least_observed / n_permutations
    return observed, p_value

# Evaluate each time point separately, collecting
# (time point, observed percentage, permutation p-value) triples
results = []
for tp in time_points:
    subset = df.loc[df['Time point'] == tp]
    observed, p_value = permutation_test(subset)
    results += [(tp, observed, p_value)]

# Report one line per time point
for row in results:
    tp, perc, pval = row
    print(f"{tp}: {perc:.2f}% of inside cells at earlier stage (p-value = {pval:.4f})")



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Load the Excel file
df = pd.read_excel("Division stage of each cell 1.xlsx", engine="openpyxl")

# Clean and preprocess the data
df = df[df['Embryo'] != 4]  # Exclude embryo 4
# Keep only the columns used below and drop rows with any missing value
df = df[['Time point', 'Inside or outside', 'Division stage']].dropna()

# Convert division stage to numeric (e.g., '5th' -> 5)
df['Division stage'] = df['Division stage'].str.extract(r'(\d+)').astype(int)

# Perform a two-sided Mann-Whitney U test (inside vs. outside division stage)
# for each time point; results maps time point -> p-value
results = {}
for time_point in df['Time point'].unique():
    subset = df[df['Time point'] == time_point]
    inside = subset[subset['Inside or outside'] == 0]['Division stage']
    outside = subset[subset['Inside or outside'] == 1]['Division stage']
    if inside.empty or outside.empty:
        # The test needs observations in both groups. Depending on the SciPy
        # version an empty sample raises or yields NaN with a warning, so the
        # undefined result is recorded explicitly and the loop keeps going.
        results[time_point] = float("nan")
        continue
    stat, p_value = mannwhitneyu(inside, outside, alternative='two-sided')
    results[time_point] = p_value

# Print the p-values for each time point ("nan" marks a time point that
# lacked inside or outside cells)
for time_point, p_value in results.items():
    print(f"{time_point}: p-value = {p_value:.4f}")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the volume measurements
df_volume = pd.read_excel("Embryo measurements with time point categories 6.xlsx", engine="openpyxl")

# Tally how many rows carry each class label (0 = inside, 1 = outside)
class_counts = df_volume['Inside or outside'].value_counts()

# Text summary of the tally
print("Class Distribution:")
print(class_counts)

# Bar chart of the same tally, drawn on an explicit Axes with horizontal tick labels
fig, ax = plt.subplots(figsize=(6, 4))
class_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax, rot=0)
ax.set_title('Class Distribution: Inside vs Outside Cells')
ax.set_xlabel('Class (0 = Inside, 1 = Outside)')
ax.set_ylabel('Number of Samples')
fig.tight_layout()
plt.show()



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load the datasets
df_volume = pd.read_excel("Embryo measurements with time point categories 6.xlsx", engine="openpyxl")
df_sphericity = pd.read_excel("Embryo measurements sphericity 4.xlsx", engine="openpyxl")

# Attach Sphericity to the volume table by matching rows on 'Object'.
# NOTE(review): `common_cols` is not used by the merge, which joins on
# 'Object' alone. If Object ids can repeat (e.g. across embryos or time
# points) the join would duplicate rows - confirm the ids are unique.
common_cols = ['Object', 'Embryo', 'Time point', 'Z', 'Y', 'X', 'Inside or outside']
df_merged = pd.merge(df_volume, df_sphericity[['Object', 'Sphericity']], on='Object')
df_merged = df_merged[['Volume', 'Sphericity', 'Inside or outside']]

# Features and target
X = df_merged[['Volume', 'Sphericity']]
y = df_merged['Inside or outside']

# Hold out the test set BEFORE oversampling, stratified so both classes keep
# their original proportions in each part. Oversampling first would let
# synthetic points interpolated from test rows leak into training and would
# put synthetic rows into the test set, inflating the reported AUC.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Balance the classes with SMOTE on the training portion only; the test set
# stays untouched, real data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train Logistic Regression on the balanced training data, score the held-out set
lr = LogisticRegression()
lr.fit(X_resampled, y_resampled)
y_prob_lr = lr.predict_proba(X_test)[:, 1]  # predicted probability of class 1
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
auc_lr = auc(fpr_lr, tpr_lr)

# Train Random Forest on the same balanced training data
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)
y_prob_rf = rf.predict_proba(X_test)[:, 1]  # predicted probability of class 1
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
auc_rf = auc(fpr_rf, tpr_rf)

# Plot both ROC curves against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.2f})')
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.savefig("roc_comparison.png")
plt.show()



In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Read the volume and sphericity spreadsheets
volume_df = pd.read_excel("Embryo measurements with time point categories 6.xlsx", engine="openpyxl")
sphericity_df = pd.read_excel("Embryo measurements sphericity 4.xlsx", engine="openpyxl")

# Join the Sphericity column onto the volume table via the shared 'Object' key
sphericity_by_object = sphericity_df[['Object', 'Sphericity']]
merged_df = volume_df.merge(sphericity_by_object, on='Object')

# Predictors (Volume, Sphericity) and class label
X = merged_df[['Volume', 'Sphericity']]
y = merged_df['Inside or outside']

# Oversample with SMOTE so both classes are equally represented
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Fit a random forest on the balanced data (fit returns the estimator itself)
rf = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)

# Importance score reported by the forest for each predictor
features = X.columns
importances = rf.feature_importances_

# Bar chart of the importances, saved to disk and then shown
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(features, importances, color='skyblue')
ax.set_title('Feature Importance from Random Forest')
ax.set_ylabel('Importance Score')
fig.tight_layout()
fig.savefig("feature_importance.png")
plt.show()

