In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw embryo measurements from the Excel workbook.
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Strip stray whitespace from the headers and give the location column a
# space-free name so it is easier to reference.
df.columns = [col.strip() for col in df.columns]
df = df.rename(columns={"Inside or outside": "InsideOutside"})

# Replace the numeric location codes (0 / 1) with readable labels for the legend.
df["InsideOutside"] = df["InsideOutside"].map({0: "Inside", 1: "Outside"})

# Chronological order of the time points. Without an explicit `order`, seaborn
# places string categories in the order they first appear in the sheet, which
# is not guaranteed to be chronological.
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# Split violin plot: one violin per time point, one half per cell location.
plt.figure(figsize=(12, 6))
sns.violinplot(
    data=df,
    x="Time point",
    y="Volume",
    hue="InsideOutside",
    order=time_order,
    hue_order=["Inside", "Outside"],  # fixed sides/colours regardless of row order
    split=True,
)
plt.title("Violin Plot of Embryo Volumes by Time Point and Inside/Outside Status")
plt.ylabel("Volume")
plt.xlabel("Time Point")
plt.xticks(rotation=45)
plt.legend(title="Cell Location")
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the Excel file
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Make "Time point" an ordered categorical so grouping and plotting follow
# chronological order instead of alphabetical order.
time_order = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]
df["Time point"] = pd.Categorical(df["Time point"], categories=time_order, ordered=True)

# Mean volume per (time point, location code).
# `observed=False` is passed explicitly: leaving it unset on a categorical key
# raises a FutureWarning on recent pandas, and keeping unobserved categories
# ensures every time point keeps its slot on the x-axis.
grouped = (
    df.groupby(["Time point", "Inside or outside"], observed=False)["Volume"]
    .mean()
    .reset_index()
)

# Reshape to one row per time point and one column per location code (0 / 1).
pivot_df = grouped.pivot(index="Time point", columns="Inside or outside", values="Volume")

# Plain string labels keep matplotlib's category axis independent of how a
# given matplotlib version unpacks a pandas CategoricalIndex.
x_labels = pivot_df.index.astype(str)

# Plot the line chart
plt.figure(figsize=(10, 6))
plt.plot(x_labels, pivot_df[0], marker='o', label='Inside (0)')
plt.plot(x_labels, pivot_df[1], marker='o', label='Outside (1)')
plt.title("Average Nuclei Volumes Over Time")
plt.xlabel("Time Point")
plt.ylabel("Average Volume")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, normaltest, anderson

# Read the embryo measurements from the workbook.
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Volume values for each location code (0 -> inside, 1 -> outside).
location_code = df["Inside or outside"]
inside_volumes = df.loc[location_code == 0, "Volume"]
outside_volumes = df.loc[location_code == 1, "Volume"]

# Three normality tests, each run once per group.
shapiro_inside, shapiro_outside = shapiro(inside_volumes), shapiro(outside_volumes)
dagostino_inside, dagostino_outside = normaltest(inside_volumes), normaltest(outside_volumes)
anderson_inside, anderson_outside = anderson(inside_volumes), anderson(outside_volumes)

# Overlay density histograms (with KDE curves) of the two groups and save the figure.
plt.figure(figsize=(12, 6))
for volumes, colour, name in (
    (inside_volumes, "blue", "Inside"),
    (outside_volumes, "red", "Outside"),
):
    sns.histplot(volumes, kde=True, color=colour, label=name, stat="density", bins=30)
plt.title("Distribution of Volumes for Inside and Outside Cells")
plt.xlabel("Volume")
plt.ylabel("Density")
plt.legend()
plt.tight_layout()
plt.savefig("volume_distribution_normality.png")

# The cell's final expression: a nested dict of test results, shown as the cell output.
{
    "Shapiro-Wilk Test": {
        name: {"Statistic": res.statistic, "p-value": res.pvalue}
        for name, res in (("Inside", shapiro_inside), ("Outside", shapiro_outside))
    },
    "D'Agostino and Pearson Test": {
        name: {"Statistic": res.statistic, "p-value": res.pvalue}
        for name, res in (("Inside", dagostino_inside), ("Outside", dagostino_outside))
    },
    "Anderson-Darling Test": {
        name: {
            "Statistic": res.statistic,
            "Critical Values": res.critical_values,
            "Significance Levels": res.significance_level,
        }
        for name, res in (("Inside", anderson_inside), ("Outside", anderson_outside))
    },
}



In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Read the embryo measurements from the workbook.
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Time points, listed in the order the results table should follow.
time_points = ["1hr before cav", "cavitation", "1hr after cav", "4hr after cav"]

# For every time point, compare inside (code 0) vs outside (code 1) volumes
# with a two-sided Mann-Whitney U test and collect one record per test.
results = []
for label in time_points:
    at_time = df.loc[df["Time point"] == label]
    codes = at_time["Inside or outside"]
    u_stat, p = mannwhitneyu(
        at_time.loc[codes == 0, "Volume"],
        at_time.loc[codes == 1, "Volume"],
        alternative='two-sided',
    )
    results.append({"Time Point": label, "U Statistic": u_stat, "p-value": p})

# Tabulate the records; the bare expression renders the table as the cell output.
results_df = pd.DataFrame(results)
results_df


In [None]:
import pandas as pd

# Read the embryo measurements from the workbook.
df = pd.read_excel(
    "Embryo measurements with time point categories 2.xlsx",
    engine="openpyxl",
)

# Turn "Time point" into an ordered categorical whose category order is the
# order listed here, so grouped results come out in this sequence.
time_order = [
    "1hr before cav",
    "cavitation",
    "1hr after cav",
    "4hr after cav",
]
df["Time point"] = pd.Categorical(df["Time point"], categories=time_order, ordered=True)

def compute_percentage_change(group):
    """Return the percentage change in mean Volume between consecutive time points.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a "Time point" column and a numeric "Volume" column.
        "Time point" is expected to be an ordered categorical so that the
        grouped means are sorted chronologically.

    Returns
    -------
    pandas.Series
        Percentage change of the mean volume, indexed by the time point the
        change arrives AT (the first time point has no predecessor and is
        dropped). A time point with no rows yields no entry, and neither does
        the time point right after it, because no consecutive pair exists.
    """
    # observed=False keeps every declared category, so a time point with no
    # rows shows up as NaN instead of silently vanishing and making two
    # non-adjacent time points look consecutive.
    avg_volumes = group.groupby("Time point", observed=False)["Volume"].mean()
    # fill_method=None disables the legacy forward-fill, which would otherwise
    # report a spurious 0 % change for a time point that has no data.
    pct_change = avg_volumes.pct_change(fill_method=None).dropna() * 100
    return pct_change

# Percentage change of the mean volume for all cells together ...
overall_pct_change = compute_percentage_change(df)

# ... and separately for inside (code 0) and outside (code 1) cells.
inside_pct_change = compute_percentage_change(df[df["Inside or outside"] == 0])
outside_pct_change = compute_percentage_change(df[df["Inside or outside"] == 1])

# One column per group; rows are aligned on the shared time-point index.
pct_change_df = pd.DataFrame({
    "All Cells": overall_pct_change,
    "Inside Cells": inside_pct_change,
    "Outside Cells": outside_pct_change
})

# Each row's index label is the time point the change arrives AT:
# compute_percentage_change drops the first time point, which has no
# predecessor. Label that column as the "to" end and derive the "from" end
# from the preceding entry in time_order, instead of calling the arrival
# point "From Time Point".
pct_change_df = pct_change_df.rename_axis("To Time Point").reset_index()
previous_time_point = dict(zip(time_order[1:], time_order[:-1]))
pct_change_df.insert(
    0,
    "From Time Point",
    [previous_time_point.get(tp) for tp in pct_change_df["To Time Point"]],
)

print(pct_change_df)



In [None]:
import pandas as pd
from scipy.stats import wilcoxon
import numpy as np

# Read the embryo measurements from the workbook.
df = pd.read_excel(
    "Embryo measurements with time point categories 2.xlsx",
    engine="openpyxl",
)

# Sequence in which the time points are arranged for the calculations below.
time_order = [
    "1hr before cav",
    "cavitation",
    "1hr after cav",
    "4hr after cav",
]

def compute_avg_volumes(data):
    """Return the mean "Volume" per "Time point", arranged in ``time_order`` order.

    Time points listed in ``time_order`` that have no rows in ``data`` come
    back as NaN, because the grouped means are reindexed onto ``time_order``.
    """
    mean_by_time_point = data.groupby("Time point")["Volume"].mean()
    return mean_by_time_point.reindex(time_order)

# Per-time-point mean volumes: all cells, then inside (code 0) and outside (code 1) cells.
location_code = df["Inside or outside"]
avg_all = compute_avg_volumes(df)
avg_inside = compute_avg_volumes(df[location_code == 0])
avg_outside = compute_avg_volumes(df[location_code == 1])

def compute_percentage_changes(avg_series):
    """Return the percentage change between each pair of consecutive entries.

    ``avg_series`` is a pandas Series of values in the order they should be
    compared. The result is a NumPy array one element shorter than the input,
    where element ``i`` is ``(v[i + 1] - v[i]) / v[i] * 100``.
    """
    values = avg_series.values
    earlier, later = values[:-1], values[1:]
    return ((later - earlier) / earlier) * 100

# Percentage change between consecutive time points, per group.
pct_all = compute_percentage_changes(avg_all)
pct_inside = compute_percentage_changes(avg_inside)
pct_outside = compute_percentage_changes(avg_outside)

# One-sample Wilcoxon signed-rank test on each group's percentage changes.
# NOTE(review): with four time points each array holds only three values, so
# the test cannot return a small p-value however large the changes are —
# confirm this is the intended analysis before interpreting the results.
wilcoxon_all = wilcoxon(pct_all)
wilcoxon_inside = wilcoxon(pct_inside)
wilcoxon_outside = wilcoxon(pct_outside)

# Summarise the three tests in a table, one row per group.
group_tests = [
    ("All Cells", wilcoxon_all),
    ("Inside Cells", wilcoxon_inside),
    ("Outside Cells", wilcoxon_outside),
]
results = pd.DataFrame({
    "Group": [name for name, _ in group_tests],
    "Wilcoxon Statistic": [res.statistic for _, res in group_tests],
    "p-value": [res.pvalue for _, res in group_tests],
})

print("Wilcoxon Signed-Rank Test Results for Percentage Changes Between Time Points:")
print(results)



In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import statsmodels.api as sm

# Read the embryo measurements from the workbook.
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Predictors (coordinates and volume) and the binary location label to predict.
features = ['Z', 'Y', 'X', 'Volume']
target = 'Inside or outside'
X, y = df[features], df[target]

# Hold out 20 % of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scikit-learn logistic regression: fit on the training rows, predict the held-out rows.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out performance metrics.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Same training data fitted with statsmodels to get a coefficient summary table.
# statsmodels does not add an intercept on its own, hence add_constant.
X_train_sm = sm.add_constant(X_train)
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit(disp=False)

# Report: statsmodels summary first, then the scikit-learn metrics.
print("Model Summary (Statsmodels):")
print(result.summary())
print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Select features and target
features = ['Z', 'Y', 'X', 'Volume']
X = df[features]
y = df['Inside or outside']

# Hold out the test set BEFORE oversampling. SMOTE creates synthetic minority
# samples by interpolating between real ones; resampling before the split puts
# synthetic points in the test set and lets points derived from test rows leak
# into training, which inflates every reported metric.
# `stratify=y` keeps the original class ratio in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the classes in the training fold only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train a Random Forest classifier on the balanced training data
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)

# Predict on the untouched test set, which holds real samples only
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Bare tuple so the notebook displays all three results as the cell output.
accuracy, conf_matrix, class_report



In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_excel("Embryo measurements with time point categories 2.xlsx", engine="openpyxl")

# Select features and target
features = ['Z', 'Y', 'X', 'Volume']
X = df[features]
y = df['Inside or outside']

# Standardize features (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE to balance the classes. The whole dataset is used here because
# this cell only inspects feature importances and evaluates nothing on held-out data.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Train Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_resampled, y_resampled)

# Importances come back in the same order as the `features` columns;
# sort them so the most important feature is drawn first.
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot feature importances.
# Passing `palette` without `hue` is deprecated in recent seaborn, so the bar
# variable is also assigned to `hue`. `dodge=False` keeps one full-width bar
# per feature, and the redundant legend is removed if seaborn created one.
plt.figure(figsize=(8, 6))
ax = sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feature_importance_df,
    palette='viridis',
    dodge=False,
)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title('Feature Importance from Random Forest (SMOTE Balanced)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig("feature_importance_rf_smote.png")
plt.show()

