In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text

In [None]:
# Load data from CSV.
df = pd.read_csv("./data/deg_data/alzheimer_data_degs.csv")

# Compute -log10 p-values.
# A p-value that underflowed to exactly 0 would yield -log10(0) = +inf (and a
# divide-by-zero RuntimeWarning); infinite values cannot be placed on a plot
# axis. Clamp to the smallest positive normal float (~2.2e-308, i.e. ~307.65
# on the -log10 scale) before taking the log. The raw "pvals" column is left
# untouched so downstream p-value filters still see the original values.
min_positive_pval = np.finfo(float).tiny
df["neg_log10_pval"] = -np.log10(df["pvals"].clip(lower=min_positive_pval))

In [None]:
# Bare last expression: in a notebook this renders the DataFrame as the cell output.
df

In [None]:
# Label every row as "UP", "DOWN" or "NS" (the default) in "diffexpressed".
# A row is "UP"/"DOWN" only when it passes both the fold-change cutoff
# (logfoldchanges > 1 or < -1) and the p-value cutoff (pvals < 0.001).
passes_pval = df["pvals"] < 0.001
is_up = (df["logfoldchanges"] > 1) & passes_pval
is_down = (df["logfoldchanges"] < -1) & passes_pval

df["diffexpressed"] = "NS"
df.loc[is_up, "diffexpressed"] = "UP"
df.loc[is_down, "diffexpressed"] = "DOWN"

# Both rankings use the same keys: significance first, fold change as tie-breaker.
rank_keys = ["neg_log10_pval", "logfoldchanges"]

# Top 20 "DOWN" rows: largest -log10 p-value first, then most negative fold change.
top_downregulated = (
    df[df["diffexpressed"] == "DOWN"]
    .sort_values(by=rank_keys, ascending=[False, True])
    .head(20)
)

# Top 81 "UP" rows: largest -log10 p-value first, then most positive fold change.
top_upregulated = (
    df[df["diffexpressed"] == "UP"]
    .sort_values(by=rank_keys, ascending=[False, False])
    .head(81)
)

# Gather the selected gene names and keep every row of df whose gene is among them.
top_genes_combined = pd.concat([top_downregulated["genes"], top_upregulated["genes"]])
is_top_gene = df["genes"].isin(top_genes_combined)
df_annotated = df[is_top_gene]

In [None]:
# Create volcano plot: logfoldchanges (x) against -log10 p-value (y),
# coloured by the "diffexpressed" label.

# Cutoffs for the dashed guide lines. These mirror the values used to assign
# the "UP"/"DOWN" labels (logfoldchanges beyond +/-1 and pvals < 0.001), so the
# guides mark exactly where the point colours change. Drawing the horizontal
# guide at p = 0.05 would leave grey "NS" points above the line.
pval_cutoff = 0.001
lfc_cutoff = 1

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x="logfoldchanges",
    y="neg_log10_pval",
    hue="diffexpressed",
    palette={"UP": "#bb0c00", "DOWN": "#00AFBB", "NS": "grey"},
    alpha=0.7,
    edgecolor=None,
    ax=ax,
)

# Add threshold lines
ax.axhline(y=-np.log10(pval_cutoff), color='gray', linestyle='dashed')
ax.axvline(x=-lfc_cutoff, color='gray', linestyle='dashed')
ax.axvline(x=lfc_cutoff, color='gray', linestyle='dashed')

# Labels and formatting
# Fixed axis limits: points outside these ranges are not shown.
ax.set_xlim(-9, 9)
ax.set_ylim(0, 320)
ax.set_xlabel("log2 Fold Change", fontsize=14)
ax.set_ylabel("-log10 p-value", fontsize=14)
ax.set_title("Volcano of DEGs (Disease vs Control)", fontsize=16)
ax.legend(title="Expression", loc="upper right")

plt.show()