In [None]:
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Plot styling: apply the matplotlib style sheets, switch seaborn to its
# "paper" context, and register a custom six-colour palette as the default.
plt.style.use(["seaborn-v0_8-white", "seaborn-v0_8-paper"])
sns.set_context("paper")

# Stored as a NumPy array so that the palette can be indexed with an array
# of positions (the volcano plot looks colours up by cluster id).
colors = np.asarray([
    "#9e0059",
    "#6da7de",
    "#ee266d",
    "#dee000",
    "#eb861e",
    "#63c5b5",
])
sns.set_palette(colors)

### Convert gene names to UniProt ids

In [None]:
# Collect the gene names listed in the FASTA file: the third
# whitespace-separated token of every ">" header line.
# NOTE(review): `gene_names` is not used by any of the cells shown here.
with open(
    "../data/external/crocosphaera_watsonii_wh_8501_200811.fasta"
) as f_in:
    gene_names = {
        line.split()[2] for line in f_in if line.startswith(">")
    }

# Build a {gene name: UniProt entry} lookup from the UniProt export. The
# "Gene names" field holds whitespace-separated names, so it is split and
# exploded to one row per individual gene name.
gene_to_uniprot = (
    pd.read_csv(
        "../data/external/crocosphaera_watsonii_wh_8501_uniprot_20210912.tab",
        sep="\t",
        usecols=["Entry", "Gene names"],
    )
    .assign(**{"Gene names": lambda df: df["Gene names"].str.split()})
    .explode("Gene names")
    # Entries without any gene name would otherwise end up under a NaN key,
    # which could be matched by rows whose gene name is missing when this
    # dictionary is later used with `Series.map`.
    .dropna(subset=["Gene names"])
    # Select the column explicitly instead of `.squeeze()`, which collapses
    # a one-row table to a scalar and would break `.to_dict()`.
    .set_index("Gene names")["Entry"]
    .to_dict()
)

### Load protein abundance data

Impute missing values by half of the minimum measured abundance.

In [None]:
# Abundance columns per sample: time points T1-T4 and T13-T16 are grouped as
# "day", T5-T12 as "night".
col_day = [
    f"151222_WH8501diel_T{i}_2ug" for i in [*range(1, 5), *range(13, 17)]
]
col_night = [f"151222_WH8501diel_T{i}_2ug" for i in range(5, 13)]

# Read only the protein description, the molecular weight and the sample
# columns; abundances use "," as the thousands separator.
protein_abundances = pd.read_csv(
    "../data/processed/160214_Crocodiel_Full_rawdata_noheader_fig_may7annotation.csv",
    usecols=["Identified Proteins (1170)", "Molecular Weight"]
    + col_day
    + col_night,
    thousands=",",
)

# The gene name is taken as the second whitespace-separated token of the
# protein description and translated to its UniProt id.
protein_abundances["uniprot_id"] = (
    protein_abundances["Identified Proteins (1170)"]
    .str.split()
    .str.get(1)
    .map(gene_to_uniprot)
)

# Keep only rows with a known molecular weight, index by UniProt id, and drop
# rows that repeat an earlier row.
# NOTE(review): `drop_duplicates()` runs after `set_index`, so it compares the
# abundance values only (not the id) — confirm this is intended.
has_mol_weight = protein_abundances["Molecular Weight"] != "?"
protein_abundances = (
    protein_abundances.loc[has_mol_weight]
    .drop(columns=["Identified Proteins (1170)", "Molecular Weight"])
    .sort_values("uniprot_id")
    .set_index("uniprot_id")
    .drop_duplicates()
)

# Zero abundances are treated as missing observations and imputed with half
# of the smallest abundance measured anywhere in the table.
protein_abundances = protein_abundances.replace(0, np.nan)
min_non_zero = protein_abundances.min().min() / 2
protein_abundances = protein_abundances.fillna(min_non_zero)

### Protein differences between night and day

- Log2 fold change of the mean protein abundance during the day relative to the night, i.e. log2(day / night).
- P-value from a t-test between the individual measurements during the night and during the day. P-values are corrected for multiple testing using the Benjamini-Hochberg procedure.

In [None]:
# Mean abundance of every protein over the day samples and over the night
# samples, stored as the "day" and "night" columns.
for period, sample_cols in (("day", col_day), ("night", col_night)):
    protein_abundances[period] = protein_abundances[sample_cols].mean(axis=1)

# log2(day / night): positive values mean a higher mean abundance during the
# day, negative values a higher mean abundance during the night.
day_to_night_ratio = protein_abundances["day"].div(protein_abundances["night"])
protein_abundances["log2fc"] = np.log2(day_to_night_ratio)

In [None]:
def _day_night_pvalue(row):
    """Return the independent two-sample t-test p-value (day vs. night) for one protein row."""
    return stats.ttest_ind(row[col_day], row[col_night]).pvalue


# Per-protein p-values between the day and the night measurements. Warnings
# raised while testing are silenced for the duration of the computation.
with warnings.catch_warnings(action="ignore"):
    raw_pvalues = protein_abundances.apply(_day_night_pvalue, axis=1)

# Undefined (NaN) p-values are counted as non-significant (p = 1) before the
# Benjamini-Hochberg correction; the "pvalue" column holds the corrected values.
protein_abundances["pvalue"] = stats.false_discovery_control(
    raw_pvalues.fillna(1), method="bh"
)
protein_abundances["-log10pvalue"] = -np.log10(protein_abundances["pvalue"])

In [None]:
# Export significant proteins (and include cluster assignments).
cluster_ids = pd.read_csv("cluster_ids.csv", index_col="uniprot_id")

# Inverse lookup UniProt id -> gene name. If several gene names share one
# UniProt id, the last one encountered wins.
uniprot_to_gene = {
    uniprot_id: gene_name for gene_name, uniprot_id in gene_to_uniprot.items()
}

# A protein is significant when its abundance changes more than two-fold
# (|log2fc| > 1) and its corrected p-value is below 0.05.
is_signif = (protein_abundances["log2fc"].abs() > 1) & (
    protein_abundances["pvalue"] < 0.05
)
# Take an explicit copy so that adding a column below modifies an independent
# frame instead of a slice of `protein_abundances` (avoids pandas'
# SettingWithCopyWarning).
signif_proteins = protein_abundances.loc[is_signif, ["log2fc", "pvalue"]].copy()
signif_proteins["gene_name"] = signif_proteins.index.map(uniprot_to_gene)

# Left join on the UniProt id index: proteins without a cluster assignment
# keep a missing `cluster_id`.
signif_proteins = signif_proteins.join(cluster_ids).sort_values(
    ["cluster_id", "gene_name"]
)
col_order = ["gene_name", "cluster_id", "log2fc", "pvalue"]
signif_proteins[col_order].to_csv("signif_proteins.csv")

### Volcano plot

In [None]:
# Figure 3.5 wide with a golden-ratio aspect (figsize units).
width = 3.5
height = width / 1.618
fig, ax = plt.subplots(figsize=(width, height))

# All proteins in black as the background layer.
ax.scatter(
    protein_abundances["log2fc"],
    protein_abundances["-log10pvalue"],
    marker=".",
    c="black",
)

# Significant proteins drawn on top, coloured by cluster.
# NOTE(review): assumes `cluster_id` holds integers 1..len(colors) with no
# missing values — confirm against cluster_ids.csv.
ax.scatter(
    signif_proteins["log2fc"],
    -np.log10(signif_proteins["pvalue"]),
    marker=".",
    c=colors[signif_proteins["cluster_id"] - 1],
)

# Highlight significance thresholds (p < 0.05 and |log2fc| > 1), drawn on the
# explicit axes rather than through the pyplot state machine.
ax.axhline(y=-np.log10(0.05), c="darkgray", ls="--")
ax.axvline(x=1, c="darkgray", ls="--")
ax.axvline(x=-1, c="darkgray", ls="--")

ax.set_xlabel("log2(fold change day to night)")
# The plotted quantity is the negative log10 of the p-value.
ax.set_ylabel("-log10(p-value)")

# Symmetric x-axis around zero; y-axis starts at zero.
xlim = np.abs(ax.get_xlim()).max()
ax.set_xlim(-xlim, xlim)
ax.set_ylim(0, ax.get_ylim()[1])

fig.savefig("signif_proteins.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close(fig)