In [None]:
######################################################################
## 03_ANALYSIS: Calculating CpG o/e Ratio and Average Methylation - PYTHON
######################################################################

# This notebook computes the CpG observed/expected ratio (CpG o/e) for gene bodies (CDS) and promoters, and overlaps each feature set with WGBS methylation data. These steps are done in Python before moving to R.

### Note: any block whose title starts with "SCRIPT" or "ON COMMAND LINE" should be run in a terminal, not in the notebook.

In [None]:
######################################################################
## BLOCK 1. Setup & Imports
######################################################################

import pandas as pd
import re
import numpy as np
from pathlib import Path
from Bio import SeqIO
import pyranges as pr

In [None]:
######################################################################
## BLOCK 2. Define Paths & Sample Names
######################################################################

# FASTA & GFF inputs
# Both NCBI files live in the same assembly directory, so it is spelled once.
genome_dir   = Path("~/Mytilus/genome").expanduser()
ncbi_dir     = genome_dir / "ncbi_dataset" / "data" / "GCF_021869535.1"
cds_fasta    = ncbi_dir / "cds_from_genomic.fna"
genome_fasta = genome_dir / "mytilus_californianus_genome.fasta"
gff_file     = ncbi_dir / "genomic.gff"

# WGBS methylation coverage (4× filtered, SNP-removed)
# Every coverage file shares the same directory and suffix; only the library
# prefix differs. Keeping sample name -> prefix in ONE ordered mapping means
# `sample_names` and `file_paths` can never drift out of step with each other.
wgbs_dir    = "/home/local/ADS/lsh/Mytilus-WGS/snps-removed2"
wgbs_suffix = "_1_val_1_bismark_bt2_pe._merged.cov_4x.tab.gz"
sample_prefixes = {
    'M44-9-1yr':  "44-mc9a",
    'M45-11-5yr': "45-mc11a",
    'M46-14-4yr': "46-mc14a",
    'M47-15-6yr': "47-mc15a",
    'M48-18-8yr': "48-mc18a",
    'M49-19-8yr': "49-mc19a",
    'M50-24-1yr': "50-mc24a",
}

# Parallel lists consumed by the loading cell (dicts preserve insertion order).
sample_names = list(sample_prefixes)
file_paths = [
    f"{wgbs_dir}/{prefix}{wgbs_suffix}" for prefix in sample_prefixes.values()
]

In [None]:
######################################################################
## BLOCK 3. Load & Filter WGBS Methylation
######################################################################

# Minimum read depth (methylated + unmethylated) for a CpG site to be kept.
MIN_COVERAGE = 4
cov_columns = ["chrom","start","end","meth_percent","methylated","unmethylated"]

# zip() stops at the shorter list, which would silently drop samples.
if len(file_paths) != len(sample_names):
    raise ValueError(
        f"{len(file_paths)} coverage files but {len(sample_names)} sample names"
    )

dfs = []
for fp, name in zip(file_paths, sample_names):
    df = pd.read_csv(fp, sep="\t", compression="gzip", header=None,
                     names=cov_columns)
    df["sample"] = name

    # Coerce positions as well as counts: a stray header/text row would
    # otherwise leave "start"/"end" as strings while the counts become NaN.
    for col in ("start", "end", "methylated", "unmethylated"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["total_reads"] = df["methylated"] + df["unmethylated"]

    # NaN counts give a NaN total, which fails the >= test and is dropped.
    keep = (
        (df["total_reads"] >= MIN_COVERAGE)
        & df["start"].notna()
        & df["end"].notna()
    )
    # .copy() so the column assignment below works on an independent frame
    # rather than a slice of the raw table (avoids SettingWithCopyWarning).
    df = df.loc[keep].copy()
    df = df.astype({"start": "int64", "end": "int64"})

    # Recompute the percentage from the counts instead of trusting the file.
    df["meth_percent"] = 100.0 * df["methylated"] / df["total_reads"]
    dfs.append(df)

all_methyl = pd.concat(dfs, ignore_index=True)
all_methyl = all_methyl.rename(columns={"chrom":"Chromosome","start":"Start","end":"End"})
print("WGBS loaded:", all_methyl.shape)
print(all_methyl["meth_percent"].describe())

In [None]:
######################################################################
## BLOCK 4. Compute CpG o/e for CDS
######################################################################

# One row per CDS record: its FASTA id and its CpG observed/expected ratio,
#   o/e = (#CG / (#C * #G)) * (L^2 / (L - 1))
# The ratio is NaN when the sequence has no C, no G, or fewer than 2 bases.
cds_oe = []
for record in SeqIO.parse(cds_fasta, "fasta"):
    bases = str(record.seq).upper()
    n_cpg = bases.count("CG")
    n_c = bases.count("C")
    n_g = bases.count("G")
    n_bases = len(bases)
    if n_c > 0 and n_g > 0 and n_bases > 1:
        ratio = (n_cpg/(n_c*n_g))*((n_bases**2)/(n_bases-1))
    else:
        ratio = float("nan")
    cds_oe.append({"cds_id": record.id, "cpg_oe": ratio})

cpg_oe_cds_df = pd.DataFrame(cds_oe)
print("CDS CpG o/e:", cpg_oe_cds_df.shape)
cpg_oe_cds_df.head()

In [None]:
######################################################################
## BLOCK 5. Parse CDS Metadata
######################################################################

# Parse chromosome, genomic span and gene ID out of each CDS FASTA header.
#
# The header is expected to carry bracketed fields such as
#   [gene=...] [db_xref=GeneID:...] [location=...]
# (presumably the NCBI cds_from_genomic.fna layout — confirm against the file).
# The location field may be a plain "a..b", or wrapped in complement(...) /
# join(...), and may carry "<" / ">" partial markers. Matching only a leading
# digit after "location=" silently discards every minus-strand, multi-exon and
# partial CDS, so all "a..b" segments inside the location field are read here.
chrom_re    = re.compile(r"lcl\|([^_]+_[^_]+\.\d+)")     # Chromosome ID
location_re = re.compile(r"\[location=([^\]]+)\]")        # whole location field
segment_re  = re.compile(r"<?(\d+)\.\.>?(\d+)")           # one a..b segment
gene_re     = re.compile(r"\[gene=([^\]]+)\]")            # gene symbol
geneid_re   = re.compile(r"GeneID:(\d+)")                 # numeric GeneID xref

cds_meta = []
n_skipped = 0

for rec in SeqIO.parse(cds_fasta, "fasta"):
    h  = rec.description
    cm = chrom_re.search(h)
    lm = location_re.search(h)
    segments = segment_re.findall(lm.group(1)) if lm else []

    # Numeric gene ID: prefer the explicit GeneID cross-reference; fall back to
    # the trailing digits of the gene symbol (e.g. "LOC123" -> "123").
    xref = geneid_re.search(h)
    if xref:
        gene_id = xref.group(1)
    else:
        gid = gene_re.search(h)
        trailing = re.search(r'(\d+)$', gid.group(1)) if gid else None
        gene_id = trailing.group(1) if trailing else None

    if not (cm and segments and gene_id):
        n_skipped += 1
        continue

    # Span from the first to the last coding base, regardless of strand.
    # NOTE(review): for multi-exon CDS this span includes the introns.
    positions = [int(p) for seg in segments for p in seg]

    cds_meta.append({
        "cds_id": rec.id,
        "Chromosome": cm.group(1),
        "Start": min(positions),
        "End":   max(positions),
        "gene_id": gene_id
    })

cds_df = pd.DataFrame(cds_meta)
print("Parsed CDS metadata:", cds_df.shape)
print("Skipped CDS records (unparseable header):", n_skipped)
cds_df.head()

In [None]:
######################################################################
## BLOCK 6. Overlap CDS & Compute Avg Methylation → Save
######################################################################

# PyRanges intervals are half-open [Start, End): a CpG at position p must be
# [p, p+1) to be 1 bp wide (End == Start would be a zero-width interval), and
# an inclusive CDS end E must become E + 1 so the last coding base is covered.
# With both adjustments a site overlaps a CDS exactly when Start <= p <= End.
# NOTE(review): assumes WGBS positions and CDS coordinates share the same
# (1-based) origin — confirm for the SNP-filtered coverage files.
meth_df = all_methyl.copy()
meth_df["End"] = meth_df["Start"] + 1

cds_intervals = cds_df[["Chromosome", "Start", "End", "cds_id", "gene_id"]].assign(
    End=lambda d: d["End"] + 1
)

# Build PyRanges objects
meth_ranges = pr.PyRanges(meth_df[["Chromosome", "Start", "End", "meth_percent"]])
cds_ranges  = pr.PyRanges(cds_intervals)

# Join CpGs to CDS intervals
cds_over = meth_ranges.join(cds_ranges)
cds_over_df = cds_over.df
if cds_over_df.empty:
    # Usually a chromosome-naming mismatch between the WGBS files and the FASTA.
    raise ValueError("No CpG site overlaps any CDS interval; check that "
                     "chromosome names match between the two inputs.")

# Average methylation per CDS (pooled over all samples and CpG sites)
cds_meth_avg = (
    cds_over_df
    .groupby(["cds_id", "gene_id"], as_index=False)["meth_percent"]
    .mean()
    .rename(columns={"meth_percent": "avg_methylation"})
)

# Attach CpG o/e; the inner merge keeps only CDS that have both values
merged_cds = pd.merge(cpg_oe_cds_df, cds_meth_avg, on="cds_id", how="inner")

# Write out final CDS methylation file
out1 = Path("~/Mytilus/genome/cpg_methylation_merged.tsv").expanduser()
merged_cds.to_csv(out1, sep="\t", index=False)
print("Merged CDS:", merged_cds.shape)
merged_cds.head()

In [None]:
######################################################################
## BLOCK 7. Define Promoters & Clamp to Scaffold Lengths
######################################################################

# Read the annotation as a headerless 9-column table, skipping '#' comments,
# then keep the gene and mRNA features and pull the numeric GeneID out of the
# attribute column (NaN where no "GeneID:<digits>" is present).
gff_columns = ["seqid","src","type","start","end","score","strand","phase","attr"]
gff = pd.read_csv(gff_file, sep="\t", comment="#", header=None, names=gff_columns)

is_gene_like = gff["type"].isin(["gene","mRNA"])
genes = gff.loc[is_gene_like].copy()
genes["gene_id"] = genes["attr"].str.extract(r"GeneID:([0-9]+)", expand=False)

def get_promoter(row, upstream=1000):
    """Return the promoter window immediately upstream of a feature.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide "strand", "start" and "end".
    upstream : int, default 1000
        Width of the promoter window in bases.

    Returns
    -------
    pandas.Series
        "promoter_start" and "promoter_end". For "+" features the window is
        the `upstream` bases before "start"; for any other strand value it is
        the `upstream` bases after "end". "promoter_start" is floored at 1;
        "promoter_end" is returned unclamped, so it can be smaller than
        "promoter_start" when a "+" feature begins at position 1.
    """
    if row["strand"] == "+":
        ps, pe = row["start"] - upstream, row["start"] - 1
    else:
        # Every non-"+" strand value (including "." / "?") takes this branch.
        ps, pe = row["end"] + 1, row["end"] + upstream
    return pd.Series({
        "promoter_start": max(1, ps),
        "promoter_end": pe
    })

# Promoter window for every gene/mRNA row, kept alongside its identifiers.
prom_coords = genes.apply(get_promoter, axis=1)
prom_df = pd.concat([genes[["seqid","strand","gene_id"]], prom_coords], axis=1)
prom_df = prom_df.rename(columns={"seqid":"Chromosome","gene_id":"promoter_id"})

# clamp to scaffold lengths
scf_len = {r.id: len(r.seq) for r in SeqIO.parse(genome_fasta, "fasta")}
unknown = sorted(set(prom_df["Chromosome"]) - set(scf_len))
if unknown:
    # Fail with the offending names instead of a bare KeyError mid-apply.
    raise KeyError(f"GFF sequences missing from genome FASTA: {unknown[:5]}")
chrom_len = prom_df["Chromosome"].map(scf_len)
prom_df["Start"] = np.minimum(prom_df["promoter_start"], chrom_len)
prom_df["End"]   = np.minimum(prom_df["promoter_end"],   chrom_len)

# Drop rows that cannot yield a real promoter:
#   * no GeneID could be extracted (no usable identifier),
#   * the window begins beyond the scaffold end (feature abuts the scaffold
#     end; clamping would otherwise leave a 1-bp window on the scaffold's
#     last base),
#   * the window ends before it starts ("+" feature starting at position 1).
is_valid = (
    prom_df["promoter_id"].notna()
    & (prom_df["promoter_start"] <= chrom_len)
    & (prom_df["End"] >= prom_df["Start"])
)
n_dropped = int((~is_valid).sum())
if n_dropped:
    print("Promoters dropped (no ID or no upstream sequence):", n_dropped)
prom_df = prom_df.loc[is_valid].copy()

# write BED for external extraction
# prom_df keeps 1-based inclusive Start/End; BED starts are 0-based, so the
# start is shifted by one here to avoid losing the first promoter base.
bed = prom_df[["Chromosome","Start","End","promoter_id"]].copy()
bed["Start"] = bed["Start"] - 1
bed.to_csv("promoters.bed", sep="\t", index=False, header=False)
print("Promoters prepared:", prom_df.shape)
prom_df.head()

In [None]:
######################################################################
## BLOCK 8. Extract Promoter Sequences
######################################################################

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Index the genome by record id so each promoter is a simple slice.
genome_dict = SeqIO.to_dict(SeqIO.parse(genome_fasta, "fasta"))

# prom_df Start/End are used as 1-based inclusive coordinates; Python slices
# are 0-based with an exclusive end, hence [Start - 1 : End].
promoter_records = [
    SeqRecord(
        genome_dict[prom.Chromosome].seq[prom.Start - 1 : prom.End],
        id=prom.promoter_id,
        description="",
    )
    for prom in prom_df.itertuples(index=False)
]

# One FASTA entry per promoter row, named by promoter_id.
SeqIO.write(promoter_records, "promoter_seqs.fasta", "fasta")
print(f"Extracted {len(promoter_records)} promoter sequences.")

In [None]:
######################################################################
## SCRIPT: Extract Promoter Sequences with bedtools getfasta
######################################################################

# Run in a terminal, not in the notebook kernel.
# Extracts the intervals listed in promoters.bed from the genome FASTA and
# writes them to promoter_seqs.fasta inside ~/Jupyter_Mytilus.
# NOTE(review): BLOCK 8 also writes a file named promoter_seqs.fasta; if the
# notebook's working directory is ~/Jupyter_Mytilus this command overwrites it.
# NOTE(review): bedtools reads BED starts as 0-based; confirm promoters.bed was
# written with that convention.
cd ~/Jupyter_Mytilus
conda activate bedtools
# -nameOnly: per the bedtools docs, use the BED name column as the FASTA header.
bedtools getfasta \
  -fi ~/Mytilus/genome/mytilus_californianus_genome.fasta \
  -bed promoters.bed \
  -fo promoter_seqs.fasta \
  -nameOnly

In [None]:
######################################################################
## BLOCK 9. Compute CpG o/e for Promoters
######################################################################

# CpG observed/expected for every promoter sequence in promoter_seqs.fasta:
#   o/e = (#CG / (#C * #G)) * (L^2 / (L - 1))
# Sequences with no C, no G, or fewer than 2 bases get NaN.
prom_oe = []
for record in SeqIO.parse("promoter_seqs.fasta", "fasta"):
    bases = str(record.seq).upper()
    n_cpg, n_c, n_g = (bases.count(motif) for motif in ("CG", "C", "G"))
    n_bases = len(bases)

    ratio = float("nan")
    if n_c > 0 and n_g > 0 and n_bases > 1:
        ratio = (n_cpg/(n_c*n_g)) * ((n_bases**2)/(n_bases-1))

    prom_oe.append({"promoter_id": record.id, "cpg_oe": ratio})

prom_oe_df = pd.DataFrame(prom_oe)
print("Promoter CpG o/e:", prom_oe_df.shape)
prom_oe_df.head()

In [None]:
######################################################################
## BLOCK 10. Overlap Promoter Methylation & Merge → Save
######################################################################

# PyRanges intervals are half-open [Start, End). Each CpG site is therefore
# given the 1-bp interval [Start, Start + 1), and the inclusive promoter end is
# extended by one so the last promoter base is covered.
# NOTE(review): assumes WGBS positions and promoter coordinates share the same
# (1-based) origin — confirm for the SNP-filtered coverage files.

# 10a) Build PyRanges for WGBS methylation (one 1-bp interval per CpG site)
meth_sites = all_methyl[["Chromosome", "Start", "meth_percent"]].assign(
    End=lambda d: d["Start"] + 1
)
meth_ranges = pr.PyRanges(
    meth_sites[["Chromosome", "Start", "End", "meth_percent"]]
)

# 10b) Build PyRanges for promoters: select only Chromosome/Start/End/promoter_id
# and rename 'promoter_id' → 'Name' so PyRanges sees the ID column correctly.
prom_ranges = pr.PyRanges(
    prom_df[["Chromosome", "Start", "End", "promoter_id"]]
        .rename(columns={"promoter_id": "Name"})
        .assign(End=lambda d: d["End"] + 1)
)

# 10c) Join and compute mean methylation per promoter
overlap_prom = meth_ranges.join(prom_ranges)
overlap_prom_df = overlap_prom.df
if overlap_prom_df.empty:
    # Usually a chromosome-naming mismatch between the WGBS files and the GFF.
    raise ValueError("No CpG site overlaps any promoter interval; check that "
                     "chromosome names match between the two inputs.")
prom_meth_avg = (
    overlap_prom_df
    .groupby("Name", as_index=False)["meth_percent"]
    .mean()
    .rename(columns={"Name": "promoter_id", "meth_percent": "avg_methylation"})
)

# 10d) Merge with promoter CpG o/e and save
# The inner merge keeps only promoters that have both a CpG o/e value and at
# least one overlapping CpG site.
merged_prom = pd.merge(
    prom_oe_df, prom_meth_avg,
    on="promoter_id", how="inner"
)
merged_prom.to_csv(
    "promoter_cpg_methylation_merged.tsv",
    sep="\t", index=False
)

print("Merged promoters:", merged_prom.shape)
merged_prom.head()

In [None]:
######################################################################
## BLOCK 11. Filter & Save CDS Table
######################################################################

# Keep only CDS whose pooled average methylation is strictly positive
# (rows with NaN averages fail the comparison and are dropped as well).
has_methylation = merged_cds["avg_methylation"] > 0
filtered_cds = merged_cds.loc[has_methylation].copy()
print("Filtered CDS:", filtered_cds.shape)

# Persist every column of the filtered table as a tab-separated file.
out_cds = Path("~/Mytilus/genome/cpg_methylation_filtered.tsv").expanduser()
filtered_cds.to_csv(out_cds, sep="\t", index=False)
print(f"Wrote filtered CDS table to {out_cds}")

In [None]:
######################################################################
## BLOCK 12. Filter & Save Promoter Table
######################################################################

# Step 1: Filter to only promoter_ids that match gene_ids in CDS
cds_ids = set(cds_df["gene_id"])
filtered_prom = merged_prom[merged_prom["promoter_id"].isin(cds_ids)].copy()

# Step 2: Drop zero methylation (optional)
filtered_prom = filtered_prom[filtered_prom["avg_methylation"] > 0]

# Step 3: Drop any remaining duplicates by promoter_id (1 per gene)
# Rows sharing a promoter_id carry the same avg_methylation (it is computed per
# promoter_id), so the sort cannot rank them against each other. A stable sort
# keeps tied rows in their existing order, so the row kept by drop_duplicates
# is always the first one in merged_prom rather than an artefact of the
# sorting algorithm.
filtered_prom = filtered_prom.sort_values(
    "avg_methylation", ascending=False, kind="mergesort"
)
filtered_prom = filtered_prom.drop_duplicates("promoter_id")

# Step 4: Save to file
out_prom = Path("~/Mytilus/genome/promoter_cpg_methylation_filtered.tsv").expanduser()
filtered_prom.to_csv(out_prom, sep="\t", index=False)

print("Filtered & deduplicated promoters:", filtered_prom.shape)
print(f"Wrote filtered promoter table to {out_prom}")