In [None]:
######################################################################
## 01_ANALYSIS: Genomic data processing and SNP identification
######################################################################

### Note: any block whose title contains "SCRIPT", "IN TERMINAL", or "ON COMMAND LINE" must be run in a terminal (or submitted with sbatch), not executed inside the notebook

In [None]:
######################################################################
## 1: SCRIPT: Trimming
######################################################################

# Adapter/quality trimming of every paired-end library with Trim Galore.
# Paste into an interactive terminal (conda must already be initialised).
# Inputs : <raw dir>/<sample>/<sample>_{1,2}.fastq.gz for each entry in DIRS
# Outputs: trimmed reads and reports written to $TRIM_DIR
conda activate trim-galore

TRIM_GALORE_DIR=/home/local/ADS/lsh/anaconda3/envs/trim-galore/bin
TRIM_DIR=/home/local/ADS/lsh/Mytilus-WGS/trimmed
RAW_DIR=/home/local/ADS/lsh/Yi_Lab/Lab_Archives/Raw_Seq_Data/20240501_Psomagen_mussel_pilot

# One directory per sample; the directory name doubles as the FASTQ prefix.
DIRS=(
  "$RAW_DIR/mc9a"
  "$RAW_DIR/mc11a"
  "$RAW_DIR/mc14a"
  "$RAW_DIR/mc15a"
  "$RAW_DIR/mc18a"
  "$RAW_DIR/mc19a"
  "$RAW_DIR/mc24a"
)

# Collect R1/R2 pairs in an array (not a flat string) so that every path is
# passed as exactly one argument, even if it contains spaces or glob characters.
trim_inputs=()
missing=0
for dir in "${DIRS[@]}"; do
  s=$(basename "$dir")
  r1="${dir}/${s}_1.fastq.gz"
  r2="${dir}/${s}_2.fastq.gz"
  if [[ ! -s "$r1" || ! -s "$r2" ]]; then
    echo "[ERR] Missing or empty FASTQ pair for ${s}: ${r1} / ${r2}" >&2
    missing=1
    continue
  fi
  trim_inputs+=("$r1" "$r2")
done

# Only launch when every pair was found; no `exit` here because this block is
# meant to be pasted into an interactive shell.
if (( missing == 0 )); then
  mkdir -p "$TRIM_DIR"
  # --paired is a boolean flag: given once, the files that follow are consumed
  # pairwise (R1 R2 R1 R2 ...).
  "$TRIM_GALORE_DIR/trim_galore" \
    --path_to_cutadapt "$TRIM_GALORE_DIR/cutadapt" \
    --output_dir "$TRIM_DIR" \
    --paired "${trim_inputs[@]}"
else
  echo "[ERR] Trimming not started: fix the missing inputs listed above." >&2
fi

In [None]:
######################################################################
## 2: SCRIPT: Bowtie2 mapping, mark-dup, index (array SLURM)
######################################################################
## (SLURM SCRIPT: wgs_bowtie2_map.slurm)

#!/usr/bin/env bash
#SBATCH --job-name=wgs_map_bt2
#SBATCH --partition=debug
#SBATCH --cpus-per-task=24
#SBATCH --mem=64G
#SBATCH --time=1-00:00:00
#SBATCH --output=logs/wgs_bt2.%A_%a.out
#SBATCH --error=logs/wgs_bt2.%A_%a.err

# Map one sample's trimmed read pair with Bowtie2, then fixmate -> coordinate
# sort -> remove duplicates -> index -> flagstat.
#
# Sample selection:
#   * array job : line number $SLURM_ARRAY_TASK_ID (1-based) of $SAMPLES_FILE
#   * single run: export SAMPLE=<name> before launching
# Output: $MAPDIR/<sample>.mkdup.bam (+ .bai, .flagstat.txt, .bt2.log)

set -euo pipefail
BISM="/home/local/ADS/lsh/anaconda3/envs/bismark/bin"
BOWTIE2="$BISM/bowtie2"
SAMTOOLS="$BISM/samtools"

# Fall back to a small thread count when run outside SLURM (e.g. SAMPLE=... bash script),
# where SLURM_CPUS_PER_TASK is unset and would otherwise trip `set -u`.
THREADS="${SLURM_CPUS_PER_TASK:-4}"

SAMPLES_FILE="$HOME/Mytilus/m-samples.txt"
BT2_PREFIX="$HOME/Mytilus/genome/bt2/mytilus_californianus_genome"
MAPDIR="$HOME/Mytilus-WGS/mapping_bt2"
mkdir -p "$MAPDIR" "$MAPDIR/tmp"

if [[ -z "${SAMPLE:-}" ]]; then
  : "${SLURM_ARRAY_TASK_ID:?Array index required or set SAMPLE=...}"
  [[ -s "$SAMPLES_FILE" ]] || { echo "[ERR] Missing sample list: $SAMPLES_FILE" >&2; exit 1; }
  SAMPLE="$(sed -n "${SLURM_ARRAY_TASK_ID}p" "$SAMPLES_FILE")"
fi
# Tolerate Windows line endings in the sample list and reject an empty name
# (e.g. array index beyond the last line), which would otherwise yield bogus paths.
SAMPLE="${SAMPLE//$'\r'/}"
[[ -n "$SAMPLE" ]] || { echo "[ERR] Empty sample name (check array range vs. $SAMPLES_FILE)" >&2; exit 1; }

R1="$HOME/Mytilus-WGS/trimmed/${SAMPLE}_1_val_1.fq.gz"
R2="$HOME/Mytilus-WGS/trimmed/${SAMPLE}_2_val_2.fq.gz"
for fq in "$R1" "$R2"; do
  [[ -s "$fq" ]] || { echo "[ERR] Missing trimmed reads: $fq" >&2; exit 1; }
done

# Block until all six Bowtie2 index files exist and are non-empty (the index
# may still be building in another job). Waits indefinitely, as before, but
# now says what it is waiting for.
for f in 1 2 3 4 rev.1 rev.2; do
  idx="${BT2_PREFIX}.${f}.bt2"
  if [[ ! -s "$idx" ]]; then
    echo "[INFO] Waiting for Bowtie2 index file: $idx" >&2
    while [[ ! -s "$idx" ]]; do sleep 5; done
  fi
done

# Per-sample scratch directory for samtools sort spill files. It must exist:
# samtools does not create the directory part of a -T prefix.
TMPPFX="$MAPDIR/tmp_${SAMPLE}"
mkdir -p "$TMPPFX"
LOG="$MAPDIR/${SAMPLE}.bt2.log"

# Align, convert to BAM, and name-sort (fixmate needs name-grouped input).
# Read-group ID/SM are set to the sample name so downstream callers can
# attribute reads to a sample instead of falling back to a placeholder.
$BOWTIE2 -x "$BT2_PREFIX" -1 "$R1" -2 "$R2" -p "$THREADS" \
    --rg-id "$SAMPLE" --rg "SM:${SAMPLE}" --rg "PL:ILLUMINA" 2> "$LOG" \
| $SAMTOOLS view -b -@ "$THREADS" - \
| $SAMTOOLS sort -n -@ "$THREADS" -m 2G -T "${TMPPFX}/nsort" -o "$MAPDIR/${SAMPLE}.namesorted.bam" -

# fixmate -m adds the mate-score tags that markdup relies on.
$SAMTOOLS fixmate -@ "$THREADS" -m "$MAPDIR/${SAMPLE}.namesorted.bam" "$MAPDIR/${SAMPLE}.fixmate.bam"
$SAMTOOLS sort    -@ "$THREADS" -T "${TMPPFX}/csort" -o "$MAPDIR/${SAMPLE}.coordsorted.bam" "$MAPDIR/${SAMPLE}.fixmate.bam"
# -r removes duplicates rather than only flagging them.
$SAMTOOLS markdup -@ "$THREADS" -r "$MAPDIR/${SAMPLE}.coordsorted.bam" "$MAPDIR/${SAMPLE}.mkdup.bam"
$SAMTOOLS index   -@ "$THREADS" "$MAPDIR/${SAMPLE}.mkdup.bam"
$SAMTOOLS flagstat -@ "$THREADS" "$MAPDIR/${SAMPLE}.mkdup.bam" > "$MAPDIR/${SAMPLE}.flagstat.txt"

# Drop intermediates; the scratch directory is removed only if it is empty.
rm -f "$MAPDIR/${SAMPLE}.namesorted.bam" "$MAPDIR/${SAMPLE}.fixmate.bam" "$MAPDIR/${SAMPLE}.coordsorted.bam"
rmdir "$TMPPFX" 2>/dev/null || true
echo "Done: $MAPDIR/${SAMPLE}.mkdup.bam"

In [None]:
######################################################################
## 3: IN TERMINAL: Average coverage and Coverage per-base
######################################################################

# Per-base and mean coverage for every duplicate-removed BAM, via mosdepth.
# Paste into an interactive terminal.
# Outputs (per sample) in wgs_coverage_bt2/: <sample>.per-base.bed.gz,
# <sample>.mosdepth.summary.txt, and the other default mosdepth files.
conda activate wgs-coverage-env
cd ~/Mytilus-WGS
mkdir -p wgs_coverage_bt2

# Expand the glob directly instead of parsing `ls`; if nothing matches, the
# pattern stays literal and the -e test below catches it.
BAMS=(mapping_bt2/*.mkdup.bam)
if [[ ! -e "${BAMS[0]}" ]]; then
  echo "[ERR] No *.mkdup.bam files found in mapping_bt2/" >&2
else
  for b in "${BAMS[@]}"; do
    s=$(basename "$b" .mkdup.bam)
    echo "==> mosdepth (per-base) $s"
    # NOTE: **NO -n** so that *.per-base.bed.gz is produced
    mosdepth --fast-mode --threads 8 "wgs_coverage_bt2/${s}" "$b"
  done
fi

# Genome-wide mean depth per sample. The summary file has one row per contig
# followed by a "total" row; row 2 is only the first contig, so select the
# "total" row explicitly (column 4 = mean).
echo -e "Sample\tMeanDepth"
for f in wgs_coverage_bt2/*.mosdepth.summary.txt; do
  [[ -e "$f" ]] || continue
  s=$(basename "$f" .mosdepth.summary.txt)
  awk -v s="$s" '$1=="total" {print s "\t" $4}' "$f"
done

# Sanity: ensure the per-base files now exist
ls -1 wgs_coverage_bt2/*.per-base.bed.gz

In [None]:
######################################################################
## 4A: SCRIPT: SNP calling
## This calls variants (bcftools) from the new mapping_bt2/*.mkdup.bam, indexes missing .bai, and emits both stringent (for π/θ/Ne) and relaxed VCFs.
######################################################################

#!/usr/bin/env bash
#SBATCH --job-name=bcf_call_bt2
#SBATCH --partition=debug
#SBATCH --cpus-per-task=16
#SBATCH --mem=64G
#SBATCH --time=24:00:00
#SBATCH --output=/home/local/ADS/lsh/Mytilus-WGS/logs/bcf_call_bt2.%j.out
#SBATCH --error=/home/local/ADS/lsh/Mytilus-WGS/logs/bcf_call_bt2.%j.err

# Joint SNP calling (bcftools mpileup | call) across all mapping_bt2/*.mkdup.bam.
# Outputs in $WD/snps:
#   bcftools.snps.raw.vcf.gz       all variant SNP sites (multiallelic allowed)
#   bcftools.popgen.tags.vcf.gz    biallelic SNPs with MAF/F_MISSING/AC/AN/NS tags (relaxed set)
#   bcftools.popgen.strict.vcf.gz  tagged set filtered to MAF>=0.05 and F_MISSING<=0.20
#   *.stats.txt                    bcftools stats for the raw and strict sets

set -euo pipefail

# ========= paths =========
WD=/home/local/ADS/lsh/Mytilus-WGS
REF=/home/local/ADS/lsh/Mytilus/prepped_genome/mytilus_californianus_genome.fasta
BAMDIR="$WD/mapping_bt2"
OUTDIR="$WD/snps"
LOGDIR="$WD/logs"
mkdir -p "$OUTDIR" "$LOGDIR"

THREADS="${SLURM_CPUS_PER_TASK:-4}"

# ========= env =========
# bcftools-env should contain bcftools, samtools, tabix (+fill-tags plugin)
# ~/.bashrc and conda's activate scripts may reference unset variables or
# return non-zero, so strict mode is relaxed only around environment setup.
set +eu
source ~/.bashrc
eval "$(conda shell.bash hook)"
conda activate bcftools-env
set -eu
for tool in bcftools samtools tabix; do
  command -v "$tool" >/dev/null || { echo "[ERR] $tool not on PATH after activating bcftools-env" >&2; exit 1; }
done

# ========= inputs =========
# collect all mkdup BAMs (glob order is sorted, same as `ls -1`)
BAMS=()
for bam in "${BAMDIR}"/*.mkdup.bam; do
  if [[ -e "$bam" ]]; then
    BAMS+=("$bam")
  fi
done
(( ${#BAMS[@]} > 0 )) || { echo "[ERR] No *.mkdup.bam files in $BAMDIR" >&2; exit 1; }
echo "[INFO] Found ${#BAMS[@]} BAMs:"
printf '  %s\n' "${BAMS[@]}"

# index any BAM that lacks a .bai
for bam in "${BAMS[@]}"; do
  if [[ ! -s "${bam}.bai" ]]; then
    echo "[INFO] indexing $bam"
    samtools index -@ "$THREADS" "$bam"
  fi
done

# sanity: reference + indexes
[[ -s "$REF" ]] || { echo "[ERR] Missing reference: $REF" >&2; exit 1; }
[[ -s "${REF}.fai" ]] || samtools faidx "$REF"

# ========= outputs =========
RAW_VCF="$OUTDIR/bcftools.snps.raw.vcf.gz"
PG_TMP="$OUTDIR/_pg.tmp.vcf.gz"
PG_TMP_TBI="${PG_TMP}.tbi"
PG_TAG="$OUTDIR/bcftools.popgen.tags.vcf.gz"
PG_STRICT="$OUTDIR/bcftools.popgen.strict.vcf.gz"

echo "[INFO] Writing:"
echo "  RAW:     $RAW_VCF"
echo "  TAGGED:  $PG_TAG"
echo "  STRICT:  $PG_STRICT"

# ========= run =========
echo "[INFO] mpileup -> call (SNPs only, multiallelic OK; no indels)"
bcftools mpileup \
  --threads "$THREADS" \
  --skip-indels \
  -d 10000 \
  -f "$REF" \
  "${BAMS[@]}" -Ou \
| bcftools call \
    --threads "$THREADS" \
    --skip-variants indels \
    --multiallelic-caller \
    --variants-only \
    -Oz -o "$RAW_VCF"

# -f overwrites an index left over from a previous run (tabix refuses otherwise).
tabix -f -p vcf "$RAW_VCF"

# biallelic-only intermediate for tagging
# `bcftools call` leaves FILTER as "." (no filters applied). `-f PASS` alone
# does not match ".", so it would discard every site; accept both.
echo "[INFO] make biallelic-only temp (SNPs, FILTER . or PASS)"
bcftools view -f .,PASS -v snps -m2 -M2 -Oz -o "$PG_TMP" "$RAW_VCF"
tabix -f -p vcf "$PG_TMP"

# fill tags (MAF, F_MISSING, etc.)
# Everything after `--` goes to the plugin, so the bcftools output options
# (-Oz -o) must come before it.
echo "[INFO] +fill-tags (MAF, F_MISSING, AC, AN, NS)"
bcftools +fill-tags "$PG_TMP" -Oz -o "$PG_TAG" -- -t MAF,F_MISSING,AC,AN,NS
tabix -f -p vcf "$PG_TAG"

# strict pop-gen set: MAF>=0.05 and <=20% missing
echo "[INFO] strict filter (MAF>=0.05 && F_MISSING<=0.20)"
bcftools view -i 'MAF>=0.05 && F_MISSING<=0.20' "$PG_TAG" -Oz -o "$PG_STRICT"
tabix -f -p vcf "$PG_STRICT"

# cleanup temp
rm -f "$PG_TMP" "$PG_TMP_TBI"

echo "[DONE] $(date)"
echo "[OUT] RAW     : $RAW_VCF"
echo "[OUT] TAGGED  : $PG_TAG"
echo "[OUT] STRICT  : $PG_STRICT"

# quick stats (handy); best-effort, a failure here must not fail the job
bcftools stats "$RAW_VCF" > "$OUTDIR/bcftools.snps.raw.stats.txt" || true
bcftools stats "$PG_STRICT" > "$OUTDIR/bcftools.popgen.strict.stats.txt" || true


In [None]:
######################################################################
## 4B: SCRIPT: SNP calling with FreeBayes
######################################################################

#!/usr/bin/env bash
#SBATCH --job-name=freebayes_bt2
#SBATCH --cpus-per-task=8
#SBATCH --mem=16G
#SBATCH --time=24:00:00
#SBATCH --output=logs/freebayes_%A_%a.out
#SBATCH --error=logs/freebayes_%A_%a.err
#SBATCH --array=0-6    # 0-based, one task per BAM; override with `sbatch --array=0-<N-1>` if the BAM count changes

# Per-sample FreeBayes calling: array task i processes the i-th (0-based,
# glob-sorted) *.mkdup.bam in ~/Mytilus-WGS/mapping_bt2 and writes
# <sample>.raw.vcf.gz (+ .tbi) and <sample>.freebayes.log next to it.

set -euo pipefail

# *** critical: conda hook so "conda activate" works in batch ***
# Strict mode is relaxed only here because ~/.bashrc / conda activation may
# touch unset variables or return non-zero.
set +eu
source ~/.bashrc
eval "$(conda shell.bash hook)"
conda activate freebayes
set -eu
THREADS="${SLURM_CPUS_PER_TASK:-1}"

REF="/home/local/ADS/lsh/Mytilus/prepped_genome/mytilus_californianus_genome.fasta"
cd ~/Mytilus-WGS/mapping_bt2

BAMS=()
for bam in *.mkdup.bam; do
  if [[ -e "$bam" ]]; then
    BAMS+=("$bam")
  fi
done
: "${SLURM_ARRAY_TASK_ID:?Submit as an array job (sbatch --array=0-N)}"
if (( SLURM_ARRAY_TASK_ID < 0 || SLURM_ARRAY_TASK_ID >= ${#BAMS[@]} )); then
  echo "[ERR] Array index ${SLURM_ARRAY_TASK_ID} out of range: found ${#BAMS[@]} BAM(s)" >&2
  exit 1
fi
BAM="${BAMS[$SLURM_ARRAY_TASK_ID]}"
SAMPLE="${BAM%%.mkdup.bam}"

[[ -s "${REF}.fai" ]] || samtools faidx "$REF"
[[ -s "${BAM}.bai" ]] || samtools index "$BAM"

freebayes -f "$REF" "$BAM" > "${SAMPLE}.raw.vcf" 2> "${SAMPLE}.freebayes.log"
# Sanity check: a complete FreeBayes header must be present before compressing.
grep -q -m1 '^##source=free' "${SAMPLE}.raw.vcf" \
  || { echo "[ERR] ${SAMPLE}.raw.vcf has no FreeBayes header; see ${SAMPLE}.freebayes.log" >&2; exit 1; }
bgzip -@ "$THREADS" -f "${SAMPLE}.raw.vcf"

# NOTE(review): when a BAM carries no @RG/SM tag, FreeBayes appears to label
# the single sample with a placeholder name; identical names across the
# per-sample VCFs make `bcftools merge` refuse to run. Rename a
# single-sample VCF to the BAM-derived sample name when they differ.
if command -v bcftools >/dev/null; then
  vcf_samples="$(bcftools query -l "${SAMPLE}.raw.vcf.gz")"
  if [[ -n "$vcf_samples" && "$vcf_samples" != *$'\n'* && "$vcf_samples" != "$SAMPLE" ]]; then
    echo "[INFO] Renaming VCF sample '${vcf_samples}' -> '${SAMPLE}'"
    printf '%s\n' "$SAMPLE" > "${SAMPLE}.rename.txt"
    bcftools reheader -s "${SAMPLE}.rename.txt" -o "${SAMPLE}.renamed.vcf.gz" "${SAMPLE}.raw.vcf.gz"
    mv -f "${SAMPLE}.renamed.vcf.gz" "${SAMPLE}.raw.vcf.gz"
    rm -f "${SAMPLE}.rename.txt"
  fi
fi

# -f: overwrite a stale index from a previous run.
tabix -f -p vcf "${SAMPLE}.raw.vcf.gz"

In [None]:
######################################################################
## 5. SCRIPT: Merge SNP calls FreeBayes
######################################################################

# Merge the per-sample FreeBayes VCFs into one cohort VCF.
# Paste into an interactive terminal.
conda activate freebayes
cd ~/Mytilus-WGS/mapping_bt2

# Build the input list explicitly: the output name also matches *.raw.vcf.gz,
# so a plain glob would feed a previous cohort file back into the merge.
COHORT_VCF=cohort.raw.vcf.gz
SAMPLE_VCFS=()
for v in *.raw.vcf.gz; do
  [[ -e "$v" ]] || continue
  [[ "$v" == "$COHORT_VCF" ]] && continue
  SAMPLE_VCFS+=("$v")
done

# bcftools merge needs at least two inputs; no `exit` because this runs in an
# interactive shell.
if (( ${#SAMPLE_VCFS[@]} < 2 )); then
  echo "[ERR] Need at least two per-sample *.raw.vcf.gz files to merge; found ${#SAMPLE_VCFS[@]}" >&2
else
  bcftools merge "${SAMPLE_VCFS[@]}" -Oz -o "$COHORT_VCF" \
    && tabix -f -p vcf "$COHORT_VCF"
fi

In [None]:
######################################################################
## 6: SCRIPT: SNP → BED (all SNPs) and CpG-only SNP BED, Make callable mask, Combine “callable” with SNP masks to get masked regions for diversity
######################################################################

#!/usr/bin/env bash
#SBATCH --job-name=callable_masks
#SBATCH --partition=debug
#SBATCH --cpus-per-task=4
#SBATCH --mem=16G
#SBATCH --time=06:00:00
#SBATCH --output=logs/callable_masks.%j.out
#SBATCH --error=logs/callable_masks.%j.err
set -euo pipefail

# Build "callable" interval masks from mosdepth per-base depth:
#   <sample>.callable.bed     positions with MIN <= depth <= MAX for that sample
#   cohort_callable_any.bed   union across samples
#   cohort_callable_all.bed   intersection across samples (requires bedtools)
# This script only produces the callable masks listed above; it does not
# derive SNP BEDs or subtract SNP sites.

WD="$HOME/Mytilus-WGS"
COVDIR="$WD/wgs_coverage_bt2"
OUTDIR="$WD/snps"
mkdir -p "$OUTDIR" "$WD/logs"

# tools (bedtools recommended; awk fallback will be used if not found)
BEDTOOLS=$(command -v bedtools || true)

# samples = inferred from per-base files
SAMPLES=()
for pb in "$COVDIR"/*.per-base.bed.gz; do
  if [[ -e "$pb" ]]; then
    pb_name=$(basename "$pb")
    SAMPLES+=("${pb_name%.per-base.bed.gz}")
  fi
done
(( ${#SAMPLES[@]} > 0 )) || { echo "[ERR] No *.per-base.bed.gz files in $COVDIR" >&2; exit 1; }

echo "[INFO] Samples: ${SAMPLES[*]}"

# awk program: merge sorted intervals (fallback if bedtools missing).
# Joins both adjacent and overlapping intervals; overlap handling matters for
# the cohort union, where intervals from different samples overlap.
merge_awk='
BEGIN{OFS="\t"}
NR==1{c=$1;s=$2;e=$3;next}
{
  if($1==c && $2<=e){ if($3>e) e=$3 }  # extend if adjacent or overlapping
  else { print c,s,e; c=$1; s=$2; e=$3 }
}
END{ if(NR>0) print c,s,e }
'

# awk program: total bp covered by a BED file (%.0f avoids exponent notation
# for large totals in some awk implementations)
sum_bp_awk='{s+=$3-$2}END{printf "%.0f\n", s+0}'

# 1) per-sample callable BEDs
declare -a CALL_BEDS=()
for s in "${SAMPLES[@]}"; do
  sum="$COVDIR/${s}.mosdepth.summary.txt"
  pbase="$COVDIR/${s}.per-base.bed.gz"
  [[ -s "$sum" && -s "$pbase" ]] || { echo "[ERR] Missing mosdepth outputs for $s" >&2; exit 1; }

  # Genome-wide mean depth comes from the "total" row of the summary (column 4);
  # row 2 would only be the first contig.
  mean=$(awk '$1=="total"{print $4; exit}' "$sum")
  [[ -n "$mean" ]] || { echo "[ERR] No 'total' row in $sum" >&2; exit 1; }

  # thresholds: MIN=4, MAX=ceil(3*mean) capped at 60
  MIN=4
  MAX=$(awk -v m="$mean" 'BEGIN{x=3*m; c=int(x); if(c<x) c++; if(c>60) c=60; printf "%d\n", c}')
  echo "[INFO] $s mean=$mean  MIN=$MIN  MAX=$MAX"

  out="$OUTDIR/${s}.callable.bed"
  # select segments with depth between MIN and MAX inclusive, then merge
  if [[ -n "$BEDTOOLS" ]]; then
    zcat "$pbase" \
      | awk -v mn="$MIN" -v mx="$MAX" 'BEGIN{OFS="\t"} ($4>=mn && $4<=mx){print $1,$2,$3}' \
      | "$BEDTOOLS" sort -i stdin \
      | "$BEDTOOLS" merge -i stdin \
      > "$out"
  else
    zcat "$pbase" \
      | awk -v mn="$MIN" -v mx="$MAX" 'BEGIN{OFS="\t"} ($4>=mn && $4<=mx){print $1,$2,$3}' \
      | sort -k1,1 -k2,2n -k3,3n \
      | awk "$merge_awk" \
      > "$out"
  fi
  [[ -s "$out" ]] || { echo "[ERR] no callable intervals for $s" >&2; exit 1; }
  CALL_BEDS+=("$out")
done

# 2) cohort masks
#    intersection (callable in ALL samples)
COHORT_ALL="$OUTDIR/cohort_callable_all.bed"
#    union (callable in AT LEAST ONE sample)
COHORT_ANY="$OUTDIR/cohort_callable_any.bed"

if [[ -n "$BEDTOOLS" ]]; then
  # union
  "$BEDTOOLS" sort -i <(cat "${CALL_BEDS[@]}") | "$BEDTOOLS" merge -i stdin > "$COHORT_ANY"
  # intersection (iterate pairwise to avoid multiinter dependency)
  cp "${CALL_BEDS[0]}" "$COHORT_ALL.tmp"
  for ((i=1;i<${#CALL_BEDS[@]};i++)); do
    "$BEDTOOLS" intersect -a "$COHORT_ALL.tmp" -b "${CALL_BEDS[$i]}" > "$COHORT_ALL.tmp2"
    mv "$COHORT_ALL.tmp2" "$COHORT_ALL.tmp"
  done
  mv "$COHORT_ALL.tmp" "$COHORT_ALL"
else
  # union (awk fallback): concat -> sort -> merge
  cat "${CALL_BEDS[@]}" \
    | sort -k1,1 -k2,2n -k3,3n \
    | awk "$merge_awk" > "$COHORT_ANY"

  # intersection is not attempted without bedtools; the job still finishes so
  # the per-sample and union masks remain usable.
  echo "[ERR] bedtools not found; cannot compute robust intersection without it." >&2
  echo "      Please install bedtools or run the intersection later." >&2
  COHORT_ALL="/dev/null"
fi

# 3) quick stats
for b in "${CALL_BEDS[@]}"; do
  bp=$(awk "$sum_bp_awk" "$b")
  echo "[STATS] $(basename "$b"): ${bp} bp"
done

if [[ -s "$COHORT_ANY" ]]; then
  bp_any=$(awk "$sum_bp_awk" "$COHORT_ANY")
  echo "[STATS] cohort_callable_any: ${bp_any} bp"
fi

if [[ "$COHORT_ALL" != "/dev/null" && -s "$COHORT_ALL" ]]; then
  bp_all=$(awk "$sum_bp_awk" "$COHORT_ALL")
  echo "[STATS] cohort_callable_all: ${bp_all} bp"
fi

echo "[DONE] Callable masks in $OUTDIR"

In [None]:
######################################################################
## 7: SCRIPT: Run genome-wide π, θ, Tajima’s D, Nₑ as a single SLURM job (SLURM SCRIPT: diversity_all.slurm)
######################################################################

#!/usr/bin/env bash
#SBATCH --job-name=diversity_all
#SBATCH --partition=debug
#SBATCH --cpus-per-task=8
#SBATCH --mem=32G
#SBATCH --time=12:00:00
#SBATCH --output=logs/diversity_all.%j.out
#SBATCH --error=logs/diversity_all.%j.err
set -euo pipefail

# Genome-wide diversity summary from the strict bcftools SNP set:
#   pi (per-site and 100 kb windows), Tajima's D, Watterson's theta, Ne = pi/(4*mu).
# Per-site values are normalised by the total length of the callable mask.
# Appends one row to $WD/genomewide_diversity_summary.tsv.

# Same activation recipe as the SNP-calling scripts: the conda hook is needed
# for `conda activate` in batch jobs. Strict mode is relaxed only here because
# ~/.bashrc / conda activation may touch unset variables or return non-zero.
set +eu
source ~/.bashrc
eval "$(conda shell.bash hook)"
conda activate bcftools-env
set -eu
for tool in bcftools tabix vcftools python3; do
  command -v "$tool" >/dev/null || { echo "[ERR] $tool not on PATH after activating bcftools-env" >&2; exit 1; }
done

WD="$HOME/Mytilus-WGS"
REF=/home/local/ADS/lsh/Mytilus/prepped_genome/mytilus_californianus_genome.fasta
VCF=${WD}/snps/bcftools.popgen.strict.vcf.gz
# Callable-region mask: restricts the SNPs (-R) and supplies the effective
# length used as denominator.
# NOTE(review): the previous default, snps/callable_minus_snps.bed, is not
# written by any step in this notebook and, judging by its name, has SNP
# positions removed -- restricting SNPs to it would presumably select nothing.
# Defaulting to the cohort intersection mask from step 6; export MASK=<bed>
# to use a different one. Please confirm this matches the intended analysis.
MASK="${MASK:-${WD}/snps/cohort_callable_all.bed}"
OUTPFX=${WD}/snps/snps_filtered_bt2
MU=8.6e-9   # assumed per-site, per-generation mutation rate used for Ne

[[ -s "$VCF" ]]  || { echo "[ERR] Missing VCF: $VCF" >&2; exit 1; }
[[ -s "$MASK" ]] || { echo "[ERR] Missing or empty mask: $MASK" >&2; exit 1; }
[[ -f "${REF}.fai" ]] || samtools faidx "$REF"
mkdir -p "${WD}/snps"

# Effective (callable) length in bp; %.0f avoids exponent notation for large
# totals in some awk implementations.
EFF_LEN=$(awk '{s+=$3-$2}END{printf "%.0f\n", s+0}' "$MASK")
(( EFF_LEN > 0 )) || { echo "[ERR] Mask covers 0 bp: $MASK" >&2; exit 1; }

# Filter to callable, biallelic SNPs.
# FILTER is "." in bcftools-call output, which `-f PASS` alone does not match
# (it would drop every site); accept both "." and PASS.
bcftools view -R "$MASK" -f .,PASS -v snps -m2 -M2 -Oz -o "${OUTPFX}.vcf.gz" "$VCF"
tabix -f -p vcf "${OUTPFX}.vcf.gz"

# π (site + windowed)
vcftools --gzvcf "${OUTPFX}.vcf.gz" --site-pi --out "${OUTPFX}_site_pi"
vcftools --gzvcf "${OUTPFX}.vcf.gz" --window-pi 100000 --window-pi-step 100000 --out "${OUTPFX}_windowed_pi"

# Tajima’s D (windowed + one large window ≈ genome-wide); best-effort
vcftools --gzvcf "${OUTPFX}.vcf.gz" --TajimaD 100000 --out "${OUTPFX}_windowed_TajimaD" || true
vcftools --gzvcf "${OUTPFX}.vcf.gz" --TajimaD "${EFF_LEN}" --out "${OUTPFX}_tajimaD" || true

# θ_Watterson via allele freqs
vcftools --gzvcf "${OUTPFX}.vcf.gz" --freq --out "${OUTPFX}_freq"

# Summaries
# Genome-wide pi = sum of per-site pi / callable length
SUM_PI=$(awk 'NR>1{sum+=$3}END{printf("%.10f", sum+0)}' "${OUTPFX}_site_pi.sites.pi")
PI_GENOME=$(python3 - <<PY
eff=${EFF_LEN}; s=float("${SUM_PI}")
print(f"{s/eff:.10f}")
PY
)

# Segregating sites = data rows of the .frq file. Its header line does not
# start with '#', so skip it by line number rather than by pattern.
SITES=$(awk 'NR>1{n++}END{print n+0}' "${OUTPFX}_freq.frq")
N_SAMPLES=$(bcftools query -l "${OUTPFX}.vcf.gz" | wc -l)
N_CHR=$((2 * N_SAMPLES))   # diploid: two chromosomes per sample
# a_n = sum_{i=1}^{n-1} 1/i (Watterson's correction factor)
A_N=$(python3 - <<PY
n=${N_CHR}
print(sum(1.0/i for i in range(1,n)) if n>1 else float('nan'))
PY
)
THETA_RAW=$(python3 - <<PY
sites=${SITES}; a=float("${A_N}")
print(f"{(sites/a):.10f}")
PY
)
THETA_NORM=$(python3 - <<PY
t=float("${THETA_RAW}"); eff=${EFF_LEN}
print(f"{t/eff:.12f}")
PY
)
# Mean D over windows, skipping missing values ("." or nan for windows
# without usable SNPs) so they cannot poison or bias the average.
D_AVG=$(awk 'NR>1 && $4!="." && tolower($4)!="nan" && tolower($4)!="-nan"{sum+=$4;cnt++}END{if(cnt>0) printf("%.6f", sum/cnt); else print "NA"}' "${OUTPFX}_tajimaD.Tajima.D" 2>/dev/null || echo "NA")
# Ne from pi = 4 * Ne * mu
NE=$(python3 - <<PY
pi=float("${PI_GENOME}"); mu=${MU}
print(f"{pi/(4*mu):.2f}")
PY
)

SUMMARY="${WD}/genomewide_diversity_summary.tsv"
if [[ ! -s "$SUMMARY" ]]; then
  echo -e "PREFIX\tN_SAMPLES\tSITES\tPI_genomewide\tTHETA_W_raw\tTHETA_W_norm\tTAJIMA_D_avg\tNE(mu=${MU})\tLEN_USED" > "$SUMMARY"
fi
echo -e "$(basename "${OUTPFX}")\t${N_SAMPLES}\t${SITES}\t${PI_GENOME}\t${THETA_RAW}\t${THETA_NORM}\t${D_AVG}\t${NE}\t${EFF_LEN}" >> "$SUMMARY"
column -t "$SUMMARY"