### Starling Project Santure lab group, lead Kat Stuart ###

### Kat's Project notes:
### PSMC requires whole genome data of one individual per PSMC track. For this you will need:
# Genome: /nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/data/resources/genomes/Svulgaris_vAU_1.0.fasta
# Mapped WGS data: /nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/data/mapped_reads/starling_wgs_mapped/
# Repeat bed file (for excluding repeat regions during var calling): /nesi/nobackup/uoa02613/kstuart_projects/Sv10_NZstarlings/data/repeat_analysis/Svulgaris_vAU_1.0.fasta_rm.bed

# Essentially, the code works as follows: for each sample you choose, you call all the variants for that individual, and then run a separate PSMC analysis on each individual's SNP data. 
# I would aim to run a few different samples, maybe (looking in the 'Mapped WGS data' directory listed above) pick 3 x UK samples, 3 from NZ (S_BAD samples -- mix male and female) and 3x from one Australian sampling location (maybe auX_mai and auX_men samples).

### Purpose of manuscript
# Invasion history - different patterns in NZ vs Australia/UK
# Demographic history, booms and busts and modelling to be done to test invasion history


### Subsampled individuals from each pop; /nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/data/mapped_reads/starling_wgs_mapped/sub_sample
### Au mai
au03_mai
au04_mai
au27_mai
### Au men
au05_men
au06_men
au15_men
### NZ - S_BAD
S_BAD_001M
S_BAD_007F
S_BAD_008F
### UK 
uk04_nwc
uk06_nwc
uk07_nwc

#### STEP 1: Prepare files for PSMC
# 1) Call consensus variants per individual and index output. This will also mask sites overlapping repeat annotations.
# 2) generate diploid sequences
# 3) Convert to psmcfa file format

#!/bin/bash -e
#SBATCH --job-name=psmc_prep_05Mar2024
#SBATCH --account=uoa02613
#SBATCH --time=12:00:00
#SBATCH --mem=10GB
#SBATCH --output=MCN_%j.errout
#SBATCH --mail-user=m.nehmens@massey.ac.nz
#SBATCH --mail-type=ALL
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --profile=task
#SBATCH --array=1-12

# PSMC input preparation; one array task handles one individual.
#   #1 call a per-individual consensus VCF, depth-filter it, drop indels and
#      sites inside annotated repeats, then sort and index the result
#   #2 turn the filtered VCF into a diploid consensus FASTQ
#   #3 convert the FASTQ to the .psmcfa format read by psmc
# Sample name = line number ${SLURM_ARRAY_TASK_ID} of psmc_indv_subset.txt.

# Fail the task if ANY stage of a pipeline fails, not only the last one.
set -o pipefail

PSMC_DIR=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC
SAMPLE_LIST=${PSMC_DIR}/psmc_indv_subset.txt

# Without an array index sed would silently return the first line of the list.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a SLURM array job}"
SAMPLE=$(sed "${SLURM_ARRAY_TASK_ID}q;d" "${SAMPLE_LIST}")
if [[ -z "${SAMPLE}" ]]; then
  # An index past the end of the list would otherwise run on '.sorted.dup.bam'.
  echo "no sample on line ${SLURM_ARRAY_TASK_ID} of ${SAMPLE_LIST}" >&2
  exit 1
fi

module load psmc/0.6.5-gimkl-2018b
module load VCFtools/0.1.15-GCC-9.2.0-Perl-5.30.1
module load SAMtools/1.19-GCC-12.3.0
module load tabix/0.2.6-GCCcore-9.2.0
module load BCFtools/1.19-GCC-11.3.0
module load BBMap/39.01-GCC-11.3.0

cd "${PSMC_DIR}/fastq"

GENOME=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/data/resources/genomes/Svulgaris_vAU_1.0.fasta
BAM_DIR=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/data/mapped_reads/starling_wgs_mapped/sub_sample
REP_BED=/nesi/nobackup/uoa02613/kstuart_projects/Sv10_NZstarlings/data/repeat_analysis/Svulgaris_vAU_1.0.fasta_rm.bed

#1
bcftools mpileup -C 50 -q 20 -Q 25 -Ou -f "${GENOME}" "${BAM_DIR}/${SAMPLE}.sorted.dup.bam" \
  | bcftools call -c -Oz -o "./${SAMPLE}.vcf.gz"
bcftools filter --SnpGap 10 -i "DP>=5 & DP<=50" "./${SAMPLE}.vcf.gz" -Oz -o "./${SAMPLE}.filter.vcf.gz"
# '-T ^FILE' excludes the regions listed in the repeat BED.
bcftools view --exclude-types indels "./${SAMPLE}.filter.vcf.gz" -T "^${REP_BED}" \
  | bcftools sort --temp-dir "./tmp_${SAMPLE}" -Oz -o "./${SAMPLE}.sort.vcf.gz"
tabix -p vcf "./${SAMPLE}.sort.vcf.gz"

#2
# Build the consensus from the filtered, repeat-masked VCF produced in #1.
# Previously the raw ${SAMPLE}.vcf.gz was used here, so none of the filtering
# reached PSMC. NOTE(review): vcf2fq is expected to write positions missing
# from the VCF as 'n' -- spot-check one scaffold to confirm.
bcftools view "./${SAMPLE}.sort.vcf.gz" | vcfutils.pl vcf2fq | gzip > "./${SAMPLE}.fastq.gz"

#3
cd "${PSMC_DIR}/"
fq2psmcfa -q 20 "./fastq/${SAMPLE}.fastq.gz" > "./${SAMPLE}.psmcfa"

#### Submitted as job 44205319 on 05 March 2024 - produced jobs: 44205321 44205322 44205323 44205324 44205325 44205326 44205327 44205328 44205329 44205330 44205331, due to array
# Job Wall-time:   76.3%  09:09:21 of 12:00:00 time limit
# CPU Efficiency: 146.3%  13:23:48 of 09:09:21 core-walltime
# Mem Efficiency:   7.1%  731.36 MB of 10.00 GB
# Can reduce time to 10hrs, bump CPU to 4, and reduce memory to 3GB

### Step 2: Running PSMC from step 1 output
# Time segment patterns:
# 4+5*3+4 (default)
# 4+10*3+6+8
# 4+25*2+4+6
# 4+30*2+4+6+10 ### time segment pattern here was chosen from Nadachowska-Brzyska_etal_2015_Supplemental_Table_S1, and paper's test for all bird species for which this works

#!/bin/bash -e
#SBATCH --job-name=psmc_main_06Mar2024
#SBATCH --account=uoa02613
#SBATCH --time=00-12:00:00
#SBATCH --mem=2GB
#SBATCH --output=MCN_%j.errout
#SBATCH --mail-user=m.nehmens@massey.ac.nz
#SBATCH --mail-type=ALL
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --profile=task
#SBATCH --array=1-12

# Main PSMC run; one array task handles one individual.
# Reads ./<sample>.psmcfa and writes ./results/<sample>.diploid.psmc.
# Sample name = line number ${SLURM_ARRAY_TASK_ID} of psmc_indv_subset.txt.

PSMC_DIR=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC
SAMPLE_LIST=${PSMC_DIR}/psmc_indv_subset.txt

# Without an array index sed would silently return the first line of the list.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a SLURM array job}"
SAMPLE=$(sed "${SLURM_ARRAY_TASK_ID}q;d" "${SAMPLE_LIST}")
if [[ -z "${SAMPLE}" ]]; then
  # An index past the end of the list would otherwise run psmc on './.psmcfa'.
  echo "no sample on line ${SLURM_ARRAY_TASK_ID} of ${SAMPLE_LIST}" >&2
  exit 1
fi
echo "working with sample:" "${SAMPLE}"

cd "${PSMC_DIR}/"

module load psmc/0.6.5-gimkl-2018b

# Make sure the output directory exists before psmc tries to write into it.
mkdir -p ./results

# -p is the time segment pattern chosen in the Step 2 notes.
psmc -N30 -t5 -r5 -p "4+30*2+4+6+10" -o "./results/${SAMPLE}.diploid.psmc" "./${SAMPLE}.psmcfa"

#### Submitted as job 44220855 on 06 March 2024 and then jobs 44220856  44220857  44220858  44220859  44220860  44220861  44220862  44220863  44220864  44220865  44220866
# Job Wall-time:   16.2%  01:56:50 of 12:00:00 time limit
# CPU Efficiency: 100.0%  01:56:49 of 01:56:50 core-walltime
# Mem Efficiency:  99.5%  1.99 GB of 2.00 GB
# Reduce time to 3hrs, CPU can stay at 2, increase memory to 3GB (just to be safe)

### Step 3: Bootstrapping PSMC

#!/bin/bash -e
#SBATCH --job-name=psmc_bootstrap_06Mar2024
#SBATCH --account=uoa02613
#SBATCH --time=200:00:00
#SBATCH --mem=10GB
#SBATCH --output=MCN_%j.errout
#SBATCH --mail-user=m.nehmens@massey.ac.nz
#SBATCH --mail-type=ALL
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --profile=task
#SBATCH --array=1-12
#SBATCH --partition=long

# Needs the long partition because of the run time. bigmem (which was selected
# automatically because of the resources requested) only runs for 7 days.
# Forcing the partition still wasn't accepted, so Dini had to apply a back-end fix.

# PSMC bootstrapping; one array task handles one individual.
# Splits ./<sample>.psmcfa into ./<sample>-split.psmcfa, then runs 100
# bootstrap rounds one after another, writing
# ./results/bootstrap/<sample>.round-<N>.psmc for N = 1..100.
# Sample name = line number ${SLURM_ARRAY_TASK_ID} of psmc_indv_subset_bs1.txt.

PSMC_DIR=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC
SAMPLE_LIST=${PSMC_DIR}/psmc_indv_subset_bs1.txt
N_ROUNDS=100

# Without an array index sed would silently return the first line of the list.
: "${SLURM_ARRAY_TASK_ID:?must be submitted as a SLURM array job}"
SAMPLE=$(sed "${SLURM_ARRAY_TASK_ID}q;d" "${SAMPLE_LIST}")
if [[ -z "${SAMPLE}" ]]; then
  # An index past the end of the list would otherwise run on './.psmcfa'.
  echo "no sample on line ${SLURM_ARRAY_TASK_ID} of ${SAMPLE_LIST}" >&2
  exit 1
fi

cd "${PSMC_DIR}/"
module load psmc/0.6.5-gimkl-2018b 

splitfa "./${SAMPLE}.psmcfa" > "./${SAMPLE}-split.psmcfa"

# Make sure the output directory exists before psmc tries to write into it.
mkdir -p ./results/bootstrap

# Plain loop instead of 'seq | xargs -i echo ... | sh': the command is no
# longer re-parsed by a second shell (which stripped the quotes around the -p
# pattern and exposed its '*' to globbing), and a failing round is reported.
# Remaining rounds still run after a failure, but the task then exits non-zero
# so an incomplete bootstrap set is visible in the job status.
failed_rounds=0
for ((round = 1; round <= N_ROUNDS; round++)); do
  if ! psmc -N30 -t5 -r5 -b -p "4+30*2+4+6+10" \
      -o "./results/bootstrap/${SAMPLE}.round-${round}.psmc" \
      "./${SAMPLE}-split.psmcfa"; then
    echo "bootstrap round ${round} failed for ${SAMPLE}" >&2
    failed_rounds=$((failed_rounds + 1))
  fi
done

if ((failed_rounds > 0)); then
  echo "${failed_rounds} of ${N_ROUNDS} bootstrap rounds failed for ${SAMPLE}" >&2
  exit 1
fi


##### Plotting PSMC +bootstrapping results #############

## All per-individual PSMC results have to be concatenated into one file to be plotted together.
## Brace expansion keeps the sample order exactly as listed.
cat {au03_mai,au04_mai,au27_mai,au05_men,au06_men,au15_men,S_BAD_001M,S_BAD_007F,S_BAD_008F,uk04_nwc,uk06_nwc,uk07_nwc}.diploid.psmc > combined_diploid.psmc

#!/bin/bash -e
#SBATCH --job-name=psmc_plot
#SBATCH --account=uoa02613
#SBATCH --time=1:00:00
#SBATCH --mem=4GB
#SBATCH --output=MCN_%j.errout
#SBATCH --mail-user=mneh623@aucklanduni.ac.nz
#SBATCH --mail-type=ALL
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --profile=task

# Plot all individuals' main PSMC results in one figure, then convert the
# EPS output to PDF. Everything is written into the plots directory.

PLOT_DIR=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC/results/plots
COMBINED_PSMC=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC/results/combined_diploid.psmc
PLOT_PREFIX=combined_psmc_plot
# One label per individual, in the order the files were concatenated.
SAMPLE_LABELS='au03_mai, au04_mai, au27_mai, au05_men, au06_men, au15_men, S_BAD_001M, S_BAD_007F, S_BAD_008F, uk04_nwc, uk06_nwc, uk07_nwc'

cd "${PLOT_DIR}"

module purge
module load psmc/0.6.5-gimkl-2018b

psmc_plot.pl \
  -u 2.3e-09 \
  -g 2 \
  -M "${SAMPLE_LABELS}" \
  "${PLOT_PREFIX}" \
  "${COMBINED_PSMC}"

# epstopdf.pl is looked up in the current (plots) directory.
perl epstopdf.pl "${PLOT_PREFIX}.eps"

### Concatenate each individual's bootstrap rounds into one file (combined bootstrap = cmbdbs)
for indv in au03_mai au04_mai au27_mai au05_men au06_men au15_men \
    S_BAD_001M S_BAD_007F S_BAD_008F uk04_nwc uk06_nwc uk07_nwc; do
  # The glob part stays unquoted so it expands to every round file of this individual.
  cat "${indv}".round*.psmc > "${indv}_cmbdbs.psmc"
done

### Plot the combined bootstrap file of each individual (output goes to ../plots)
for indv in au03_mai au04_mai au27_mai au05_men au06_men au15_men \
    S_BAD_001M S_BAD_007F S_BAD_008F uk04_nwc uk06_nwc uk07_nwc; do
  psmc_plot.pl -u 2.3e-09 -g 2 "../plots/${indv}_combBS_plot" "${indv}_cmbdbs.psmc"
done

### Convert to PDF
au03_mai_combBS_plot.eps
au04_mai_combBS_plot.eps
au27_mai_combBS_plot.eps
au05_men_combBS_plot.eps
au06_men_combBS_plot.eps
au15_men_combBS_plot.eps
S_BAD_001M_combBS_plot.eps
S_BAD_007F_combBS_plot.eps
S_BAD_008F_combBS_plot.eps
uk04_nwc_combBS_plot.eps
uk06_nwc_combBS_plot.eps
uk07_nwc_combBS_plot.eps

# Convert every combined-bootstrap EPS plot to PDF, one individual at a time.
for indv in au03_mai au04_mai au27_mai au05_men au06_men au15_men \
    S_BAD_001M S_BAD_007F S_BAD_008F uk04_nwc uk06_nwc uk07_nwc; do
  perl epstopdf.pl "${indv}_combBS_plot.eps"
done

#### Put the original PSMC result in front of that individual's concatenated bootstrap rounds, just playing here
bootstrap_dir=/nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC/results/bootstrap
for indv in au03_mai au04_mai au27_mai au05_men au06_men au15_men \
    S_BAD_001M S_BAD_007F S_BAD_008F uk04_nwc uk06_nwc uk07_nwc; do
  # Original (OG) first, bootstrap (BS) second.
  cat "${indv}.diploid.psmc" "${bootstrap_dir}/${indv}_cmbdbs.psmc" > "${indv}_OGBS.psmc"
done

### Plot main PSMC and bootstrap -- dark red line is now main PSMC in the combined plot for all locations
# The generation time (-g) is 2 for every individual, matching the combined
# plot and the bootstrap-only plots. The per-line values used to run
# 2, 2, 3, 4 ... 12, which scaled each individual's time axis differently.
for indv in au03_mai au04_mai au27_mai au05_men au06_men au15_men \
    S_BAD_001M S_BAD_007F S_BAD_008F uk04_nwc uk06_nwc uk07_nwc; do
  psmc_plot.pl -u 2.3e-09 -g 2 "${indv}_OGBS_plot" "../${indv}_OGBS.psmc"
done

### Convert the original + bootstrap EPS plots to PDF
for indv in au03_mai au04_mai au27_mai au05_men au06_men au15_men \
    S_BAD_001M S_BAD_007F S_BAD_008F uk04_nwc uk06_nwc uk07_nwc; do
  perl epstopdf.pl "${indv}_OGBS_plot.eps"
done



### Naming key found in /nesi/nobackup/uoa02613/kstuart_projects/At4_MynaStarling/analysis/PSMC/results (any files used prior to this are in directory PSMC, including fastq files)
*.sh are my SLURM scripts (I did the bootstrapping steps interactively once the rounds were created; I wanted to make sure each step did what I wanted it to - it didn't take extra time)
*.diploid.psmc are the original PSMC files (/results)
combined_diploid.psmc combined *.diploid.psmc files (/results)
combined_psmc_plot.* the plotted files of the combined_diploid.psmc (./plots)
*_cmbdbs.psmc are the 'combined bootstrap' rounds by individual (./bootstrap)
*_combBS_plot.* are the plotted combined bootstrap rounds (./plots)
*_OGBS.psmc are the original PSMC (OG) and bootstrap (BS) PSMC files concatenated together (/results)
*_OGBS_plot.* are the plotted original PSMC and bootstrap (./plots)
