-
Notifications
You must be signed in to change notification settings - Fork 0
/
step7_correct_window_estimates.sh
75 lines (55 loc) · 4.36 KB
/
step7_correct_window_estimates.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
#SBATCH --job-name=S7
#SBATCH --output=S7_%j.log
### RUN AS: script.sh KB_WIN_SIZE_INTEGER
WIN=$1 ### window size in kb
REF="reference_genome_index.fasta.fai"
### GOAL: Correct estimate of nucleotide diversity (here, watterson's theta, equivalent to pi) based on the number of variant (vs. invariant) sites per window)
### And get a window-wise average
module load linuxbrew/colsa ### Varies depending on where your software is installed
### Make windows for entire genome - divide length of each chromosome by window size and make that many windows in tab delimited output file
bedtools makewindows -g $REF -w ${WIN}000 | awk '$3 ~ /000$/' | sed 's/ /\t/g'> genome_windows_${WIN}k.bed
#Output (1k windows across all chr): genome_windows_1k.bed
#chr1 0 1000
#chr1 1000 2000
#chr1 2000 300
### Remove the header:
tail -n +2 angsd_pecr_Win.thetas.logSites.idx > angsd_pecr_Win.thetas.logSites_nohead
#-n +K = to write output starting with the Kth line (here the 2nd line)
### Extract the 2nd field (e.g., the 1st col) and write to new file
cut -f2 angsd_pecr_Win.thetas.logSites_nohead | awk '{$1 = $1 + 1; print}' | paste angsd_pecr_Win.thetas.logSites_nohead - | awk 'BEGIN {FS="\t"};{ print $1"\t"$2"\t"$8"\t"$4}' | sed 's/ //g' > pi_pecr_global.bed
### NOW, loop through all chromosomes (names listed 1 per line in chromosomes.txt)
for CHR in `cat chromosomes.txt`; do
###make bed file for all variant and invariant sites for each chromosome
grep "${CHR}$(printf '\t')" angsd_pecr_allvar.mafs > pecr_allvar_${CHR}.mafs
### Extract fields 1 and 2 and put them into a new file
cut -f 1,2 pecr_allvar_${CHR}.mafs > pecr_allvar.sites_${CHR}.txt
### Extract only the second field, add 1 to it (possibly due to 0 vs. 1 base indexing)
cut -f2 pecr_allvar.sites_${CHR}.txt | awk '{$1 = $1 + 1; print}' | paste pecr_allvar.sites_${CHR}.txt - | sed 's/ //g'> pecr_allvar.sites_${CHR}.bed
# first_col +1 (and add this as a third column)
### Split the genome window file into chromosomes
grep "${CHR}$(printf '\t')" genome_windows_${WIN}k.bed > genome_windows_${WIN}k_${CHR}.bed
### Calculate the number of sites in each window for each chromosome
bedtools coverage -a genome_windows_${WIN}k_${CHR}.bed -b pecr_allvar.sites_${CHR}.bed -counts > pecr_allsites_${WIN}kbwin_${CHR}.txt
# Adds a column of coverage/depth
### Extract the 4th field and replace 0's with NA's
cut -f4 pecr_allsites_${WIN}kbwin_${CHR}.txt | sed 's/^0/NA/g' > pecr_allsites_${WIN}kwin_NA_${CHR}.txt
### Split the per site theta file
grep "${CHR}$(printf '\t')" pi_pecr_global.bed > pi_pecr_global_${CHR}.bed
### e to the X with X being the values in col4 (e.g., Watterson's theta estimates per site)
awk '{print exp($4)}' pi_pecr_global_${CHR}.bed | paste pi_pecr_global_${CHR}.bed - > pi_pecr_global_log_${CHR}.bed
bedtools map -a genome_windows_${WIN}k_${CHR}.bed -b pi_pecr_global_log_${CHR}.bed -c 5 -o sum | sed 's/\t[.]/\tNA/g' - > pi_pecr_global_log_${WIN}kbwin_${CHR}.txt
# Values from B (col 5 -> exp(Watterson)) mapped onto intervales in A and removing NAs
# 5th column is a the number of variable sites
### If it's an NA do nothing, otherwise divide col_4/col_5 (that is the sum of exp(Wattersons) divided by the total number of variant sites within the interval)
### to get an average value of Watterson's (measure of nucleotide diversity equivalent to pi) for a given window
paste pi_pecr_global_log_${WIN}kbwin_${CHR}.txt pecr_allsites_${WIN}kwin_NA_${CHR}.txt | sed 's/[.]\t/NA\t/g' - > pi_pecr_global_log_${WIN}kbwin_sites_${CHR}.txt
awk '{if(/NA/)var="NA";else var=$4/$5;print var}' pi_pecr_global_log_${WIN}kbwin_sites_${CHR}.txt | paste pi_pecr_global_log_${WIN}kbwin_sites_${CHR}.txt - > pi_pecr_global_log_${WIN}kbwin_sites_corrected_${CHR}.txt
done
#OUTPUT LOOKS LIKE THIS (without header)
#chr start stop SUM(exp(WatTheta)) #varSites_perWin (SUM/#varSites)
#chr23 0 1000 222.144159 927 0.239638
#chr23 1000 2000 219.912544 894 0.245987
#chr23 2000 3000 224.832897 873 0.257541
#chr23 3000 4000 227.845044 929 0.245258
#chr23 4000 5000 206.18176 944 0.218413