-
Notifications
You must be signed in to change notification settings - Fork 0
/
Update.sh
144 lines (112 loc) · 7.93 KB
/
Update.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Update.sh — refresh a photobiont-diversity dataset:
#   blast the reference against nt, pull new GenBank hits, merge them into
#   the master dataset, recluster, rebuild the tree and publish figures.
#
# Usage: Update.sh <dataset> <ref_seq.fa>
#   $1 dataset - "<Clade>_<Locus>" (e.g. Trebouxia_ITS); also the folder name
#   $2 ref_seq - reference FASTA relative to the cwd (e.g. Ncommune_rbcX.fa)
dataset=$1
ref_seq=$2 # Ncommune_rbcX.fa
# BUG FIX: the stray '#' inside $( ) commented out the whole command
# substitution, so clade and locus were always empty.
clade=$(echo "$dataset" | cut -d_ -f 1)
locus=$(echo "$dataset" | cut -d_ -f 2)
cur_date=$(date +%y%m%d)
ref_seq=$(pwd)/$ref_seq
log_file=~/Documents/PhotobiontDiversity/$cur_date/log.txt
#create folder for current post and for log of commands
# BUG FIX: the date folder must exist BEFORE 'exec' redirects stdout into it,
# otherwise the redirection fails on the first run of a new day.
if ! test -d ~/Documents/PhotobiontDiversity/"$cur_date"
then
mkdir ~/Documents/PhotobiontDiversity/"$cur_date"
fi
# From here on, everything written to stdout goes to the day's log file;
# 'set -x' traces each command so the log doubles as a command history.
exec > "$log_file"
set -x
#echo $ref_seq
#create folder if not present
if ! test -d ~/Documents/PhotobiontDiversity/"$dataset"
then
mkdir ~/Documents/PhotobiontDiversity/"$dataset"
fi
# Abort if the dataset folder is unreachable — every later step assumes it.
cd ~/Documents/PhotobiontDiversity/"$dataset" || exit 1
#run blast (need all sequences even if updating because there may be new redundant sequences)
blastn -query "$ref_seq" -db nt -evalue 1e-100 -max_target_seqs 3000 \
  -out "${dataset}.bl" \
  -outfmt '6 qseqid qlen sacc slen pident length mismatch gapopen qstart qend qframe sstart send sframe evalue bitscore'
# Plus-strand hits (sstart < send, cols 12/13) on subjects shorter than 3000:
# one awk pass instead of two, and 'sort -u' instead of 'sort | uniq'.
awk '$12 < $13 && $4 < 3000' "${dataset}.bl" | cut -f3 | sort -u > "${dataset}_acc.txt"
blastdbcmd -db nt -entry_batch "${dataset}_acc.txt" | ../Scripts/GetRedundant.pl > "${dataset}_all.fa"
#get negative strand sequences that are not part of a larger sequence
awk '$12 > $13 && $4 < 3000' "${dataset}.bl" | cut -f3 | sort -u > "${dataset}_acc_rc.txt"
if test -s "${dataset}_acc_rc.txt"
then
# Reverse-complement minus-strand hits before appending to the pool.
blastdbcmd -db nt -entry_batch "${dataset}_acc_rc.txt" | ../Scripts/GetRedundant.pl | ../Scripts/rc.pl >> "${dataset}_all.fa"
else
rm "${dataset}_acc_rc.txt"
fi
# Very long subjects (>3000 nt, e.g. whole genomes): keep only the hit region.
awk '$4 > 3000' "${dataset}.bl" > "${dataset}_long.bl"
if test -s "${dataset}_long.bl"
then
cut -f3 "${dataset}_long.bl" | sort -u > "${dataset}_acc_long.txt"
blastdbcmd -db nt -entry_batch "${dataset}_acc_long.txt" > "${dataset}_all_long.fa"
python ../Scripts/ExtractHitRegion.py "${dataset}_all_long.fa" "${dataset}_long.bl" >> "${dataset}_all.fa"
else
rm "${dataset}_long.bl"
fi
#print number of sequences
wc -l "${dataset}_acc.txt"
#filter out sequences that are already in the DB, normalizing GenBank
#'gi|...|acc.version|' headers down to the bare accession
python ../Scripts/GetNew.py "${dataset}_all.fa" | perl -p -e 's/^>\s*gi\|\d+\|\w+\|(\w+)\.\d\|.*/>$1/' > "${dataset}_new.fa"
#Download genbank sequences and parse metadata and update DB
# Headers of the new FASTA are bare accessions (see GetNew step above);
# strip the '>' and fetch the corresponding GenBank records.
grep '>' "${dataset}_new.fa" | perl -p -e 's/>//' | python ../Scripts/GetGB.py > "${dataset}_new.gb"
../Scripts/ParseHost.pl "${dataset}_new.gb" "$clade" "${locus}" "${cur_date}" > "${dataset}_metadata_new.txt"
python ../Scripts/UpdateDB.py "${dataset}_metadata_new.txt"
#Add new data to master datasets
cat "${dataset}_new.fa" >> "${dataset}.fa"
cat "${dataset}_metadata_new.txt" >> "${dataset}_metadata.txt"
cat "${dataset}_new.gb" >> "${dataset}.gb"
#cluster sequences and assign groups
# -id 1: cluster only identical sequences; centroids become the
# non-redundant set used for alignment and tree building.
usearch -cluster_fast "${dataset}.fa" -id 1 -centroids "${dataset}_nr.fa" -uc "${dataset}_groups.txt"
python ../Scripts/GetGroups.py -g "${dataset}_groups.txt" -l "${locus}"
#create alignment and make tree, mapping on metadata
mafft "${dataset}_nr.fa" > "${dataset}_aln.fa"
# Columns/sequences to exclude from the trimmed alignment.
exset=$(../Scripts/GetExcluded.pl "${dataset}_aln.fa")
# $exset deliberately unquoted: trimal -select appears to take it as
# separate words — TODO confirm against GetExcluded.pl's output format.
trimal -in "${dataset}_aln.fa" -phylip -select $exset > "${dataset}.phy"
phyml --quiet --no_memory_check -i "${dataset}.phy"
mv "${dataset}.phy_phyml_tree.txt" "${dataset}.nwk"
mv "${dataset}.phy_phyml_stats.txt" "${dataset}_phyml_stats.txt"
# FIX: 'test x == y' is a bashism and $dataset was unquoted; use the
# portable '=' with a quoted operand.
if [ "$dataset" = 'Trebouxia_ITS' ]
then
# Trebouxia_ITS trees are labelled by species (-f species).
python ../Scripts/FormatTree.py -t "${dataset}.nwk" -l "$locus" -d "$cur_date" -f species -o "${dataset}.svg"
python ../Scripts/FormatTree.py -t "${dataset}.nwk" -l "$locus" -d "$cur_date" -f species -o "${dataset}.pdf"
else
python ../Scripts/FormatTree.py -t "${dataset}.nwk" -l "$locus" -d "$cur_date" -o "${dataset}.svg"
python ../Scripts/FormatTree.py -t "${dataset}.nwk" -l "$locus" -d "$cur_date" -o "${dataset}.pdf"
fi
#This is commented out because it's probably a good idea to take a look at the tree before running it
#python ../Scripts/UpdateClades.py -t ${dataset}.nwk -l $locus
# NOTE(review): every sibling script is invoked as 'python ../Scripts/X.py',
# but this one has neither 'python' nor '.py' — verify CountAssociations is
# an executable with a shebang, otherwise this line silently fails.
python ../Scripts/CountAssociations -t ${dataset}_associations.csv -c ${dataset}_associations.css
# Total association count: sum of column 3 across the associations table.
awk '{total = total + $3}END{print total}' ${dataset}_associations.csv
#copy new files to current post folder
# NOTE(review): this copies ${dataset}.csv/${dataset}.css, but the step above
# writes ${dataset}_associations.csv/.css — confirm the unsuffixed files are
# produced elsewhere, or the cp will fail for those two entries.
cp ${dataset}.pdf ${dataset}.svg ${dataset}.nwk ${dataset}_phyml_stats.txt ${dataset}.phy ${dataset}_aln.fa ${dataset}_groups.txt ${dataset}_nr.fa ${dataset}_new.fa ${dataset}_metadata_new.txt ${dataset}_new.gb ${dataset}.csv ${dataset}.css ../$cur_date