forked from baudrly/hicstuff
-
Notifications
You must be signed in to change notification settings - Fork 7
/
pipeline.py
1200 lines (1095 loc) · 43.3 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Handle generation of graal-compatible contact maps from fastq files.
cmdoret, 20190322
"""
import os, time, csv, sys, re
from datetime import datetime
from dateutil.relativedelta import relativedelta
import shutil as st
import itertools
from shutil import which
import logging
from os.path import join
import subprocess as sp
import gzip
from Bio import SeqIO
import cooler
import pandas as pd
import numpy as np
import pysam as ps
import hicstuff.digest as hcd
import hicstuff.iteralign as hci
import hicstuff.filter as hcf
import hicstuff.io as hio
import hicstuff.distance_law as hcdl
import hicstuff.cutsite as hcc
import hicstuff.stats as hcs
import matplotlib
import pathlib
from hicstuff.version import __version__
import hicstuff.log as hcl
from hicstuff.log import logger
def align_reads(
reads,
genome,
out_bam,
tmp_dir=None,
threads=1,
aligner="bowtie2",
iterative=False,
min_qual=30,
read_len=None,
):
"""
Select and call correct alignment method and generate logs accordingly.
Alignments are filtered so that there at most one alignment per read.
Alignments are sorted by read name in the output file.
Parameters
----------
reads : str
Path to the fastq file with Hi-C reads.
genome : str
Path to the genome bowtie2/bwa index prefix if using bowtie2 or bwa, or to the
fasta if using minimap2.
out_bam : str
Path to the output BAM file containing mapped Hi-C reads.
tmp_dir : str
Path where temporary files are stored.
threads : int
Number of threads to run alignments in parallel.
aligner : bool
Use minimap2 or bwa instead of bowtie2.
iterative : bool
Wether to use the iterative mapping procedure (truncating reads and
extending iteratively)
min_qual : int
Minimum mapping quality required to keep an alignment during iterative
mapping.
read_len : int
Maximum read length to expect in the fastq file. Optionally used in iterative
alignment mode. Estimated from the first read by default. Useful if input fastq
is a composite of different read lengths.
"""
if tmp_dir is None:
tmp_dir = os.getcwd()
tmp_bam = out_bam + ".tmp"
if iterative:
iter_tmp_dir = hio.generate_temp_dir(tmp_dir)
hci.iterative_align(
reads,
tmp_dir=iter_tmp_dir,
ref=genome,
n_cpu=threads,
bam_out=tmp_bam,
min_qual=min_qual,
aligner=aligner,
read_len=read_len,
)
st.rmtree(iter_tmp_dir)
sp.call(
"samtools view -F 2048 -h -@ {threads} -O BAM {tmp} -o {out}".format(
threads=threads, tmp=tmp_bam, out=out_bam
),
shell=True,
)
else:
if aligner == "minimap2":
map_cmd = "minimap2 -2 -t {threads} -ax sr {fasta} {fastq} > {sam}"
elif aligner == "bwa":
map_cmd = "bwa mem -t {threads} -v 1 {index} {fastq} > {sam}"
else:
map_cmd = "bowtie2 --very-sensitive-local -p {threads} -x {index} -U {fastq} > {sam}"
map_args = {
"threads": threads,
"sam": tmp_bam,
"fastq": reads,
"fasta": genome,
"index": genome,
}
sp.call(map_cmd.format(**map_args), shell=True)
# Remove supplementary alignments and sort reads by name
sp.call("samtools view -F 2048 -h -@ {threads} {tmp} | samtools sort -n -@ {threads} -o {out} -".format(
tmp=tmp_bam, threads=threads, out=out_bam
),
shell=True,
)
os.remove(tmp_bam)
def bam2pairs(bam1, bam2, out_pairs, info_contigs, min_qual=30):
"""
Make a .pairs file from two Hi-C bam files sorted by read names.
The Hi-C mates are matched by read identifier. Pairs where at least one
reads maps with MAPQ below min_qual threshold are discarded. Pairs are
sorted by readID and stored in upper triangle (first pair higher).
Parameters
----------
bam1 : str
Path to the name-sorted BAM file with aligned Hi-C forward reads.
bam2 : str
Path to the name-sorted BAM file with aligned Hi-C reverse reads.
out_pairs : str
Path to the output space-separated .pairs file with columns
readID, chr1 pos1 chr2 pos2 strand1 strand2
info_contigs : str
Path to the info contigs file, to get info on chromosome sizes and order.
min_qual : int
Minimum mapping quality required to keep a Hi-C pair.
"""
forward = ps.AlignmentFile(bam1, "rb")
reverse = ps.AlignmentFile(bam2, "rb")
# Generate header lines
format_version = "## pairs format v1.0\n"
sorting = "#sorted: readID\n"
cols = "#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n"
# Chromosome order will be identical in info_contigs and pair files
chroms = pd.read_csv(info_contigs, sep="\t").apply(
lambda x: "#chromsize: %s %d\n" % (x.contig, x.length), axis=1
)
with open(out_pairs, "w") as pairs:
pairs.writelines([format_version, sorting, cols] + chroms.tolist())
pairs_writer = csv.writer(pairs, delimiter="\t")
n_reads = {"total": 0, "mapped": 0}
# Remember if some read IDs were missing from either file
unmatched_reads = 0
# Remember if all reads in one bam file have been read
exhausted = [False, False]
# Iterate on both BAM simultaneously
end_regex = re.compile(r'/[12]$')
for end1, end2 in itertools.zip_longest(forward, reverse):
# Remove end-specific suffix if any
end1.query_name = re.sub(end_regex, '', end1.query_name)
end2.query_name = re.sub(end_regex, '', end2.query_name)
# Both file still have reads
# Check if reads pass filter
try:
end1_passed = end1.mapping_quality >= min_qual
# Happens if end1 bam file has been exhausted
except AttributeError:
exhausted[0] = True
end1_passed = False
try:
end2_passed = end2.mapping_quality >= min_qual
# Happens if end2 bam file has been exhausted
except AttributeError:
exhausted[1] = True
end2_passed = False
# Skip read if mate is not present until they match or reads
# have been exhausted
while sum(exhausted) == 0 and end1.query_name != end2.query_name:
# Get next read and check filters again
# Count single-read iteration
unmatched_reads += 1
n_reads["total"] += 1
if end1.query_name < end2.query_name:
try:
end1 = next(forward)
end1_passed = end1.mapping_quality >= min_qual
# If EOF is reached in BAM 1
except (StopIteration, AttributeError):
exhausted[0] = True
end1_passed = False
n_reads["mapped"] += end1_passed
elif end1.query_name > end2.query_name:
try:
end2 = next(reverse)
end2_passed = end2.mapping_quality >= min_qual
# If EOF is reached in BAM 2
except (StopIteration, AttributeError):
exhausted[1] = True
end2_passed = False
n_reads["mapped"] += end2_passed
# 2 reads processed per iteration, unless one file is exhausted
n_reads["total"] += 2 - sum(exhausted)
n_reads["mapped"] += sum([end1_passed, end2_passed])
# Keep only pairs where both reads have good quality
if end1_passed and end2_passed:
# Flipping to get upper triangle
if (
end1.reference_id == end2.reference_id
and end1.reference_start > end2.reference_start
) or end1.reference_id > end2.reference_id:
end1, end2 = end2, end1
pairs_writer.writerow(
[
end1.query_name,
end1.reference_name,
end1.reference_start + 1,
end2.reference_name,
end2.reference_start + 1,
"-" if end1.is_reverse else "+",
"-" if end2.is_reverse else "+",
]
)
pairs.close()
if unmatched_reads > 0:
logger.warning(
"%d reads were only present in one BAM file. Make sure you sorted reads by name before running the pipeline.",
unmatched_reads,
)
logger.info(
"{perc_map}% reads (single ends) mapped with Q >= {qual} ({mapped}/{total})".format(
total=n_reads["total"],
mapped=n_reads["mapped"],
perc_map=round(100 * n_reads["mapped"] / n_reads["total"]),
qual=min_qual,
)
)
def generate_log_header(log_path, input1, input2, genome, enzyme):
hcl.set_file_handler(log_path, formatter=logging.Formatter(""))
logger.info("## hicstuff: v%s log file", __version__)
logger.info("## date: %s", time.strftime("%Y-%m-%d %H:%M:%S"))
logger.info("## enzyme: %s", str(enzyme))
logger.info("## input1: %s ", input1)
logger.info("## input2: %s", input2)
logger.info("## ref: %s", genome)
logger.info("---")
hcl.set_file_handler(log_path, formatter=hcl.logfile_formatter)
def filter_pcr_dup(pairs_idx_file, filtered_file):
"""
Filter out PCR duplicates from a coordinate-sorted pairs file using
overrrepresented exact coordinates. If multiple fragments have two reads
with the exact same coordinates, only one of those fragments is kept.
Parameters
----------
pairs_idx_file : str
Path to an indexed pairs file containing the Hi-C reads.
filtered_file : str
Path to the output pairs file after removing duplicates.
"""
# Keep count of how many reads are filtered
filter_count = 0
reads_count = 0
# Store header lines
header = hio.get_pairs_header(pairs_idx_file)
with open(pairs_idx_file, "r") as pairs, open(filtered_file, "w") as filtered:
# Copy header lines to filtered file
for head_line in header:
filtered.write(head_line + "\n")
next(pairs)
# Use csv methods to easily access columns
paircols = [
"readID",
"chr1",
"pos1",
"chr2",
"pos2",
"strand1",
"strand2",
"frag1",
"frag2",
]
# Columns used for comparison of coordinates
coord_cols = [col for col in paircols if col != "readID"]
pair_reader = csv.DictReader(pairs, delimiter="\t", fieldnames=paircols)
filt_writer = csv.DictWriter(filtered, delimiter="\t", fieldnames=paircols)
# Initialise a variable to store coordinates of reads in previous pair
prev = {k: 0 for k in paircols}
for pair in pair_reader:
reads_count += 1
# If coordinates are the same as before, skip pair
if all(pair[pair_var] == prev[pair_var] for pair_var in coord_cols):
filter_count += 1
continue
# Else write pair and store new coordinates as previous
else:
filt_writer.writerow(pair)
prev = pair
logger.info(
"%d%% PCR duplicates have been filtered out (%d / %d pairs) "
% (100 * round(filter_count / reads_count, 3), filter_count, reads_count)
)
logger.info(
"%d pairs remaining after removing PCR duplicates", reads_count - filter_count
)
def pairs2cool(pairs_file, cool_file, bins_file, exclude):
"""
Make a cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
bins_file : str
Path to the file containing genomic segmentation information. (fragments_list.txt).
"""
# Exclude some chromosomes from bins
bins_tmp = bins_file + ".cooler"
bins = pd.read_csv(bins_file, sep="\t", usecols=[1, 2, 3], skiprows=1, header=None)
if exclude is not None:
bins = bins[~bins[1].isin(exclude.split(','))]
bins.to_csv(bins_tmp, sep="\t", header=False, index=False)
# Make cool
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5 {bins} {pairs} {cool}"
cool_args = {"bins": bins_tmp, "pairs": pairs_file, "cool": cool_file}
sp.call(cooler_cmd.format(**cool_args), shell=True)
os.remove(bins_tmp)
def pairs2binnedcool(pairs_file, cool_file, binning, info_contigs, exclude):
"""
Make a *binned* cooler file from the pairs file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
pairs_file : str
Path to the pairs file containing input contact data.
cool_file : str
Path to the output cool file name to generate.
binning : int
If mat_fmt is set to "cool", the cool file will be further binned to
this resolution.
info_contigs : pathlib.Path or str
The path to the contigs info file
"""
# Exclude some chromosomes from bins
chroms_tmp = info_contigs + ".chroms"
chroms = pd.read_csv(info_contigs, sep="\t", usecols=[0, 1], skiprows=1, header=None)
if exclude is not None:
chroms = chroms[~chroms[0].isin(exclude.split(','))]
chroms.to_csv(chroms_tmp, index=False, sep='\t', header=False)
# Run `cooler cload pairs`
cooler_cmd = "cooler cload pairs -c1 2 -p1 3 -p2 4 -c2 5".split(" ")
sp.call(cooler_cmd + [chroms_tmp+":"+str(binning), pairs_file, cool_file], shell=False)
os.remove(chroms_tmp)
def cool2mcool(cool_file, output_file):
"""
Zoomify a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
cool_file : str
Path to an existing binned .cool file.
output_file : str
Path to the new multi-resolution .mcool file.
"""
# Get resolutions
clr = cooler.Cooler(cool_file)
res = clr.binsize
multires = [res, res*2, res*5, res*10, res*20, res*50, res*100]
# Run cooler zoomify
cooler.zoomify_cooler(clr.filename, output_file, multires, chunksize=10000000)
def balance(cool_file, balancing_args):
"""
Balance a *binned* cooler file. See: https://github.com/mirnylab/cooler/ for more informations.
Parameters
----------
cool_file : str
Path to an existing binned .(m)cool file.
balancing_args : str
Extra rguments passed to `cooler balance` (default: None)
"""
# Balance
if (cooler.fileops.is_multires_file(cool_file)):
paths = cooler.fileops.list_coolers(cool_file)
for path in paths:
cooler_cmd = "cooler balance".split(" ")
if balancing_args is not None:
sp.call(cooler_cmd + balancing_args.split(" ") + [cool_file+"::"+path], shell=False)
else:
sp.call(cooler_cmd + [cool_file+"::"+path], shell=False)
else:
cooler_cmd = "cooler balance".split(" ")
if balancing_args is not None:
sp.call(cooler_cmd + balancing_args.split(" ") + [cool_file], shell=False)
else:
sp.call(cooler_cmd + [cool_file], shell=False)
def pairs2matrix(
pairs_file, mat_file, fragments_file, mat_fmt="graal", threads=1, tmp_dir=None
):
"""Generate the matrix by counting the number of occurences of each
combination of restriction fragments in a pairs file.
Parameters
----------
pairs_file : str
Path to a Hi-C pairs file, with frag1 and frag2 columns added.
mat_file : str
Path where the matrix will be written.
fragments_file : str
Path to the fragments_list.txt file. Used to know total
matrix size in case some observations are not observed at the end.
mat_fmt : str
The format to use when writing the matrix. Can be graal or bg2 format.
threads : int
Number of threads to use in parallel.
tmp_dir : str
Temporary directory for sorting files. If None given, will use the system default.
"""
# Number of fragments is N lines in frag list - 1 for the header
n_frags = sum(1 for line in open(fragments_file, "r")) - 1
frags = pd.read_csv(fragments_file, delimiter="\t")
def write_mat_entry(frag1, frag2, contacts):
"""Write a single sparse matrix entry in either graal or bg2 format"""
if mat_fmt == "graal":
mat.write("\t".join(map(str, [frag1, frag2, n_occ])) + "\n")
elif mat_fmt == "bg2":
frag1, frag2 = int(frag1), int(frag2)
mat.write(
"\t".join(
map(
str,
[
frags.chrom[frag1],
frags.start_pos[frag1],
frags.end_pos[frag1],
frags.chrom[frag2],
frags.start_pos[frag2],
frags.end_pos[frag2],
contacts,
],
)
)
+ "\n"
)
pre_mat_file = mat_file + ".pre.pairs"
hio.sort_pairs(
pairs_file,
pre_mat_file,
keys=["frag1", "frag2"],
threads=threads,
tmp_dir=tmp_dir,
)
header_size = len(hio.get_pairs_header(pre_mat_file))
with open(pre_mat_file, "r") as pairs, open(mat_file, "w") as mat:
# Skip header lines
for _ in range(header_size):
next(pairs)
prev_pair = ["0", "0"] # Pairs identified by [frag1, frag2]
n_occ = 0 # Number of occurences of each frag combination
n_nonzero = 0 # Total number of nonzero matrix entries
n_pairs = 0 # Total number of pairs entered in the matrix
pairs_reader = csv.reader(pairs, delimiter="\t")
# First line contains nrows, ncols and number of nonzero entries.
# Number of nonzero entries is unknown for now
if mat_fmt == "graal":
mat.write("\t".join(map(str, [n_frags, n_frags, "-"])) + "\n")
for pair in pairs_reader:
# Fragment ids are field 8 and 9
curr_pair = [pair[7], pair[8]]
# Increment number of occurences for fragment pair
if prev_pair == curr_pair:
n_occ += 1
# Write previous pair and start a new one
else:
if n_occ > 0:
write_mat_entry(prev_pair[0], prev_pair[1], n_occ)
prev_pair = curr_pair
n_pairs += n_occ
n_occ = 1
n_nonzero += 1
# Write the last value
write_mat_entry(curr_pair[0], curr_pair[1], n_occ)
n_nonzero += 1
n_pairs += 1
# Edit header line to fill number of nonzero entries inplace in graal header
if mat_fmt == "graal":
with open(mat_file) as mat, open(pre_mat_file, "w") as tmp_mat:
header = mat.readline()
header = header.replace("-", str(n_nonzero))
tmp_mat.write(header)
st.copyfileobj(mat, tmp_mat)
# Replace the matrix file with the one with corrected header
os.rename(pre_mat_file, mat_file)
else:
os.remove(pre_mat_file)
logger.info(
"%d pairs used to build a contact map of %d bins with %d nonzero entries.",
n_pairs,
n_frags,
n_nonzero,
)
def check_tool(name):
"""Check whether `name` is on PATH and marked as executable."""
return which(name) is not None
def full_pipeline(
genome,
input1,
input2=None,
aligner="bowtie2",
centromeres=None,
exclude=None,
circular=False,
distance_law=False,
enzyme=5000,
filter_events=False,
force=False,
mapping="normal",
mat_fmt="cool",
binning=0,
zoomify=True,
balancing_args=None,
min_qual=30,
min_size=0,
no_cleanup=False,
out_dir=None,
pcr_duplicates=False,
plot=False,
prefix=None,
read_len=None,
remove_centros=None,
start_stage="fastq",
threads=1,
tmp_dir=None,
):
"""
Run the whole hicstuff pipeline. Starting from fastq files and a genome to
obtain a contact matrix.
Parameters
----------
genome : str
Path to the bowtie2/bwa index prefix if using bowtie2/bwa or to the genome
in fasta format if using minimap2.
input1 : str
Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
in BAM format, or the pairs file, depending on the value of start_stage.
input2 : str
Path to the Hi-C reads in fastq format (forward), the aligned Hi-C reads
in BAM format, or None, depending on the value of start_stage.
binning : int
If mat_fmt is set to "cool", the cool file will be further binned to
this resolution.
zoomify : bool
Whether to zoomify binned cool matrix (only used if mat_fmt == "cool" and binning is set)
balancing_args : str
Arguments to pass to `chromosight balance` (default: None) (only used if zoomify == True)
enzyme : int or strtest_data/genome/seq.fa
circular : bool
Use if the genome is circular.
out_dir : str or None
Path where output files should be written. Current directory by default.
tmp_dir : str or None
Path where temporary files will be written. Creates a "tmp" folder in
out_dir by default.
plot : bool
Whether plots should be generated at different steps of the pipeline.
Plots are saved in a "plots" directory inside out_dir.
min_qual : int
Minimum mapping quality required to keep a pair of Hi-C reads.
min_size : int
Minimum contig size required to keep it.
threads : int
Number of threads to use for parallel operations.
no_cleanup : bool
Whether temporary files should be deleted at the end of the pipeline.
mapping : str
normal|iterative|cutsite. Use normal, iterative or cutsite mapping.
"normal": Normal alignement. "iterative": Truncates and extends reads
until unambiguous alignment. "cutsite": Digest reads at religation sites
and build new pairs from the fragments created.
filter_events : bool
Filter spurious or uninformative 3C events. Requires a restriction enzyme.
force : bool
If True, overwrite existing files with the same name as output.
prefix : str or None
Choose a common name for output files instead of default graal names.
start_stage : str
Step at which the pipeline should start. Can be "fastq", "bam", "pairs"
or "pairs_idx". With starting from bam allows to skip alignment and start
from named-sorted bam files. With
"pairs", a single pairs file is given as input, and with "pairs_idx", the
pairs in the input must already be attributed to fragments and fragment
attribution is skipped.
mat_fmt : str
Select the output matrix format. Can be either "bg2" for the
bedgraph2 format, "cool" for Mirnylab's cool format, or graal for a
plain text COO format compatible with Koszullab's instagraal software.
aligner : str
Read alignment software to use. Can be either "minimap2", "bwa" or "bowtie2".
pcr_duplicates : bool
If True, PCR duplicates will be filtered based on genomic positions.
Pairs where both reads have exactly the same coordinates are considered
duplicates and only one of those will be conserved.
distance_law : bool
If True, generates a distance law file with the values of the probabilities
to have a contact between two distances for each chromosomes or arms if the
file with the positions has been given. The values are not normalized, or
averaged.
centromeres : None or str
If not None, path of file with Positions of the centromeres separated by a
space and in the same order than the chromosomes.
exclude : None or str
If not None, the name of the chromosomes to remove (e.g. "2u,chrM")
read_len : int
Maximum read length to expect in the fastq file. Optionally used in iterative
alignment mode. Estimated from the first read by default. Useful if input fastq
is a composite of different read lengths.
remove_centros : None or int
If the distance law is computed, this is the number of kb that will be removed
around the centromere position given by in the centromere file.
"""
# Check if third parties can be run
if aligner in ("bowtie2", "minimap2", "bwa"):
if (not check_tool(aligner)) | (check_tool(aligner) is None):
logger.error("%s is not installed or not on PATH", aligner)
raise ImportError(f"{aligner} is required.")
else:
logger.error("Incompatible aligner software, choose bowtie2, minimap2 or bwa.")
raise ValueError("aligner should be either bowtie2, minimap2 or bwa.")
if (not check_tool("samtools")) | (check_tool("samtools") is None):
logger.error("samtools is not installed or not on PATH")
raise ImportError("samtools is required.")
if mat_fmt == 'cool':
try:
import cooler
except ImportError:
logger.error(
"The cooler package is require to return matrix in cool format, please install it first."
)
raise ImportError("The cooler package is required.")
# Pipeline can start from 3 input types
start_time = datetime.now()
stages = {"fastq": 0, "bam": 1, "pairs": 2, "pairs_idx": 3}
start_stage = stages[start_stage]
# Check if the number of input files is correct
if start_stage <= 1:
if input2 is None:
logger.error(
"You must provide 2 input files when --start-stage is fastq " "or bam."
)
sys.exit(1)
else:
if input2 is not None:
logger.error(
"You must provide a single input file when --start-stage is "
"pairs or pairs_idx."
)
sys.exit(1)
# sanitize enzyme
enzyme = str(enzyme)
# Remember whether fragments_file has been generated during this run
fragments_updated = False
if out_dir is None:
out_dir = os.getcwd()
if tmp_dir is None:
tmp_dir = join(out_dir, "tmp")
os.makedirs(out_dir, exist_ok=True)
os.makedirs(tmp_dir, exist_ok=True)
# Define figures output paths
if plot:
fig_dir = join(out_dir, "plots")
os.makedirs(fig_dir, exist_ok=True)
if prefix:
frag_plot = join(fig_dir, prefix + "_frags_hist.pdf")
dist_plot = join(fig_dir, prefix + "_event_distance.pdf")
pie_plot = join(fig_dir, prefix + "_event_distribution.pdf")
distance_law_plot = join(fig_dir, prefix + "_distance_law.pdf")
else:
frag_plot = join(fig_dir, "frags_hist.pdf")
dist_plot = join(fig_dir, "event_distance.pdf")
pie_plot = join(fig_dir, "event_distribution.pdf")
distance_law_plot = join(fig_dir, "distance_law.pdf")
matplotlib.use("Agg")
else:
fig_dir = None
dist_plot = pie_plot = frag_plot = None
# Use current time for logging and to identify files
now = time.strftime("%Y%m%d%H%M%S")
def _tmp_file(fname):
if prefix:
fname = prefix + "." + fname
full_path = join(tmp_dir, fname)
if not force and os.path.exists(full_path):
raise IOError(
"Temporary file {} already exists. Use --force to overwrite".format(
full_path
)
)
return full_path
def _out_file(fname):
if prefix:
fname = prefix + "." + fname
full_path = join(out_dir, fname)
if not force and os.path.exists(full_path):
raise IOError(
"Output file {} already exists. Use --force to overwrite".format(
full_path
)
)
return full_path
# Define temporary file names
log_file = _out_file("hicstuff_" + now + ".log")
tmp_genome = _tmp_file("genome.fa.gz")
bam1 = _tmp_file("for.bam")
bam2 = _tmp_file("rev.bam")
pairs = _tmp_file("valid.pairs")
pairs_idx = _tmp_file("valid_idx.pairs")
pairs_filtered = _tmp_file("valid_idx_filtered.pairs")
pairs_pcr = _tmp_file("valid_idx_pcrfree.pairs")
# Enable file logging
hcl.set_file_handler(log_file)
generate_log_header(log_file, input1, input2, genome, enzyme)
# If defautl `mat_fmt` or set `mat_fmt=cool`, notify the user
if mat_fmt == 'cool':
try:
import cooler
logger.info("The default output format is now `.cool`. The Hi-C "
"matrix will be generated with cooler v%s "
"(Abdennur & Mirny, Bioinformatics 2020).",
cooler.__version__
)
except: raise
# If the user chose bowtie2 and supplied an index, extract fasta from it
# For later steps of the pipeline (digestion / frag attribution)
# Check if the genome is an index or fasta file
idx = hio.check_fasta_index(genome, mode=aligner)
is_fasta = hio.check_is_fasta(genome)
# Different aligners accept different files. Make sure the input format is good.
# Note bowtie2 can extract fasta from the index, but bwa cannot
sane_input = {
'bowtie2': is_fasta or idx,
'minimap2': is_fasta,
'bwa': is_fasta
}
if not sane_input[aligner]:
logger.error("You must provide either a fasta or bowtie2 index prefix as genome")
# Just use the input genome if it is indexed
if is_fasta and idx:
fasta = genome
# Otherwise copy it in tmpdir (in compressed format) for indexing, unless the input is a
# bt2 index, in which case fasta will be extracted later from it.
else:
if is_fasta:
with hio.read_compressed(genome, 'rb') as src, gzip.open(tmp_genome, 'wb') as dst:
dst.writelines(src)
genome = tmp_genome
fasta = tmp_genome
# Bowtie2-specific feature: extract fasta from the index
if aligner == 'bowtie2' and not is_fasta:
# Index is present, extract fasta file from it and compress it
bt2fa = sp.Popen(
["bowtie2-inspect", genome],
stdout=sp.PIPE,
stderr=sp.PIPE,
)
_ = sp.run(['gzip', '-c'], stdin=bt2fa.stdout, stdout=open(tmp_genome, "w"))
_, bt2err = bt2fa.communicate()
# bowtie2-inspect still has return code 0 when crashing, need to
# actively look for error in stderr
if re.search(r"[Ee]rror", bt2err.decode()):
logger.error(bt2err)
logger.error(
"bowtie2-inspect has failed, make sure you provided "
"the path to the bowtie2 index without the extension."
)
sys.exit(1)
# Build index with bowtie2 / bwa if required
if idx is None and aligner in ['bowtie2', 'bwa']:
if aligner == 'bowtie2':
index_cmd = ["bowtie2-build", '-q', fasta, fasta]
elif aligner == 'bwa':
index_cmd = ['bwa', 'index', fasta]
# We only need the index if the user provided fastq input
if start_stage == 0:
# If no index present assume input is fasta, copy it in tmp and
# index it (to avoid conflict between instances)
logger.info(
"%s index not found at %s, generating "
"a local temporary index.", aligner, genome
)
sp.run(index_cmd, stderr=sp.PIPE)
# Check for spaces in fasta headers and issue error if found
for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
if " " in record.id:
logger.error(
"Sequence identifiers contain spaces. Please clean the input genome."
)
# Define output file names (tsv files)
if prefix:
fragments_list = _out_file("frags.tsv")
info_contigs = _out_file("chr.tsv")
mat = _out_file("mat.tsv")
# If matrix has a different format, give it the right extension
if mat_fmt != "graal":
mat = _out_file(mat_fmt)
else:
# Default graal file names
fragments_list = _out_file("fragments_list.txt")
info_contigs = _out_file("info_contigs.txt")
mat = _out_file("abs_fragments_contacts_weighted.txt")
if mat_fmt != "graal":
mat = _out_file("abs_fragments_contacts_weighted." + mat_fmt)
# Define what input files are given
if start_stage == 0:
reads1, reads2 = input1, input2
elif start_stage == 1:
bam1, bam2 = input1, input2
elif start_stage == 2:
pairs = input1
elif start_stage == 3:
pairs_idx = input1
# Perform genome alignment
nreads_input1 = 0
if start_stage == 0:
# Check number of reads in both fastqs
logger.info("Checking content of fastq files.")
nreads_input1 = hio.check_fastq_entries(reads1)
nreads_input2 = hio.check_fastq_entries(reads2)
if (nreads_input1 != nreads_input2):
logger.error("Fastq files do not have the same number of reads.")
else:
logger.info("{n} reads found in each fastq file.".format(n = int(nreads_input1)))
# Define mapping choice (default normal):
if mapping == "normal":
iterative = False
elif mapping == "iterative":
iterative = True
elif mapping == "cutsite":
# If no enzyme given use iterative alignment.
try:
int(enzyme)
logger.warning("No enzyme has been given. Can't map using cutsite, iterative mapping will be used instead.")
iterative = True
# If cutsite enabled and enzyme given, cut the reads before making a
# normal alignment.
except ValueError:
iterative = False
digest_for = _tmp_file("digest_for.fq.gz")
digest_rev = _tmp_file("digest_rev.fq.gz")
hcc.cut_ligation_sites(
fq_for=reads1,
fq_rev=reads2,
digest_for=digest_for,
digest_rev=digest_rev,
enzyme=enzyme,
mode="for_vs_rev",
seed_size=20,
n_cpu=threads,
)
reads1, reads2 = digest_for, digest_rev
else:
logger.error("mapping must be either normal, iterative or cutsite.")
raise ValueError
align_reads(
reads1,
genome,
bam1,
tmp_dir=tmp_dir,
threads=threads,
aligner=aligner,
iterative=iterative,
min_qual=min_qual,
read_len=read_len,
)
align_reads(
reads2,
genome,
bam2,
tmp_dir=tmp_dir,
threads=threads,
aligner=aligner,
iterative=iterative,
min_qual=min_qual,
read_len=read_len,
)
# Detect if multiple enzymes are given
if re.search(",", enzyme):
enzyme = enzyme.split(",")
# Starting from bam files
if start_stage <= 1:
# Check number of reads in both fastqs
if (bam1 == input1):
logger.info("Checking content of bam files.")
nreads_input1 = hio.check_bam_entries(bam1)
nreads_input2 = hio.check_bam_entries(bam2)
if (nreads_input1 != nreads_input2):
logger.error("Bam files do not have the same number of reads.")
else:
logger.info("{n} reads found in each bam file.".format(n = nreads_input1))
fragments_updated = True
# Generate info_contigs and fragments_list output files
hcd.write_frag_info(
fasta,
enzyme,
min_size=min_size,
circular=circular,
output_contigs=info_contigs,
output_frags=fragments_list,
)
# Log fragment size distribution
hcd.frag_len(frags_file_name=fragments_list, plot=plot, fig_path=frag_plot)
# Make pairs file (readID, chr1, chr2, pos1, pos2, strand1, strand2)
bam2pairs(bam1, bam2, pairs, info_contigs, min_qual=min_qual)
# Starting from pairs file
if start_stage <= 2:
restrict_table = {}
for record in SeqIO.parse(hio.read_compressed(fasta), "fasta"):
# Get chromosome restriction table
restrict_table[record.id] = hcd.get_restriction_table(
record.seq, enzyme, circular=circular
)
# Add fragment index to pairs (readID, chr1, pos1, chr2,