-
Notifications
You must be signed in to change notification settings - Fork 1
/
lsd_high_sumstats_calculator_OBS.py
565 lines (492 loc) · 40.8 KB
/
lsd_high_sumstats_calculator_OBS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
#!/usr/bin/python
# lsd_high_sumstats_calculator_OBS version
version = "lsd_high_sumstats_calculator_OBS version: 30_03_2020"
############################################## ALLELE FREQUENCIES CONVERTER (OBSERVED) AND SUMSTATS CALCULATOR ##############################################
### NOTES AND EXAMPLES ###
# This is the main (master) function that runs the BAM2AlleleFreqs (observed data) converter and summary statistics calculator functions. Use this to run analyses for the observed data #
# It takes as input a list of mpileup files (produced from BAM files via: samtools mpileup example.bam) and produces an output of summary statistics #
# BAM file from which mpileup file is produced must be sorted (samtools sort -T /tmp/aln.sorted -o aln.sorted.bam aln.bam) #
# EXTENSION: This version has been extended to produce sumstat files for multiple windows in a scaffold.
## EXAMPLE USAGE
# cd /Users/luqman/Documents/Hirzi/ETHPHD/Analysis/PoolSimulator
# Example usage (GENOME-WIDE, NEUTRAL REGIONS, 5000kbMakerFlanked-1000kbOtherFlanked, anchored scaffolds > 50kb): python lsd_high_sumstats_calculator_OBS.py genomeWide_5000_1000_anchored50_filelist.txt -d 40 -d 40 -d 40 -d 40 -d 40 -d 40 -q 0 -m 2 -o GW_Neutral_5000_1000_anchored_50kb -f ABC- -r multiple --mindepth 10 --maxdepth 200 --windowSize 5000 --scaffold --pooled
# Example usage (ALL (FULLY) NEUTRAL SCAFFOLDS, CHR 1): python lsd_high_sumstats_calculator_OBS.py Chr1_filelist.txt -d 40 -d 40 -d 40 -d 40 -d 40 -d 40 -q 0 -m 2 -o Chr1 -f ABC- -r multiple --windowSize 5000 --mindepth 10 --maxdepth 100 --scaffold --pooled
# Example usage (ALL (FULLY) NEUTRAL SCAFFOLDS, CHR 13): python lsd_high_sumstats_calculator_OBS.py Chr13_filelist.txt -d 40 -d 40 -d 40 -d 40 -d 40 -d 40 -q 0 -m 2 -o Chr13 -f ABC- -r multiple --windowSize 5000 --mindepth 10 --maxdepth 100 --scaffold --pooled
# Example usage using command line input: python lsd_high_sumstats_calculator_OBS.py scaffold4_size532381_filelist.txt -d 40 -d 40 -d 40 -d 40 -d 40 -d 40 -q 20 -m 1 -o scaffold4_size532381_minDP10maxDP200 -f ABC --startPos 1 --endPos 532381 --mindepth 10 --maxdepth 200 --windowSize 10000 -r single --pooled
# Example usage using command line input: python lsd_high_sumstats_calculator_OBS.py scaffold1_size1318325_1_614794_filelist.txt -d 40 -d 40 -d 40 -d 40 -d 40 -d 40 -q 20 -m 1 -o scaffold1_size1318325_1_614794_minDP10maxDP200 -f ABC -r single --mindepth 10 --maxdepth 200 --pooled
# Example usage, individual (non-pooled) data, with 6 individuals per population: python lsd_high_sumstats_calculator_OBS.py test_6individuals_pop.filelist -d 6 -d 6 -q 20 -m 1 -o scaffold4_size532381_test_individuals_pop -f ABC -r single
# Example usage, individual (non-pooled) data, with 6 individuals per population (10kb windows): python lsd_high_sumstats_calculator_OBS.py test_6individuals_pop.filelist -d 6 -d 6 -q 20 -m 1 -o scaffold4_size532381_test_individuals_pop_windows -f ABC --startPos 1 --endPos 532381 --windowSize 10000 -r single
# Script won't work with < 6 individuals per pop (unpolarised data) as is because it by default outputs variant frequencies up till the tripleton class. Either change outputted frequency classes, or ignore if you anyway won't work with such small sample sizes.
# Script won't work with < 4 individuals per pop Tajima's D require at least this number of individuals per population.
## NOTES
# Note 0: A window size of 0 (default) will calculate the allele frequencies and summary statistics for the region defined between the start_position and the end_position. A window size > 0 will calculate allele frequencies and summary statistics on a per window basis.
# Note 1: Here, the minimum allele count is defined as follows: If allele frequency is greater than minimum allele count, then we keep. If equal to or below minimum allele count, we discard. E.g. a minimum allele count = 2 means any alleles with counts of 2 or below are discarded (need a minimum of 3 to keep).
# Note 2: The quality filter removes reads with Phred quality scores of less than or equal to the defined tag. E.g. -q 20 removes all reads with quality scores less than and equal to 20.
### Note X: If you recieve the error: "ValueError: could not broadcast input array from shape (i) into shape (j)", where i and j are numbers, this points to lines 163-167 in the BAM2AlleleFreqs.py script. There appears to be a bug in pandas, when outputting data of a certain shape into a new column. See: https://github.com/pandas-dev/pandas/pull/18577
## NOTES: SAMPLE REQUIREMENTS
# Note 1: Tajima's D requires there to be >= 4 individuals per population.
# Note 2: Variant class frequencies (from the SFS, e.g. singletons, doubletons, tripletons etc) require n individuals (polarised data) or 2n individuals (polarised data) per population for calculation, where n is the variant class frequency desired. By default, only doubletons and tripletons are outputted (singletons are by default excluded because they are too sensitive to error). Adjust accordingly depending on needs.
# Note 3: Negative FST and DXY values are collapsed to 0.
# Note 4: In the case of two populations, private S will always be the same. This is because a site that is private in pop1 must necessarily be private in pop2.
## We import the necessary modules
import argparse
import os
import pandas
import BAM2AlleleFreqs
import sumstatscalc
#import csv
#import glob
#import numpy
from datetime import datetime
# os.chdir("/Users/luqman/Documents/Hirzi/ETHPHD/Analysis/PoolSimulator")
#%%
### PARSE ARGUMENTS ###
## We parse arguments from the command line using argparse
parser = argparse.ArgumentParser(description="LSD formatter and summary statistics calculator for observed sequence data (version 0.1, Luqman 2020)", add_help=True)
# We allow for two input options:
"""
# 1. We parse arguments from a supplied parameter file:
if glob.glob("/Users/luqman/Documents/Hirzi/ETHPHD/Analysis/PoolSimulator/*.params"): # Glob will return a list of files matching the wildcard-accessible path. If there are no files, it will return an empty list. This is really just os.listdir and fnmatch.filter together.
parser.add_argument("paramfile", action="store", help="Parameters may either be entered in the command line as arguments or provided in a separate parameter file. If a .params file is provided in the working directory, this will be used. If no parameter file provided in the working directory, retype -h for full argument options (full command-line argument options are only shown when no .params file exists in the working path)")
parser.add_argument("-o", action="store", default="output", dest="output", help="prefix for output files")
args = parser.parse_args()
paramfile = args.paramfile
with open(paramfile) as paramfile: # We parse arguments from from the input parameter file
reader = list(csv.reader(paramfile, delimiter="\t"))
arg_dict = {}
for row in reader:
arg_dict[row[0]] = row[1]
## We associate the parsed arguments with our global variables (defined in the functions below)
filename = arg_dict["filename"]
# no_iterations = int(arg_dict["no_iterations"])
pop_size = [int(x) for x in arg_dict["pop_size"].replace(" ","").split(",")]
span = arg_dict["across"]
plotSFS = bool(arg_dict["plotSFS"] == "false")
plotPi = bool(arg_dict["plotPi"] == "false")
output_prefix = arg_dict["output"]
output_format = arg_dict["output_format"]
quality_filter = int(arg_dict["quality_filter"])
min_allele_count = int(arg_dict["min_allele_count"])
start_position = int(arg_dict["start_position"])
end_position = int(arg_dict["end_position"])
"""
# 2. Or we parse arguments directly from the command line:
#else:
parser.add_argument("filename", action="store", help="The input file, which is a list of mpileup files.")
#parser.add_argument("-n", action="store", dest="nrep", default=1, type=int, help="The number of independent iterations (alternatively, can be interpreted as the number of loci simulated). N/A for observed data")
parser.add_argument("-d", action="append", required=True, dest="demesizes", help="The sample sizes of the different populations.")
parser.add_argument("-a", action="store", dest="across", default="segsites", help="Define whether to calculate Fst summary statistics across only segregating sites (segsites) or across all sites (allsites).")
parser.add_argument("--polarised", action="store_true", default=False, dest="polarised", help="Define whether to consider the simulated sequences as polarised (ancestral:0 - derived:1) or unpolarised (major:most frequent allele - minor: 2nd most frequent allele).")
parser.add_argument("--pooled", action="store_true", default=False, dest="pooled", help="Define whether data (BAM, mpileup files) is individual data or pooled data.")
parser.add_argument("--error_method", action="store", dest="error_method", default="4", help="Error calculation method (five options: 0, 1, 2, 3 and 4). Error method 0 doesn't account for error, only for stochastic sampling bias from the pooling process. Error methods 1, 2, 3 and 4 account for stochastic sampling bias from the pooling process in addition to randomly including error at each read per site at (probability) error_rate. Error method 1: If the error leads to a change to alleles 0 or 1, the read with the error is retained and appended to the total counts. If the error leads to inobservable alleles 2 and 3, the reads are discarded (no sites discarded in this method). Error method 2: All reads are retained, regardless of whether the error leads to alleles 0, 1, 2 or 3. If allele 2 and/or 3 is above the minimum allele count, we consider it a triallelic site and discard the site. If they are equal to or below the minimum allele count, and allele 1 is above the minimum allele count, we consider the site biallelic (for alleles 0 and 1) , and retain the original coverage (no reads are discarded in this method). Error method 3: All reads are retained, regardless of whether the error leads to alleles 0, 1, 2 or 3. This method counts the two most frequent alleles (the ancestral + the most frequent minor allele between allele 1, allele 2 and allele 3), given that the 1) most frequent minor allele is > the minumum allele count, and 2) the the most frequent minor allele is NOT equal to the second most frequent minor allele (otherwise site is considered triallelic and discarded). If these two points are satisfied, the two least frequent alleles are ignored and we consider the site biallelic, whilst retaining the original coverage (no reads are discarded here). Error method 4: An approximation of error method 3, which results in approximately 5x shorter runtimes. The approximation assumes that all retained erroneous sites have (min_allele_count + 1) number of errors. Sites with less number of errors are filtered by the minimum allele criterion, sites with more errors are exponentially much less likely, and to the closest approximation, results in negligable number of entries.")
parser.add_argument("--plotSFS", action="store_true", default=False, dest="plotSFS", help="To output plots of site frequency spectrum (one for each population).")
parser.add_argument("--plotPi", action="store_true", default=False, dest="plotPi", help="To output plots of nucleotide diversity, pi, across site positions (one for each population).")
parser.add_argument("-o", action="store", default="summary_stats_temp.txt", dest="output", help="Prefix for output files.")
parser.add_argument("-f", action="store", default="ABC", dest="output_format", help="Format of output file. 'Full' outputs a convenient, human-readable summary output while 'ABC' outputs the file in the format required for ABCToolbox.")
parser.add_argument("-q", action="store", dest="quality_filter", default=0, type=int, help="Remove reads with PHRED quality-score less than this value.")
parser.add_argument("-m", action="store", dest="min_allele_count", default=0, type=int, help="Apply a minimum allele count filter.")
parser.add_argument("-r", action="store", dest="region", help="Select 'single' when calculating summary statistics for a single region or scaffold, or 'multiple' when calculating summary statistics for multiple scaffolds/regions. The 'single' option allows defining custom sub-regions within a scaffold (e.g. between sites 100 to 90000); the 'multiple' option analyses all sites within all given regions/scaffolds.")
parser.add_argument("--mindepth", action="store", dest="min_cov", default=0, type=int, help="Minimum depth filter")
parser.add_argument("--maxdepth", action="store", dest="max_cov", default=999, type=int, help="Maximum depth filter")
parser.add_argument("--startPos", action="store", dest="start_position", default=0, type=int, help="The start position (bp) of the region of interest. Only called (is mandatory) when program is run for a single region; ignored if running for multiple regions, as in this case, all sites of all regions are analysed.")
parser.add_argument("--endPos", action="store", dest="end_position", default=0, type=int, help="The end position (bp) of the region of interest. Only called (is mandatory) when program is run for a single region; ignored if running for multiple regions, as in this case, all sites of all regions are analysed.")
parser.add_argument("--windowSize", action="store", dest="window_size", default=0, type=int, help="Splits the scaffold into windows. Summary statistics will be calculated per window.")
parser.add_argument("--windowStep", action="store", dest="window_step", default=None, type=int, help="Defines the length (number of sites) till the subsequent window. If window_step < window_size, windows will be overlapping; if window_step = window_size, windows will be non-overlapping and adjacent; if window_step > window_size, windows will be non-overlapping with gaps equal to the difference.")
#parser.add_argument("--windowMissing", action="store", dest="missing_data_filter", default=0.5, type=float, help="(Scaffold) windows with a certain fraction of missing sites will be excluded from calculation of summary statistics.")
parser.add_argument("--baseJump", action="store", dest="base_jump", default=100, type=int, help="When sites are missing in the scaffold (for defining windows), this parameter defines the jump size to the next available site.")
parser.add_argument("--scaffold", action="store_true", default=False, dest="scaffold_specific", help="If set as true, calculates allele frequencies and summary statistics per scaffold (scaffolds not merged).")
args = parser.parse_args()
## We associate the parsed arguments with our global variables (defined in the functions below)
filename = args.filename
#no_iterations = args.nrep
pop_size = [int(x) for x in args.demesizes]
span = args.across
polarised = args.polarised
pooled = args.pooled
error_method = args.error_method
plotSFS = args.plotSFS
plotPi = args.plotPi
output_prefix = args.output
output_format = args.output_format
quality_filter = args.quality_filter
min_allele_count = args.min_allele_count
region = args.region
min_cov = args.min_cov
max_cov = args.max_cov
start_position = args.start_position
end_position = args.end_position
window_size = args.window_size
window_step = args.window_step
#missing_data_filter = args.missing_data_filter
base_jump = args.base_jump
scaffold_specific = args.scaffold_specific
# We define the indirect arguments:
scaffold_seq_length = end_position - start_position
no_pops = len(pop_size)
## To test if this works:
print("Filename is: " + str(filename))
#print("Number of iterations is: " +str(no_iterations))
print("List of respective population sample sizes are: " + str(pop_size))
print("Summary statistics will be calculated for " + str(region) + " region(s)/scaffold(s)")
if scaffold_specific == True:
print("Summary statistics will be calculated per scaffold (scaffolds will not be merged)")
if region == "single" and start_position != 0 and end_position != 0:
print("Summary statistics will be calculated for the region between " + str(start_position) + "bp and " + str(end_position) + "bp")
print("The length of the scaffold sequence is: " + str(scaffold_seq_length) + "bp")
if region == "single" and start_position == 0 and end_position == 0:
print("Summary statistics will be calculated for the whole scaffold" )
if window_size > 0:
print("Window size is " + str(window_size) + "bp")
print("Summary statistics are calculated per " + str(window_size) + "bp windows")
print("Windows are generated in " + str(window_step) + "bp steps")
print("Remove reads of quality < " + str(quality_filter))
print("Error method: " + str(error_method))
print("The observed data is polarised: " + str(polarised))
print("The observed data is pooled: " + str(pooled))
print("Applying a filter on minimum allele count. Alleles with counts </= " + str(min_allele_count) + " will be removed")
print("Minimum depth filter: " + str(min_cov))
print("Maximum depth filter: " + str(max_cov))
#print("Fst and Dxy statistics are calculated across " + str(span))
print("Plots of SFS are produced: " + str(plotSFS))
print("Plots of pi are produced: " + str(plotPi))
print("Prefix for output files is: " + str(output_prefix))
print("Output format is: " + str(output_format))
#%%
### DEFINE THE FUNCTIONS ###
# This function returns a set (list) of the common sites, among all populations
def shared_sites(file_list, region, no_pops, pooled, scaffold_specific, list_of_scaffolds=[]):
with open(file_list) as f:
file_list_e = f.read().splitlines()
if pooled == True:
filelist = file_list_e
elif pooled == False:
filelist_temp = [x.split(",") for x in file_list_e]
filelist = [item for sublist in filelist_temp for item in sublist]
# Define empty list and dicts
positions_master=[]
master={}
# Read as pandas dataframe the mpileup files
for count,file in enumerate(filelist):
# print("Reading file: " + str(file))
test_df = pandas.read_csv(file, sep='\t', header=None)
test_df.columns = ['scaffold', 'position_original', 'reference', 'depth', 'reads', 'quality']
#pop_df = calc_allele_freqs_mpileup(file, start_position, end_position, quality_filter)
if region == "multiple":
test_df["position"] = test_df["scaffold"].astype(str) + "_pos" + test_df["position_original"].astype(str)
cols = ['scaffold', 'position_original', 'position', 'reference', 'depth', 'reads', 'quality']
test_df = test_df[cols]
elif region == "single":
test_df.rename(columns = {'position_original':'position'}, inplace = True)
master["Pop"+str(count+1)] = test_df
positions = set(list(test_df["position"]))
positions_master.append(positions)
# Calculate the common (intersection of) sites between the 6 populations
across_pop_sites = list(set.intersection(*positions_master)) # This finds the intersection of a list of sets, where *a_list is list expansion
across_pop_sites.sort()
if region == "single":
print("Number of common base pairs (across all populations) in scaffold is: " + str(len(across_pop_sites)))
return across_pop_sites
if region == "multiple" and scaffold_specific == False:
across_pop_sites = list(range(1,len(across_pop_sites)+1)) # This line follows that we have added and defined the unique position index: "position_reordered" = numpy.asarray(list(range(1,len(popall_df)+1))) in the dataframe, following the same intersection step. Hence we can match according to this index list.
print("Number of common base pairs (across all populations) in scaffold is: " + str(len(across_pop_sites)))
return across_pop_sites
elif region == "multiple" and scaffold_specific == True:
# Produce a dataframe of scaffold, position_original and position by trimming the mpileup files to retain only the common (intersected) sites
master_trimmed={}
for pop in range(no_pops):
pop_df = master["Pop"+str(pop+1)]
pop_df_trimmed_raw = pop_df[pop_df["position"].isin(across_pop_sites)]
pop_df_trimmed = pop_df_trimmed_raw.reset_index(drop=True)
master_trimmed["Pop"+str(pop+1)] = pop_df_trimmed
# Remember, here we just need the intersecting positions (which is identical and included in all population dataframes), so we can just choose 1 population
common_sites_df = master_trimmed["Pop1"]
# Drop unnecessary columns
common_sites_df.drop("reads", axis=1, inplace=True)
common_sites_df.drop("quality", axis=1, inplace=True)
common_sites_df.drop("depth", axis=1, inplace=True)
common_sites_df.drop("reference", axis=1, inplace=True)
# common_sites_df_copy = common_sites_df.copy()
# Index by scaffold, to allow easy partioning of dataframe by scaffold later
common_sites_df.set_index(keys=['scaffold'], drop=False,inplace=True)
# names = common_sites_df['scaffold'].unique().tolist() # to avoid bugs (e.g. in the case that there is a mismatch in the scaffold list of across_pop_sites_per_scaffold and allele_freqs_df_scaffold, which may arise in the case that some scaffolds have no variant sites), we loop and extract only over scaffolds which have variant sites (taken from the allele_freqs_df_scaffold scaffold list)
# Partition dataframe by scaffold
common_sites_df_list_scaffold_partitions = []
for scaffold in list_of_scaffolds: # this names_scaffols is taken from names_scaffolds = allele_freqs_df['scaffold'].unique().tolist()
# print(scaffold)
common_sites_df_list_scaffold_partitions.append(common_sites_df.loc[common_sites_df.scaffold==str(scaffold)])
# Produce a LIST of across pop sites per scaffold
across_pop_sites_per_scaffold_list = []
for i in common_sites_df_list_scaffold_partitions:
across_pop_sites_per_scaffold = list(i["position_original"])
across_pop_sites_per_scaffold_list.append(across_pop_sites_per_scaffold)
return across_pop_sites_per_scaffold_list
# NOTE, THIS FUNCTION HAS BEEN DEPRECIATED AND REPLACED BY THE FUNCTION BELOW. This function returns the start and end positions of the desired windows, defining FIXED regions (e.g. if 10k windows: 1-10000, 10001-20000, 20001 - 30000, ...etc. ), given a defined start position, end position, and window size. Tolerance for amount of missing data can be set *NOTE if this tolerance is <1, different summary statistics will be affected differently (e.g. S, watteson's theta correlate directly with length of sequence, while e.g. pi, Fst, etc. are divided over sequence length)
# Note 0: This function hasn't been updated to work with multiple regions/scaffolds.
# Note 1: positions are just that positions, NOT python indexes, and thus they start from 1.
# Note 2: the start_position and end_position arguments are the start and end positions/sites of the e.g. scaffold.
#def make_windows_fixedRegions(start_position, end_position, window_size, missing_data_filter, base_jump):
# positions={}
# # Make list of start and end positions, based on window size and region of interest
# list_start_position = []
# list_end_position = []
# while start_position + window_size <= end_position: # or <
# list_start_position.append(start_position)
# start_position = start_position + window_size
# list_end_position.append(start_position - 1)
# # Account for the cases where start position is missing (by searching for the next available starting position)
# list_start_position_data_raw =[]
# for start_position in list_start_position:
# if start_position in across_pop_sites:
# list_start_position_data_raw.append(start_position)
# else:
# while start_position not in across_pop_sites:
# start_position = start_position + base_jump
# list_start_position_data_raw.append(start_position)
# # Account for the cases where end position is missing (by searching for the previous available ending position)
# list_end_position_data_raw =[]
# for end_position in list_end_position:
# if end_position in across_pop_sites:
# list_end_position_data_raw.append(end_position)
# else:
# while end_position not in across_pop_sites:
# end_position = end_position - base_jump
# list_end_position_data_raw.append(end_position)
# # Remove windows with no or little (defined by missing_data_filter tag) data.
# size_end_start_list = []
# for i,j in zip(list_start_position_data_raw,list_end_position_data_raw):
# size_end_start_list.append(len(across_pop_sites[i:j]))
# size_end_start = numpy.array(size_end_start_list)
# gaps_to_remove = numpy.where(size_end_start < (window_size * missing_data_filter))[0]
# # Remove start and end positions which define the windows (gaps) in scaffold with insufficient data
# list_start_position_data = []
# for index, start_pos in enumerate(list_start_position_data_raw):
# if index not in gaps_to_remove:
# list_start_position_data.append(start_pos)
# list_end_position_data = []
# for index, end_pos in enumerate(list_end_position_data_raw):
# if index not in gaps_to_remove:
# list_end_position_data.append(end_pos)
# # Check output and return
# if len(list_start_position_data) != len(list_end_position_data):
# print("Error: Different number of start and end positions")
# positions["start_positions"] = list_start_position_data
# positions["end_positions"] = list_end_position_data
# return positions
# This function returns the start and end positions of the desired windows, defining windows of FIXED length (e.g. if 10k windows: the 1st 10000 sites, the 2nd 10000 sites, the 3rd 10000 sites, etc), given a defined start position, end position, and window size.
# Note 1: positions are just that positions, NOT python indexes, and thus they start from 1.
# Note 2: the start_index and end_index arguments are the start and end indexes of the total number of shared sites.
# Note 3: window_step is defined as the length (number of sites) till the subsequent window. If window_step < window_size, windows will be overlapping, if window_step = window_size, windows will be non-overlapping and adjacent, if window_step > window_size, windows will be non-overlapping with gaps equa to the difference.
def make_windows_fixedLength(across_pop_sites, region, start_position, end_position, start_index, end_index, window_size, window_step = None):
positions = {}
list_start_indexes = []
list_end_indexes = []
while start_index + window_size <= end_index:
list_start_indexes.append(start_index)
if window_step != None:
final_index = start_index + window_size
list_end_indexes.append(final_index)
start_index = start_index + window_step
elif window_step == None:
start_index = start_index + window_size
list_end_indexes.append(start_index)
list_start_positions_all = []
for index in list_start_indexes:
start_pos = across_pop_sites[index]
list_start_positions_all.append(start_pos)
list_end_positions_all = []
for index in list_end_indexes:
end_pos = across_pop_sites[index]
list_end_positions_all.append(end_pos)
if region == "single":
list_start_positions_trim_start = [x for x in list_start_positions_all if x >= start_position]
num_trimmed_start = len(list_start_positions_all) - len(list_start_positions_trim_start)
list_end_positions_trim_start = list_end_positions_all[num_trimmed_start:]
list_end_positions_trim_start_end = [x for x in list_end_positions_trim_start if x <= end_position]
num_trimmed_end = len(list_end_positions_trim_start) - len(list_end_positions_trim_start_end)
if num_trimmed_end == 0:
list_start_positions_trim_start_end = list_start_positions_trim_start
elif num_trimmed_end > 0:
list_start_positions_trim_start_end = list_start_positions_trim_start[:(-num_trimmed_end)]
positions["start_positions"] = list_start_positions_trim_start_end
positions["end_positions"] = list_end_positions_trim_start_end
elif region == "multiple":
positions["start_positions"] = list_start_positions_all
positions["end_positions"] = list_end_positions_all
return positions
#%%
### HERE IS THE MAIN FUNCTION ###
## Finally, we run the BAM2AlleleFreqs and sumstats_calc functions:
if __name__ == "__main__":
startTime = datetime.now()
print(version)
if window_size == 0:
# Obtain the set (list) of the common sites, among all populations
across_pop_sites_all = shared_sites(file_list = filename, region = region, no_pops = no_pops, pooled = pooled, scaffold_specific = False)
# Calculates sequence length from which to calculate summary statistics
if start_position != 0 and end_position != 0:
start_pos_index = across_pop_sites_all.index(start_position)
end_pos_index = across_pop_sites_all.index(end_position)
seq_length = len(across_pop_sites_all[start_pos_index:end_pos_index])
elif start_position == 0 and end_position == 0:
start_position = across_pop_sites_all[0]
end_position = across_pop_sites_all[-1]
seq_length = len(across_pop_sites_all)
# Generate allele frequency dataframe for selected sites, from a list of mpileup files
allele_freqs_df = BAM2AlleleFreqs.calc_pooled_allele_freqs_mpileup(file_list = filename, no_pops = no_pops, pooled = pooled, quality_filter = quality_filter, min_allele_count = min_allele_count, error_method = error_method, region = region, pop_size = pop_size, min_cov = min_cov, max_cov = max_cov, start_position = start_position, end_position = end_position)
if scaffold_specific == False:
if region == "single":
print("Calculating allele frequencies and summary statistics for scaffold region: " + str(start_position) + "bp - " + str(end_position) + "bp")
elif region == "multiple":
print("Calculating allele frequencies and summary statistics for multiple (merged) scaffolds, of sum total length of: " + str(seq_length) + "bp")
# Convert allele frequencies dataframe into input format for LSD-High (lsd_high)
raw_allele_freqs = BAM2AlleleFreqs.mpileup2lsd(allele_freqs_df = allele_freqs_df, no_pops = no_pops, region = region, window_size = window_size, start_position = start_position, end_position = end_position) # note here, as we want, start and end position arguments are not called, since window == 0 always in this clause. Region selection already performed in above step.
# Calculate the summary statistics
sumstatscalc.sumstats_calc (raw_allele_freqs, pop_size = pop_size, seq_length = seq_length, span = span, polarised = polarised, output = output_format, plotSFS = plotSFS, plotPi = plotPi)
# Output summary statistic files
if region == "single":
os.rename("summary_stats_temp.txt", output_prefix + "_" + str(start_position) + "_" + str(end_position) + "_M" + str(min_allele_count) + "_Q" + str(quality_filter) + ".obs")
elif region == "multiple":
os.rename("summary_stats_temp.txt", output_prefix + "_allRegions" + "_M" + str(min_allele_count) + "_Q" + str(quality_filter) + ".obs")
elif scaffold_specific == True:
# First we split dataframe into multiple scaffold-specific dataframes
allele_freqs_df.set_index(keys=['scaffold'], drop=False,inplace=True)
names_scaffolds = allele_freqs_df['scaffold'].unique().tolist()
# We can then loop over the scaffold-specific dataframes
allele_freqs_df_per_scaffold = []
for scaffold in names_scaffolds:
# print(scaffold)
allele_freqs_df_per_scaffold.append(allele_freqs_df.loc[allele_freqs_df.scaffold==str(scaffold)])
# Obtain the set (list) of the common sites, among all populations, per scaffold
across_pop_sites_per_scaffold = shared_sites(file_list = filename, region = region, no_pops = no_pops, pooled = pooled, scaffold_specific = True, list_of_scaffolds=names_scaffolds)
# Then we loop (the sumstats calculation, in windows) over all scaffolds
output_names = []
for i in range(len(across_pop_sites_per_scaffold)):
# Define start and end indexes and positions per scaffold, as well as the sequence length from which to calculate summary statistics.
start_index = 0
end_index = len(across_pop_sites_per_scaffold[i]) - 1
start_position = across_pop_sites_per_scaffold[i][0]
end_position = across_pop_sites_per_scaffold[i][-1]
seq_length = len(across_pop_sites_per_scaffold[i])
allele_freqs_df_scaffold = allele_freqs_df_per_scaffold[i]
print("Calculating allele frequencies and summary statistics for scaffold: " + str(names_scaffolds[i]) + "...")
# Select window region from allele frequency dataframe and convert this into input format for LSD-High (lsd_high). Define region = "single" since we're calculating on a PER scaffold basis.
raw_allele_freqs = BAM2AlleleFreqs.mpileup2lsd(allele_freqs_df = allele_freqs_df_scaffold, no_pops = no_pops, region = "single", window_size = window_size, start_position = start_position, end_position = end_position)
# Calculate the summary statistics
sumstatscalc.sumstats_calc (raw_allele_freqs, pop_size = pop_size, seq_length = seq_length, span = span, polarised = polarised, output = output_format, plotSFS = plotSFS, plotPi = plotPi)
# Output summary statistic files
output_name = output_prefix + "_" + names_scaffolds[i] + ".obs"
output_names.append(output_name)
os.rename("summary_stats_temp.txt", output_name)
# Should you wish to output the sumstats of the multiple scaffolds to the same file (instead of one output file per region), you can include the code below. Default setting as is.
if region == "multiple":
with open (output_prefix + "_start" + str(across_pop_sites_all[0]) + "_end" + str(across_pop_sites_all[-1]) + "_M" + str(min_allele_count) + "_Q" + str(quality_filter) + ".obs", "w") as master_file:
for file in output_names[:1]:
f = open(file, "r")
master_file.write(f.read())
master_file.writelines("\n")
f.close
os.remove(file)
for file in output_names[1:]:
f = open(file, "r")
sumstat_line = f.read().splitlines()[1] + "\n"
master_file.writelines(sumstat_line)
f.close
os.remove(file)
if scaffold_specific == True:
with open(output_prefix + "_start" + str(across_pop_sites_all[0]) + "_end" + str(across_pop_sites_all[-1]) + "_M" + str(min_allele_count) + "_Q" + str(quality_filter) + ".scaffoldlist", "w") as names_file:
for scaffold_name in output_names:
names_file.write("%s\n" % scaffold_name)
elif window_size > 0:
# Obtain the set (list) of the common sites, among all populations
across_pop_sites_all = shared_sites(file_list = filename, region = region, no_pops = no_pops, pooled = pooled, scaffold_specific = False)
# Define overall start and end positions
if start_position == 0 and end_position == 0:
start_position = across_pop_sites_all[0]
end_position = across_pop_sites_all[-1]
# Generate total allele frequency dataframe, from a list of mpileup files
allele_freqs_df = BAM2AlleleFreqs.calc_pooled_allele_freqs_mpileup(file_list = filename, no_pops = no_pops, pooled = pooled, quality_filter = quality_filter, min_allele_count = min_allele_count, error_method = error_method, region = region, pop_size = pop_size, min_cov = min_cov, max_cov = max_cov, start_position = start_position, end_position = end_position)
if scaffold_specific == False:
start_index = 0
end_index = len(across_pop_sites_all) - 1
# Obtain the start and end positions of the desired windows, given a defined start position, end position, and window size. Accounts for missing bases in the scaffold (data).
positions = make_windows_fixedLength(across_pop_sites = across_pop_sites_all, region = region, start_position = start_position, end_position = end_position, start_index = start_index, end_index = end_index, window_size = window_size, window_step = window_step)
start_positions = positions["start_positions"]
end_positions = positions["end_positions"]
# Calculate summary statistics across windows, window-by-window
output_names = []
for start_pos, end_pos in zip(start_positions,end_positions):
print("Calculating allele frequencies and summary statistics for all scaffolds (merged); window " + str(start_pos) + "-" + str(end_pos) + "...")
# Select window region from allele frequency dataframe and convert this into input format for LSD-High (lsd_high)
raw_allele_freqs = BAM2AlleleFreqs.mpileup2lsd(allele_freqs_df = allele_freqs_df, no_pops = no_pops, region = region, window_size = window_size, start_position = start_pos, end_position = end_pos)
# Calculates sequence length from which to calculate summary statistics. This is the window size minus the missing bases
start_pos_index = across_pop_sites_all.index(start_pos)
end_pos_index = across_pop_sites_all.index(end_pos)
seq_length = len(across_pop_sites_all[start_pos_index:end_pos_index])
# Calculate the summary statistics
sumstatscalc.sumstats_calc (raw_allele_freqs, pop_size = pop_size, seq_length = seq_length, span = span, polarised = polarised, output = output_format, plotSFS = plotSFS, plotPi = plotPi)
# Output summary statistic files
output_name = output_prefix + "_window_" + str(start_pos) + "_" + str(end_pos) + ".obs"
output_names.append(output_name)
os.rename("summary_stats_temp.txt", output_name)
elif scaffold_specific == True:
# First we split dataframe into multiple scaffold-specific dataframes
allele_freqs_df.set_index(keys=['scaffold'], drop=False,inplace=True)
names_scaffolds = allele_freqs_df['scaffold'].unique().tolist()
# We can then loop over the scaffold-specific dataframes
allele_freqs_df_per_scaffold = []
for scaffold in names_scaffolds:
# print(scaffold)
allele_freqs_df_per_scaffold.append(allele_freqs_df.loc[allele_freqs_df.scaffold==str(scaffold)])
# Obtain the set (list) of the common sites, among all populations, per scaffold
across_pop_sites_per_scaffold = shared_sites(file_list = filename, region = region, no_pops = no_pops, pooled = pooled, scaffold_specific = True, list_of_scaffolds=names_scaffolds)
# Then we loop (the sumstats calculation, in windows) over all scaffolds
output_names = []
for i in range(len(across_pop_sites_per_scaffold)):
# Define start and end indexes and positions per scaffold
start_index = 0
end_index = len(across_pop_sites_per_scaffold[i]) - 1
start_position = across_pop_sites_per_scaffold[i][0]
end_position = across_pop_sites_per_scaffold[i][-1]
# Obtain the start and end positions of the desired windows, given a defined start position, end position, and window size, PER SCAFFOLD. Accounts for missing bases in the scaffold (data).
positions = make_windows_fixedLength(across_pop_sites = across_pop_sites_per_scaffold[i], region = region, start_position = start_position, end_position = end_position, start_index = start_index, end_index = end_index, window_size = window_size, window_step = window_step)
start_positions = positions["start_positions"]
end_positions = positions["end_positions"]
allele_freqs_df_scaffold = allele_freqs_df_per_scaffold[i]
# Calculate summary statistics across windows, window-by-window
for start_pos, end_pos in zip(start_positions,end_positions): # need to have scaffold specific start and end positions!
print("Calculating allele frequencies and summary statistics for scaffold: " + str(names_scaffolds[i]) + "; window " + str(start_pos) + "-" + str(end_pos) + "...")
# Select window region from allele frequency dataframe and convert this into input format for LSD-High (lsd_high). Define region = "single" since we're calculating on a PER scaffold basis.
raw_allele_freqs = BAM2AlleleFreqs.mpileup2lsd(allele_freqs_df = allele_freqs_df_scaffold, no_pops = no_pops, region = "single", window_size = window_size, start_position = start_pos, end_position = end_pos)
# Calculates sequence length from which to calculate summary statistics. This is the window size minus the missing bases
start_pos_index = across_pop_sites_per_scaffold[i].index(start_pos)
end_pos_index = across_pop_sites_per_scaffold[i].index(end_pos)
seq_length = len(across_pop_sites_per_scaffold[i][start_pos_index:end_pos_index])
# Calculate the summary statistics
sumstatscalc.sumstats_calc (raw_allele_freqs, pop_size = pop_size, seq_length = seq_length, span = span, polarised = polarised, output = output_format, plotSFS = plotSFS, plotPi = plotPi)
# Output summary statistic files
output_name = output_prefix + "_" + names_scaffolds[i] + "_window_" + str(start_pos) + "_" + str(end_pos) + ".obs"
output_names.append(output_name)
os.rename("summary_stats_temp.txt", output_name)
# Should you wish to output the sumstats of the multiple windows and scaffolds to the same file (instead of one output file per region), you can include the code below. Default setting as is.
with open (output_prefix + "_start" + str(across_pop_sites_all[0]) + "_end" + str(across_pop_sites_all[-1]) + "_windowsize" + str(window_size) + "_M" + str(min_allele_count) + "_Q" + str(quality_filter) + ".obs", "w") as master_file:
for file in output_names[:1]:
f = open(file, "r")
master_file.write(f.read())
master_file.writelines("\n")
f.close
os.remove(file)
for file in output_names[1:]:
f = open(file, "r")
sumstat_line = f.read().splitlines()[1] + "\n"
master_file.writelines(sumstat_line)
f.close
os.remove(file)
with open(output_prefix + "_start" + str(across_pop_sites_all[0]) + "_end" + str(across_pop_sites_all[-1]) + "_windowsize" + str(window_size) + "_M" + str(min_allele_count) + "_Q" + str(quality_filter) + ".scaffoldlist", "w") as names_file:
for scaffold_name in output_names:
names_file.write("%s\n" % scaffold_name)
else:
print("Window size must be defined as 0 or a positive integer")
endTime = datetime.now()
total_runtime = "Total runtime:" + str(endTime - startTime)
print(total_runtime)
#%%