From 406df2aa1fa98fe542f118a183a016e0fd4eeca2 Mon Sep 17 00:00:00 2001 From: Johannes Groos Date: Tue, 12 Jul 2022 16:01:26 +0200 Subject: [PATCH 1/3] Replaced dictionary with dataframe; added a small statistic --- .gitignore | 6 + .../code_for_reference/create_annoted_ref.py | 29 ++-- vxdetector/Output_counter.py | 130 ++++++++---------- vxdetector/VXdetector.py | 62 ++++++++- 4 files changed, 130 insertions(+), 97 deletions(-) diff --git a/.gitignore b/.gitignore index 6bc33eb..4956909 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ .coverage coverage.lcov +bowtie2.1.bt2 +bowtie2.2.bt2 +bowtie2.3.bt2 +bowtie2.4.bt2 +bowtie2.rev.1.bt2 +bowtie2.rev.2.bt2 diff --git a/Indexed_bt2/code_for_reference/create_annoted_ref.py b/Indexed_bt2/code_for_reference/create_annoted_ref.py index d19bde5..257e88b 100644 --- a/Indexed_bt2/code_for_reference/create_annoted_ref.py +++ b/Indexed_bt2/code_for_reference/create_annoted_ref.py @@ -7,7 +7,18 @@ from os import path as p -def write(region, seq_chrom, annoted_ref): +region = {'V1_start': '', 'V1_end': '', + 'V2_start': '', 'V2_end': '', + 'V3_start': '', 'V3_end': '', + 'V4_start': '', 'V4_end': '', + 'V5_start': '', 'V5_end': '', + 'V6_start': '', 'V6_end': '', + 'V7_start': '', 'V7_end': '', + 'V8_start': '', 'V8_end': '', + 'V9_start': '', 'V9_end': ''} + + +def write(seq_chrom, annoted_ref): with open(annoted_ref, 'a') as t: if region['V1_start'] != '': for counter, key in enumerate(region): @@ -20,7 +31,7 @@ def write(region, seq_chrom, annoted_ref): t.write(words) -def index(line, region): +def index(line): boundary = {'V1_start': 189, 'V1_end': 471, 'V2_start': 485, 'V2_end': 1867, 'V3_start': 1915, 'V3_end': 2231, @@ -36,7 +47,6 @@ def index(line, region): list1 = ''.join(str(e) for e in list1) list1 = list1.replace('-', '') region[key] = int(list1.index('1')) + 1 - return region def main(): @@ -44,21 +54,12 @@ def main(): annoted_ref = f'{p.dirname(p.dirname(__file__))}/annoted_ref.bed' with open(file_, 'r') as f: for line in f: - region = {'V1_start': '', 'V1_end': '', - 'V2_start': '', 'V2_end': '', - 'V3_start': '', 'V3_end': '', - 'V4_start': '', 'V4_end': '', - 'V5_start': '', 'V5_end': '', - 'V6_start': '', 'V6_end': '', - 'V7_start': '', 'V7_end': '', - 'V8_start': '', 'V8_end': '', - 'V9_start': '', 'V9_end': ''} if line.startswith('>'): seq_chrom = line[1:] seq_chrom = seq_chrom.strip('\n') else: - index(line, region) - write(region, seq_chrom, annoted_ref) + index(line) + write(seq_chrom, annoted_ref) if __name__ == '__main__': diff --git a/vxdetector/Output_counter.py b/vxdetector/Output_counter.py index 9abaf31..c2414dd 100644 --- a/vxdetector/Output_counter.py +++ b/vxdetector/Output_counter.py @@ -1,11 +1,18 @@ #!/usr/bin/python import os -import csv +import pandas as pd from itertools import (takewhile, repeat) -regions = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0, - 'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0} + +new_row = pd.DataFrame({'Read-file': [''], 'Number of Reads': [0], + 'Unaligned Reads [%]': [0], + 'Not properly paired': ['unpaired'], + 'Sequenced variable region': ['V'], + 'V1': [0], 'V2': [0], 'V3': [0], + 'V4': [0], 'V5': [0], 'V6': [0], + 'V7': [0], 'V8': [0], 'V9': [0], + 'Not aligned to a variable region': [0]}) # declares the dictionary containing all variable regions as # a global variable. That way it can be easily accessed and modified @@ -36,92 +43,61 @@ def rawincount(filename): return sum(buf.count(b'\n') for buf in bufgen) -def region_count(unaligned_count, temp_path, mode, all_reads): +def region_count(temp_path, mode): if mode == 'paired': - unpaired = round(rawincount(f'{temp_path}bed.log')/all_reads, 2) + new_row['Not properly paired'] = rawincount( + f'{temp_path}bed.log') \ + / new_row['Number of Reads'] # counts the amount of error messages given by bedtools # during bam to bed conversion each error message reprensents # one improperly paired read - elif mode == 'unpaired': - unpaired = 'unpaired Reads' - no_V = rawincount(f'{temp_path}noOver.bed') + new_row['Not aligned to a variable region'] = rawincount( + f'{temp_path}noOver.bed') # counts the number of reads not mapped to any variable region - aligned_count = 100 - unaligned_count - count = sum([regions['V1'], regions['V2'], regions['V3'], - regions['V4'], regions['V5'], regions['V6'], - regions['V7'], regions['V8'], regions['V9'], no_V]) - for key in regions: - regions[key] = (regions[key] / count) * aligned_count - no_V = (no_V/count)*aligned_count + aligned_count = 100 - new_row.iat[0, 2] + count = sum(new_row.iloc[0, 5:]) + for column in new_row.columns[5:]: + new_row[column] = (new_row[column] / count) * aligned_count # Calculates the percantage of reads mapped to either a specific # region or no region - if aligned_count == 0 or (no_V/aligned_count) == 1: - most_probable_V = 'No variable Region' + if aligned_count == 0 or (new_row.iat[0, 14]/aligned_count) == 1: + new_row['Sequenced variable region'] = 'No variable Region' else: - most_probable_V = [x[0].replace('V', '') for x in - sorted(list(regions.items())) - if ((x[1] / aligned_count) * 100) > 20] - most_probable_V = 'V' + ''.join(most_probable_V) - if most_probable_V == 'V': - most_probable_V = 'No variable Region' + most_probable_V = [x.replace('V', '') for x in + new_row.columns[5:14] + if ((new_row.at[0, x] / aligned_count) * 100) > 20] + new_row['Sequenced variable region'] = 'V' + ''.join(most_probable_V) + if new_row.iat[0, 4] == 'V': + new_row['Sequenced variable region'] = 'No variable Region' # finds the variable regions with the highest probability # any region having a percentage of 20% or higher will be listed. # The percentage is calculated within the regions. - return most_probable_V, unpaired, no_V - -def create_output(path, file_name, unaligned_count, dir_name, - dir_path, temp_path, all_reads, mode): +def create_row(path, file_name, dir_name, dir_path, temp_path, mode): file_name, dir_name, dir_path = directory_navi(file_name, path, dir_name, dir_path) if mode == 'paired': - file_name = file_name.replace('_R1_001', '') + new_row['Read-file'] = file_name.replace('_R1_001', '') new_file = f'{dir_path}/{dir_name}.csv' # The new file is named after the directory containing the reads - most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path, - mode, all_reads) - if os.path.exists(new_file): - header = True - else: - header = False - # The header will only be written if it wasnt written before - foo = {'file_name': file_name, 'all_reads': all_reads, - 'unaligned_count': unaligned_count, 'unpaired': unpaired, - 'no_V': no_V, 'most_probable_V': most_probable_V} - foo.update(regions) - with open(new_file, 'a', newline='') as o: - fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]', - 'Not properly paired', 'Sequenced variable region', 'V1', - 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', - 'Not aligned to a variable region'] - writer = csv.DictWriter(o, fieldnames=fieldnames) - if header is False: - writer.writeheader() - for key in foo: - if isinstance(foo[key], float): - foo[key] = round(foo[key], 2) - writer.writerow({'Read-file': foo['file_name'], 'Number of Reads': - foo['all_reads'], 'Unaligned Reads [%]': - foo['unaligned_count'], 'Not properly paired': - foo['unpaired'], 'Sequenced variable region': - foo['most_probable_V'], 'V1': foo['V1'], - 'V2': foo['V2'], 'V3': foo['V3'], - 'V4': foo['V4'], 'V5': foo['V5'], - 'V6': foo['V6'], 'V7': foo['V7'], - 'V8': foo['V8'], 'V9': foo['V9'], - 'Not aligned to a variable region': foo['no_V']}) - - -def print_output(unaligned_count, temp_path, mode): - most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path, - mode, all_reads='') - print(f'\n{str(unaligned_count)}% were unaligned.') - print(f'Of all the aligned Reads most were aligned to: {most_probable_V}') + region_count(temp_path, mode) + for column in new_row: + if isinstance(new_row.at[0, column], float): + new_row.at[0, column] = round(new_row.at[0, column], 2) + return new_file + + +def print_output(temp_path, mode): + region_count(temp_path, mode) + print(f"\n{round(new_row.iat[0, 2], 2)}% were unaligned.") + print(f"Of all the aligned Reads most were aligned to: \ + {new_row.iat[0, 4]}") print('The probabilities of all regions is as follows [%]:') - print('\n'.join(':'.join((key, str(round(val, 2)))) - for (key, val) in regions.items())) - print(f'\nAnd {round(no_V, 2)}% were not mapped to any variable region\n') + print('\n'.join(':'.join((column, str(round(new_row.at[0, column], 2)))) + for column in new_row.columns[5:14])) + print(f"\nAnd {round(new_row.iat[0, -1], 2)}% were not mapped \ + to any variable region\n") def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode): @@ -130,20 +106,22 @@ def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode): with open(BED_path, 'r') as bed: for line in bed: line_list = line.split('\t') - regions[line_list[-1].strip('\n')] += 1 + new_row[line_list[-1].strip('\n')] += 1 # counts all appearing variable Regions with open(Log_path, 'r') as log: lines = log.readlines() - all_reads = int(lines[0].strip('\n\t ').split()[0]) + new_row['Number of Reads'] = int(lines[0].strip('\n\t ').split()[0]) unaligned_count = lines[-1].strip('\n\t ') # Reads the bowtie2 stdout which is normally seen in the terminal unaligned_count = unaligned_count.split()[0].replace('%', '') - unaligned_count = 100 - float(unaligned_count) + new_row['Unaligned Reads [%]'] = 100 - float(unaligned_count) if file_type is None: - print_output(unaligned_count, temp_path, mode) + print_output(temp_path, mode) # should file_type be None only a single file was given # therefore output will be printed in the terminal rather # than in a file else: - create_output(path, file_name, unaligned_count, - dir_name, dir_path, temp_path, all_reads, mode) + new_file = create_row(path, file_name, dir_name, dir_path, + temp_path, mode) + if file_type is not None: + return new_row, new_file diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 91ba75e..cf3b943 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -6,10 +6,37 @@ import interact_bedtools import Output_counter import files_manager +import pandas as pd +output = pd.DataFrame({'Read-file': [], 'Number of Reads': [], + 'Unaligned Reads [%]': [], + 'Not properly paired': [], + 'Sequenced variable region': [], + 'V1': [], 'V2': [], 'V3': [], + 'V4': [], 'V5': [], 'V6': [], + 'V7': [], 'V8': [], 'V9': [], + 'Not aligned to a variable region': []}) -def workflow(path, temp_path, file_path, file_type, file_name, dir_name, - dir_path, mode, read2_file): + +def do_statistic(): + global output + avarage = output.mean(numeric_only=True).round(2).to_frame().T + avarage['Read-file'] = ['Average'] + avarage['Sequenced variable region'] = (output.iloc[:, [4]]. + mode().values.tolist()) + std_dev = output.std(numeric_only=True).round(2).to_frame().T + std_dev['Read-file'] = ['Standard deviation'] + statistic = pd.concat([avarage, std_dev], axis=0) + statistic = statistic[['Read-file', 'Number of Reads', + 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', + 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', + 'Not aligned to a variable region']] + output = pd.concat([statistic, output], axis=0) + + +def workflow(path, temp_path, file_path, file_type, old_file_name, + file_name, dir_name, dir_path, mode, read2_file): interact_bowtie2.buildbowtie2(path) if file_type is not None: aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file, @@ -23,8 +50,22 @@ def workflow(path, temp_path, file_path, file_type, file_name, dir_name, # 16S database. interact_bedtools.overlap(path, temp_path, aligned_path) # look which reads intersect with which variable Region - Output_counter.count(temp_path, file_name, file_type, path, - dir_name, dir_path, mode) + if file_type is not None: + global output + new_row, new_file = Output_counter.count(temp_path, file_name, + file_type, path, dir_name, + dir_path, mode) + if new_file != old_file_name and old_file_name is not None: + output.sort_values(by=['Read-file'], inplace=True) + do_statistic() + output.to_csv(old_file_name, index=False) + output = output.iloc[0:0] + output = pd.concat([output, new_row], axis=0) + old_file_name = new_file + return new_file, old_file_name + else: + Output_counter.count(temp_path, file_name, file_type, path, + dir_name, dir_path, mode) # counts the Variable Regions that are found with bedtools and prints the # highest probable variable Region (single file) or # writes a new file (directory) @@ -50,6 +91,7 @@ def main(): file_type = None fasta_ext = ('.fasta', '.fa', '.ffa', '.ffn', '.faa', '.frn') fastq_ext = ('.fq', '.fastq',) + old_file_name = None if args.fasta_file is None: # If a directory was given as input the programm will walk through # that directory and search for forward reads. @@ -82,16 +124,22 @@ def main(): file_type = ' -f' else: continue - workflow(path, temp_path, file_path, file_type, file_name, - dir_name, args.dir_path, mode, read2_file) + new_file, old_file_name = workflow(path, temp_path, file_path, + file_type, old_file_name, + file_name, dir_name, + args.dir_path, mode, + read2_file) if file_type is None: print('There were no FASTA or FASTQ files with 16S sequences ' 'in this directory') else: - workflow(path, temp_path, args.fasta_file, file_type, + workflow(path, temp_path, args.fasta_file, file_type, old_file_name, file_name='Your file', dir_name='', dir_path='', mode='unpaired', read2_file='') files_manager.tmp_dir(path, temp_path) + if old_file_name is not None: + do_statistic() + output.to_csv(old_file_name, index=False) if __name__ == '__main__': From 30919e49dc37fdf7a41dead76a20e85d3ab0e5ff Mon Sep 17 00:00:00 2001 From: Johannes Groos Date: Wed, 13 Jul 2022 22:34:55 +0200 Subject: [PATCH 2/3] Improved the way the average most likely variable region is shown; fixed an error where giving a folder directly containing reads doesnt give an Output file. --- vxdetector/Output_counter.py | 4 ++-- vxdetector/VXdetector.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vxdetector/Output_counter.py b/vxdetector/Output_counter.py index c2414dd..118ec14 100644 --- a/vxdetector/Output_counter.py +++ b/vxdetector/Output_counter.py @@ -19,6 +19,8 @@ def directory_navi(file_name, path, dir_name, dir_path): dir_path = dir_name.replace(dir_path, '', 1) + if dir_name.endswith('/'): + dir_name = dir_name.rstrip('/') # only leaves the part in the directory tree between # the file and the original given directory path dir_name = os.path.basename(dir_name) @@ -31,7 +33,6 @@ def directory_navi(file_name, path, dir_name, dir_path): file_name = file_name.rsplit('.', 2)[0] else: file_name = file_name.rsplit('.', 1)[0] - return file_name, dir_name, dir_path @@ -39,7 +40,6 @@ def rawincount(filename): f = open(filename, 'rb') bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None))) - return sum(buf.count(b'\n') for buf in bufgen) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index cf3b943..c0a930b 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -22,8 +22,12 @@ def do_statistic(): global output avarage = output.mean(numeric_only=True).round(2).to_frame().T avarage['Read-file'] = ['Average'] - avarage['Sequenced variable region'] = (output.iloc[:, [4]]. - mode().values.tolist()) + region = (output.iloc[:, [4]].mode().values) + region = ' / '.join(str(r) for r in region) + region = region.replace('\'', '') + region = region.replace('[', '') + region = region.replace(']', '') + avarage['Sequenced variable region'] = region std_dev = output.std(numeric_only=True).round(2).to_frame().T std_dev['Read-file'] = ['Standard deviation'] statistic = pd.concat([avarage, std_dev], axis=0) From d719271c2f83c4bb35a71e1b1b664e0f7cf04605 Mon Sep 17 00:00:00 2001 From: Johannes Groos Date: Thu, 14 Jul 2022 09:50:17 +0200 Subject: [PATCH 3/3] added pandas as a dependency --- README.md | 1 + environment.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 7c4b084..12fc82f 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,4 @@ Requirements: bowtie2 samtools bedtools + Module: pandas diff --git a/environment.yml b/environment.yml index 32ec06c..ee1108e 100644 --- a/environment.yml +++ b/environment.yml @@ -7,3 +7,4 @@ dependencies: - flake8 - nose - coverage >= 6 # to ensure lcov option is available + - pandas