From 406df2aa1fa98fe542f118a183a016e0fd4eeca2 Mon Sep 17 00:00:00 2001
From: Johannes Groos <jgroos@algbi-stud-pc04.desktop>
Date: Tue, 12 Jul 2022 16:01:26 +0200
Subject: [PATCH 1/3] Replaced dictionary with dataframe; added a small
 statistic

---
 .gitignore                                    |   6 +
 .../code_for_reference/create_annoted_ref.py  |  29 ++--
 vxdetector/Output_counter.py                  | 130 ++++++++----------
 vxdetector/VXdetector.py                      |  62 ++++++++-
 4 files changed, 130 insertions(+), 97 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6bc33eb..4956909 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
 .coverage
 coverage.lcov
+bowtie2.1.bt2
+bowtie2.2.bt2
+bowtie2.3.bt2
+bowtie2.4.bt2
+bowtie2.rev.1.bt2
+bowtie2.rev.2.bt2
diff --git a/Indexed_bt2/code_for_reference/create_annoted_ref.py b/Indexed_bt2/code_for_reference/create_annoted_ref.py
index d19bde5..257e88b 100644
--- a/Indexed_bt2/code_for_reference/create_annoted_ref.py
+++ b/Indexed_bt2/code_for_reference/create_annoted_ref.py
@@ -7,7 +7,18 @@
 from os import path as p
 
 
-def write(region, seq_chrom, annoted_ref):
+region = {'V1_start': '', 'V1_end': '',
+          'V2_start': '', 'V2_end': '',
+          'V3_start': '', 'V3_end': '',
+          'V4_start': '', 'V4_end': '',
+          'V5_start': '', 'V5_end': '',
+          'V6_start': '', 'V6_end': '',
+          'V7_start': '', 'V7_end': '',
+          'V8_start': '', 'V8_end': '',
+          'V9_start': '', 'V9_end': ''}
+
+
+def write(seq_chrom, annoted_ref):
     with open(annoted_ref, 'a') as t:
         if region['V1_start'] != '':
             for counter, key in enumerate(region):
@@ -20,7 +31,7 @@ def write(region, seq_chrom, annoted_ref):
                     t.write(words)
 
 
-def index(line, region):
+def index(line):
     boundary = {'V1_start': 189, 'V1_end': 471,
                 'V2_start': 485, 'V2_end': 1867,
                 'V3_start': 1915, 'V3_end': 2231,
@@ -36,7 +47,6 @@ def index(line, region):
         list1 = ''.join(str(e) for e in list1)
         list1 = list1.replace('-', '')
         region[key] = int(list1.index('1')) + 1
-    return region
 
 
 def main():
@@ -44,21 +54,12 @@ def main():
     annoted_ref = f'{p.dirname(p.dirname(__file__))}/annoted_ref.bed'
     with open(file_, 'r') as f:
         for line in f:
-            region = {'V1_start': '', 'V1_end': '',
-                      'V2_start': '', 'V2_end': '',
-                      'V3_start': '', 'V3_end': '',
-                      'V4_start': '', 'V4_end': '',
-                      'V5_start': '', 'V5_end': '',
-                      'V6_start': '', 'V6_end': '',
-                      'V7_start': '', 'V7_end': '',
-                      'V8_start': '', 'V8_end': '',
-                      'V9_start': '', 'V9_end': ''}
             if line.startswith('>'):
                 seq_chrom = line[1:]
                 seq_chrom = seq_chrom.strip('\n')
             else:
-                index(line, region)
-                write(region, seq_chrom, annoted_ref)
+                index(line)
+                write(seq_chrom, annoted_ref)
 
 
 if __name__ == '__main__':
diff --git a/vxdetector/Output_counter.py b/vxdetector/Output_counter.py
index 9abaf31..c2414dd 100644
--- a/vxdetector/Output_counter.py
+++ b/vxdetector/Output_counter.py
@@ -1,11 +1,18 @@
 #!/usr/bin/python
 
 import os
-import csv
+import pandas as pd
 from itertools import (takewhile, repeat)
 
-regions = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0,
-           'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0}
+
+new_row = pd.DataFrame({'Read-file': [''], 'Number of Reads': [0],
+                        'Unaligned Reads [%]': [0],
+                        'Not properly paired': ['unpaired'],
+                        'Sequenced variable region': ['V'],
+                        'V1': [0], 'V2': [0], 'V3': [0],
+                        'V4': [0], 'V5': [0], 'V6': [0],
+                        'V7': [0], 'V8': [0], 'V9': [0],
+                        'Not aligned to a variable region': [0]})
 # declares the dictionary containing all variable regions as
 # a global variable. That way it can be easily accessed and modified
 
@@ -36,92 +43,61 @@ def rawincount(filename):
     return sum(buf.count(b'\n') for buf in bufgen)
 
 
-def region_count(unaligned_count, temp_path, mode, all_reads):
+def region_count(temp_path, mode):
     if mode == 'paired':
-        unpaired = round(rawincount(f'{temp_path}bed.log')/all_reads, 2)
+        new_row['Not properly paired'] = rawincount(
+                                         f'{temp_path}bed.log') \
+                                         / new_row['Number of Reads']
         # counts the amount of error messages given by bedtools
         # during bam to bed conversion each error message reprensents
         # one improperly paired read
-    elif mode == 'unpaired':
-        unpaired = 'unpaired Reads'
-    no_V = rawincount(f'{temp_path}noOver.bed')
+    new_row['Not aligned to a variable region'] = rawincount(
+                                                  f'{temp_path}noOver.bed')
     # counts the number of reads not mapped to any variable region
-    aligned_count = 100 - unaligned_count
-    count = sum([regions['V1'], regions['V2'], regions['V3'],
-                regions['V4'], regions['V5'], regions['V6'],
-                regions['V7'], regions['V8'], regions['V9'], no_V])
-    for key in regions:
-        regions[key] = (regions[key] / count) * aligned_count
-    no_V = (no_V/count)*aligned_count
+    aligned_count = 100 - new_row.iat[0, 2]
+    count = sum(new_row.iloc[0, 5:])
+    for column in new_row.columns[5:]:
+        new_row[column] = (new_row[column] / count) * aligned_count
     # Calculates the percantage of reads mapped to either a specific
     # region or no region
-    if aligned_count == 0 or (no_V/aligned_count) == 1:
-        most_probable_V = 'No variable Region'
+    if aligned_count == 0 or (new_row.iat[0, 14]/aligned_count) == 1:
+        new_row['Sequenced variable region'] = 'No variable Region'
     else:
-        most_probable_V = [x[0].replace('V', '') for x in
-                           sorted(list(regions.items()))
-                           if ((x[1] / aligned_count) * 100) > 20]
-        most_probable_V = 'V' + ''.join(most_probable_V)
-        if most_probable_V == 'V':
-            most_probable_V = 'No variable Region'
+        most_probable_V = [x.replace('V', '') for x in
+                           new_row.columns[5:14]
+                           if ((new_row.at[0, x] / aligned_count) * 100) > 20]
+        new_row['Sequenced variable region'] = 'V' + ''.join(most_probable_V)
+        if new_row.iat[0, 4] == 'V':
+            new_row['Sequenced variable region'] = 'No variable Region'
     # finds the variable regions with the highest probability
     # any region having a percentage of 20% or higher will be listed.
     # The percentage is calculated within the regions.
 
-    return most_probable_V, unpaired, no_V
-
 
-def create_output(path, file_name, unaligned_count, dir_name,
-                  dir_path, temp_path, all_reads, mode):
+def create_row(path, file_name, dir_name, dir_path, temp_path, mode):
     file_name, dir_name, dir_path = directory_navi(file_name, path,
                                                    dir_name, dir_path)
     if mode == 'paired':
-        file_name = file_name.replace('_R1_001', '')
+        new_row['Read-file'] = file_name.replace('_R1_001', '')
     new_file = f'{dir_path}/{dir_name}.csv'
     # The new file is named after the directory containing the reads
-    most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
-                                                   mode, all_reads)
-    if os.path.exists(new_file):
-        header = True
-    else:
-        header = False
-    # The header will only be written if it wasnt written before
-    foo = {'file_name': file_name, 'all_reads': all_reads,
-           'unaligned_count': unaligned_count, 'unpaired': unpaired,
-           'no_V': no_V, 'most_probable_V': most_probable_V}
-    foo.update(regions)
-    with open(new_file, 'a', newline='') as o:
-        fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]',
-                      'Not properly paired', 'Sequenced variable region', 'V1',
-                      'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
-                      'Not aligned to a variable region']
-        writer = csv.DictWriter(o, fieldnames=fieldnames)
-        if header is False:
-            writer.writeheader()
-        for key in foo:
-            if isinstance(foo[key], float):
-                foo[key] = round(foo[key], 2)
-        writer.writerow({'Read-file': foo['file_name'], 'Number of Reads':
-                         foo['all_reads'], 'Unaligned Reads [%]':
-                         foo['unaligned_count'], 'Not properly paired':
-                         foo['unpaired'], 'Sequenced variable region':
-                         foo['most_probable_V'], 'V1': foo['V1'],
-                         'V2': foo['V2'], 'V3': foo['V3'],
-                         'V4': foo['V4'], 'V5': foo['V5'],
-                         'V6': foo['V6'], 'V7': foo['V7'],
-                         'V8': foo['V8'], 'V9': foo['V9'],
-                         'Not aligned to a variable region': foo['no_V']})
-
-
-def print_output(unaligned_count, temp_path, mode):
-    most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
-                                                   mode, all_reads='')
-    print(f'\n{str(unaligned_count)}% were unaligned.')
-    print(f'Of all the aligned Reads most were aligned to: {most_probable_V}')
+    region_count(temp_path, mode)
+    for column in new_row:
+        if isinstance(new_row.at[0, column], float):
+            new_row.at[0, column] = round(new_row.at[0, column], 2)
+    return new_file
+
+
+def print_output(temp_path, mode):
+    region_count(temp_path, mode)
+    print(f"\n{round(new_row.iat[0, 2], 2)}% were unaligned.")
+    print(f"Of all the aligned Reads most were aligned to: \
+          {new_row.iat[0, 4]}")
     print('The probabilities of all regions is as follows [%]:')
-    print('\n'.join(':'.join((key, str(round(val, 2))))
-          for (key, val) in regions.items()))
-    print(f'\nAnd {round(no_V, 2)}% were not mapped to any variable region\n')
+    print('\n'.join(':'.join((column, str(round(new_row.at[0, column], 2))))
+          for column in new_row.columns[5:14]))
+    print(f"\nAnd {round(new_row.iat[0, -1], 2)}% were not mapped \
+          to any variable region\n")
 
 
 def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
@@ -130,20 +106,22 @@ def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
     with open(BED_path, 'r') as bed:
         for line in bed:
             line_list = line.split('\t')
-            regions[line_list[-1].strip('\n')] += 1
+            new_row[line_list[-1].strip('\n')] += 1
             # counts all appearing variable Regions
     with open(Log_path, 'r') as log:
         lines = log.readlines()
-        all_reads = int(lines[0].strip('\n\t ').split()[0])
+        new_row['Number of Reads'] = int(lines[0].strip('\n\t ').split()[0])
         unaligned_count = lines[-1].strip('\n\t ')
         # Reads the bowtie2 stdout which is normally seen in the terminal
     unaligned_count = unaligned_count.split()[0].replace('%', '')
-    unaligned_count = 100 - float(unaligned_count)
+    new_row['Unaligned Reads [%]'] = 100 - float(unaligned_count)
     if file_type is None:
-        print_output(unaligned_count, temp_path, mode)
+        print_output(temp_path, mode)
         # should file_type be None only a single file was given
         # therefore output will be printed in the terminal rather
         # than in a file
     else:
-        create_output(path, file_name, unaligned_count,
-                      dir_name, dir_path, temp_path, all_reads, mode)
+        new_file = create_row(path, file_name, dir_name, dir_path,
+                              temp_path, mode)
+    if file_type is not None:
+        return new_row, new_file
diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
index 91ba75e..cf3b943 100644
--- a/vxdetector/VXdetector.py
+++ b/vxdetector/VXdetector.py
@@ -6,10 +6,37 @@
 import interact_bedtools
 import Output_counter
 import files_manager
+import pandas as pd
 
+output = pd.DataFrame({'Read-file': [], 'Number of Reads': [],
+                       'Unaligned Reads [%]': [],
+                       'Not properly paired': [],
+                       'Sequenced variable region': [],
+                       'V1': [], 'V2': [], 'V3': [],
+                       'V4': [], 'V5': [], 'V6': [],
+                       'V7': [], 'V8': [], 'V9': [],
+                       'Not aligned to a variable region': []})
 
-def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
-             dir_path, mode, read2_file):
+
+def do_statistic():
+    global output
+    avarage = output.mean(numeric_only=True).round(2).to_frame().T
+    avarage['Read-file'] = ['Average']
+    avarage['Sequenced variable region'] = (output.iloc[:, [4]].
+                                            mode().values.tolist())
+    std_dev = output.std(numeric_only=True).round(2).to_frame().T
+    std_dev['Read-file'] = ['Standard deviation']
+    statistic = pd.concat([avarage, std_dev], axis=0)
+    statistic = statistic[['Read-file', 'Number of Reads',
+                           'Unaligned Reads [%]', 'Not properly paired',
+                           'Sequenced variable region', 'V1', 'V2', 'V3',
+                           'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
+                           'Not aligned to a variable region']]
+    output = pd.concat([statistic, output], axis=0)
+
+
+def workflow(path, temp_path, file_path, file_type, old_file_name,
+             file_name, dir_name, dir_path, mode, read2_file):
     interact_bowtie2.buildbowtie2(path)
     if file_type is not None:
         aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file,
@@ -23,8 +50,22 @@ def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
     # 16S database.
     interact_bedtools.overlap(path, temp_path, aligned_path)
     # look which reads intersect with which variable Region
-    Output_counter.count(temp_path, file_name, file_type, path,
-                         dir_name, dir_path, mode)
+    if file_type is not None:
+        global output
+        new_row, new_file = Output_counter.count(temp_path, file_name,
+                                                 file_type, path, dir_name,
+                                                 dir_path, mode)
+        if new_file != old_file_name and old_file_name is not None:
+            output.sort_values(by=['Read-file'], inplace=True)
+            do_statistic()
+            output.to_csv(old_file_name, index=False)
+            output = output.iloc[0:0]
+        output = pd.concat([output, new_row], axis=0)
+        old_file_name = new_file
+        return new_file, old_file_name
+    else:
+        Output_counter.count(temp_path, file_name, file_type, path,
+                             dir_name, dir_path, mode)
     # counts the Variable Regions that are found with bedtools and prints the
     # highest probable variable Region (single file) or
     # writes a new file (directory)
@@ -50,6 +91,7 @@ def main():
     file_type = None
     fasta_ext = ('.fasta', '.fa', '.ffa', '.ffn', '.faa', '.frn')
     fastq_ext = ('.fq', '.fastq',)
+    old_file_name = None
     if args.fasta_file is None:
         # If a directory was given as input the programm will walk through
         # that directory and search for forward reads.
@@ -82,16 +124,22 @@ def main():
                     file_type = ' -f'
                 else:
                     continue
-                workflow(path, temp_path, file_path, file_type, file_name,
-                         dir_name, args.dir_path, mode, read2_file)
+                new_file, old_file_name = workflow(path, temp_path, file_path,
+                                                   file_type, old_file_name,
+                                                   file_name, dir_name,
+                                                   args.dir_path, mode,
+                                                   read2_file)
         if file_type is None:
             print('There were no FASTA or FASTQ files with 16S sequences '
                   'in this directory')
     else:
-        workflow(path, temp_path, args.fasta_file, file_type,
+        workflow(path, temp_path, args.fasta_file, file_type, old_file_name,
                  file_name='Your file', dir_name='', dir_path='',
                  mode='unpaired', read2_file='')
     files_manager.tmp_dir(path, temp_path)
+    if old_file_name is not None:
+        do_statistic()
+        output.to_csv(old_file_name, index=False)
 
 
 if __name__ == '__main__':

From 30919e49dc37fdf7a41dead76a20e85d3ab0e5ff Mon Sep 17 00:00:00 2001
From: Johannes Groos <jgroos@hp-sl230-s3-r3.intra>
Date: Wed, 13 Jul 2022 22:34:55 +0200
Subject: [PATCH 2/3] Improved the way the average most likely variable region
 is shown; fixed an error where giving a folder directly containing reads
 doesnt give an Output file.

---
 vxdetector/Output_counter.py | 4 ++--
 vxdetector/VXdetector.py     | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/vxdetector/Output_counter.py b/vxdetector/Output_counter.py
index c2414dd..118ec14 100644
--- a/vxdetector/Output_counter.py
+++ b/vxdetector/Output_counter.py
@@ -19,6 +19,8 @@
 
 def directory_navi(file_name, path, dir_name, dir_path):
     dir_path = dir_name.replace(dir_path, '', 1)
+    if dir_name.endswith('/'):
+        dir_name = dir_name.rstrip('/')
     # only leaves the part in the directory tree between
     # the file and the original given directory path
     dir_name = os.path.basename(dir_name)
@@ -31,7 +33,6 @@ def directory_navi(file_name, path, dir_name, dir_path):
         file_name = file_name.rsplit('.', 2)[0]
     else:
         file_name = file_name.rsplit('.', 1)[0]
-
     return file_name, dir_name, dir_path
 
 
@@ -39,7 +40,6 @@ def rawincount(filename):
     f = open(filename, 'rb')
     bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024)
                                      for _ in repeat(None)))
-
     return sum(buf.count(b'\n') for buf in bufgen)
 
 
diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
index cf3b943..c0a930b 100644
--- a/vxdetector/VXdetector.py
+++ b/vxdetector/VXdetector.py
@@ -22,8 +22,12 @@ def do_statistic():
     global output
     avarage = output.mean(numeric_only=True).round(2).to_frame().T
     avarage['Read-file'] = ['Average']
-    avarage['Sequenced variable region'] = (output.iloc[:, [4]].
-                                            mode().values.tolist())
+    region = (output.iloc[:, [4]].mode().values)
+    region = ' / '.join(str(r) for r in region)
+    region = region.replace('\'', '')
+    region = region.replace('[', '')
+    region = region.replace(']', '')
+    avarage['Sequenced variable region'] = region
     std_dev = output.std(numeric_only=True).round(2).to_frame().T
     std_dev['Read-file'] = ['Standard deviation']
     statistic = pd.concat([avarage, std_dev], axis=0)

From d719271c2f83c4bb35a71e1b1b664e0f7cf04605 Mon Sep 17 00:00:00 2001
From: Johannes Groos <jgroos@algbi-stud-pc04.desktop>
Date: Thu, 14 Jul 2022 09:50:17 +0200
Subject: [PATCH 3/3] added pandas as a dependency

---
 README.md       | 1 +
 environment.yml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 7c4b084..12fc82f 100644
--- a/README.md
+++ b/README.md
@@ -17,3 +17,4 @@ Requirements:
     bowtie2
     samtools
     bedtools
+    Module: pandas
diff --git a/environment.yml b/environment.yml
index 32ec06c..ee1108e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,3 +7,4 @@ dependencies:
   - flake8
   - nose
   - coverage >= 6  # to ensure lcov option is available
+  - pandas