Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
.coverage
coverage.lcov
bowtie2.1.bt2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might want to be more flexible (as you don't know how many parts will be generated) by using wildcards, i.e. bowtie2.*.bt2

bowtie2.2.bt2
bowtie2.3.bt2
bowtie2.4.bt2
bowtie2.rev.1.bt2
bowtie2.rev.2.bt2
29 changes: 15 additions & 14 deletions Indexed_bt2/code_for_reference/create_annoted_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,18 @@
from os import path as p


def write(region, seq_chrom, annoted_ref):
region = {'V1_start': '', 'V1_end': '',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the reason why you "initialize" the dict with "empty" strings. Can't you check for the presence of the key in downstream code?
e.g. if region['V1_start'] != '': --> if 'V1_start' in region: and once you set the value, ensure that it is not empty?!

'V2_start': '', 'V2_end': '',
'V3_start': '', 'V3_end': '',
'V4_start': '', 'V4_end': '',
'V5_start': '', 'V5_end': '',
'V6_start': '', 'V6_end': '',
'V7_start': '', 'V7_end': '',
'V8_start': '', 'V8_end': '',
'V9_start': '', 'V9_end': ''}


def write(seq_chrom, annoted_ref):
with open(annoted_ref, 'a') as t:
if region['V1_start'] != '':
for counter, key in enumerate(region):
Expand All @@ -20,7 +31,7 @@ def write(region, seq_chrom, annoted_ref):
t.write(words)


def index(line, region):
def index(line):
boundary = {'V1_start': 189, 'V1_end': 471,
'V2_start': 485, 'V2_end': 1867,
'V3_start': 1915, 'V3_end': 2231,
Expand All @@ -36,29 +47,19 @@ def index(line, region):
list1 = ''.join(str(e) for e in list1)
list1 = list1.replace('-', '')
region[key] = int(list1.index('1')) + 1
return region


def main():
file_ = f'{p.dirname(p.dirname(__file__))}/85_otus_aligned.fasta'
annoted_ref = f'{p.dirname(p.dirname(__file__))}/annoted_ref.bed'
with open(file_, 'r') as f:
for line in f:
region = {'V1_start': '', 'V1_end': '',
'V2_start': '', 'V2_end': '',
'V3_start': '', 'V3_end': '',
'V4_start': '', 'V4_end': '',
'V5_start': '', 'V5_end': '',
'V6_start': '', 'V6_end': '',
'V7_start': '', 'V7_end': '',
'V8_start': '', 'V8_end': '',
'V9_start': '', 'V9_end': ''}
if line.startswith('>'):
seq_chrom = line[1:]
seq_chrom = seq_chrom.strip('\n')
else:
index(line, region)
write(region, seq_chrom, annoted_ref)
index(line)
write(seq_chrom, annoted_ref)


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ Requirements:
bowtie2
samtools
bedtools
Module: pandas
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dependencies:
- flake8
- nose
- coverage >= 6 # to ensure lcov option is available
- pandas
45 changes: 45 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

"""Packaging script for VXdetector.

Run e.g. ``python setup.py sdist`` to build a source distribution.
"""

from setuptools import find_packages, setup

# Trove classifiers — must match the official PyPI list exactly, otherwise
# uploads are rejected.  NOTE: the original used
# 'Development Status :: 1 - Pre-Alpha', which is not a valid classifier;
# the valid value for a pre-alpha project is '2 - Pre-Alpha'.
classifiers = [
    'Development Status :: 2 - Pre-Alpha',
    'License :: OSI Approved :: BSD License',
    'Environment :: Console',
    'Topic :: Software Development :: Libraries :: Application Frameworks',
    'Topic :: Scientific/Engineering',
    'Topic :: Scientific/Engineering :: Bio-Informatics',
    'Programming Language :: Python',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Programming Language :: Python :: 3.4',
    'Operating System :: Unix',
    'Operating System :: POSIX',
    'Operating System :: MacOS :: MacOS X',
    'Operating System :: Microsoft :: Windows']


description = 'VXdetector'


setup(name='vxdetector',
      version='1.0.0',
      license='BSD',
      # description was defined above but never passed in the original —
      # wire it up so the package metadata actually carries it.
      description=description,
      classifiers=classifiers,
      author="Johannes Groos",
      author_email="Johannes.Groos@bio.uni-giessen.de",
      # NOTE(review): 'maintainer' is expected to be a person's name, not a
      # URL — confirm whether this should be the lab name instead.
      maintainer="http://jlab.bio",
      maintainer_email="stefan.m.janssen@gmail.com",
      url='https://github.com/jlab/algorithm_vxdetector/',
      test_suite='nose.collector',
      packages=find_packages(),
      install_requires=[
          # 'click >= 6',
          # 'scikit-bio >= 0.4.0',
      ],
      # extras_require={'test': ["nose", "pep8", "flake8"],
      #                 'coverage': ["coverage"]}
      )
134 changes: 56 additions & 78 deletions vxdetector/Output_counter.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
#!/usr/bin/python

import os
import csv
import pandas as pd
from itertools import (takewhile, repeat)

regions = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0,
'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0}

new_row = pd.DataFrame({'Read-file': [''], 'Number of Reads': [0],
'Unaligned Reads [%]': [0],
'Not properly paired': ['unpaired'],
'Sequenced variable region': ['V'],
'V1': [0], 'V2': [0], 'V3': [0],
'V4': [0], 'V5': [0], 'V6': [0],
'V7': [0], 'V8': [0], 'V9': [0],
'Not aligned to a variable region': [0]})
# declares the dictionary containing all variable regions as
# a global variable. That way it can be easily accessed and modified


def directory_navi(file_name, path, dir_name, dir_path):
dir_path = dir_name.replace(dir_path, '', 1)
if dir_name.endswith('/'):
dir_name = dir_name.rstrip('/')
# only leaves the part in the directory tree between
# the file and the original given directory path
dir_name = os.path.basename(dir_name)
Expand All @@ -24,104 +33,71 @@ def directory_navi(file_name, path, dir_name, dir_path):
file_name = file_name.rsplit('.', 2)[0]
else:
file_name = file_name.rsplit('.', 1)[0]

return file_name, dir_name, dir_path


def rawincount(filename):
    """Count the newline characters (b'\\n') in *filename*.

    Reads the file in 1 MiB binary chunks via the raw (unbuffered) stream,
    so arbitrarily large files are counted without loading them into memory.

    :param filename: path of the file to count lines in
    :return: number of newline bytes found (int)
    """
    # 'with' guarantees the file descriptor is closed even if a read raises;
    # the original opened the file and never closed it (resource leak).
    with open(filename, 'rb') as f:
        bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024)
                                         for _ in repeat(None)))
        return sum(buf.count(b'\n') for buf in bufgen)


def region_count(unaligned_count, temp_path, mode, all_reads):
def region_count(temp_path, mode):
if mode == 'paired':
unpaired = round(rawincount(f'{temp_path}bed.log')/all_reads, 2)
new_row['Not properly paired'] = rawincount(
f'{temp_path}bed.log') \
/ new_row['Number of Reads']
# counts the number of error messages given by bedtools
# during bam to bed conversion; each error message represents
# one improperly paired read
elif mode == 'unpaired':
unpaired = 'unpaired Reads'
no_V = rawincount(f'{temp_path}noOver.bed')
new_row['Not aligned to a variable region'] = rawincount(
f'{temp_path}noOver.bed')
# counts the number of reads not mapped to any variable region
aligned_count = 100 - unaligned_count
count = sum([regions['V1'], regions['V2'], regions['V3'],
regions['V4'], regions['V5'], regions['V6'],
regions['V7'], regions['V8'], regions['V9'], no_V])
for key in regions:
regions[key] = (regions[key] / count) * aligned_count
no_V = (no_V/count)*aligned_count
aligned_count = 100 - new_row.iat[0, 2]
count = sum(new_row.iloc[0, 5:])
for column in new_row.columns[5:]:
new_row[column] = (new_row[column] / count) * aligned_count
# Calculates the percentage of reads mapped to either a specific
# region or no region
if aligned_count == 0 or (no_V/aligned_count) == 1:
most_probable_V = 'No variable Region'
if aligned_count == 0 or (new_row.iat[0, 14]/aligned_count) == 1:
new_row['Sequenced variable region'] = 'No variable Region'
else:
most_probable_V = [x[0].replace('V', '') for x in
sorted(list(regions.items()))
if ((x[1] / aligned_count) * 100) > 20]
most_probable_V = 'V' + ''.join(most_probable_V)
if most_probable_V == 'V':
most_probable_V = 'No variable Region'
most_probable_V = [x.replace('V', '') for x in
new_row.columns[5:14]
if ((new_row.at[0, x] / aligned_count) * 100) > 20]
new_row['Sequenced variable region'] = 'V' + ''.join(most_probable_V)
if new_row.iat[0, 4] == 'V':
new_row['Sequenced variable region'] = 'No variable Region'
# finds the variable regions with the highest probability
# any region having a percentage of 20% or higher will be listed.
# The percentage is calculated within the regions.

return most_probable_V, unpaired, no_V


def create_output(path, file_name, unaligned_count, dir_name,
dir_path, temp_path, all_reads, mode):
def create_row(path, file_name, dir_name, dir_path, temp_path, mode):
file_name, dir_name, dir_path = directory_navi(file_name, path,
dir_name, dir_path)
if mode == 'paired':
file_name = file_name.replace('_R1_001', '')
new_row['Read-file'] = file_name.replace('_R1_001', '')
new_file = f'{dir_path}/{dir_name}.csv'
# The new file is named after the directory containing the reads
most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
mode, all_reads)
if os.path.exists(new_file):
header = True
else:
header = False
# The header will only be written if it wasn't written before
foo = {'file_name': file_name, 'all_reads': all_reads,
'unaligned_count': unaligned_count, 'unpaired': unpaired,
'no_V': no_V, 'most_probable_V': most_probable_V}
foo.update(regions)
with open(new_file, 'a', newline='') as o:
fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]',
'Not properly paired', 'Sequenced variable region', 'V1',
'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
'Not aligned to a variable region']
writer = csv.DictWriter(o, fieldnames=fieldnames)
if header is False:
writer.writeheader()
for key in foo:
if isinstance(foo[key], float):
foo[key] = round(foo[key], 2)
writer.writerow({'Read-file': foo['file_name'], 'Number of Reads':
foo['all_reads'], 'Unaligned Reads [%]':
foo['unaligned_count'], 'Not properly paired':
foo['unpaired'], 'Sequenced variable region':
foo['most_probable_V'], 'V1': foo['V1'],
'V2': foo['V2'], 'V3': foo['V3'],
'V4': foo['V4'], 'V5': foo['V5'],
'V6': foo['V6'], 'V7': foo['V7'],
'V8': foo['V8'], 'V9': foo['V9'],
'Not aligned to a variable region': foo['no_V']})


def print_output(unaligned_count, temp_path, mode):
most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
mode, all_reads='')
print(f'\n{str(unaligned_count)}% were unaligned.')
print(f'Of all the aligned Reads most were aligned to: {most_probable_V}')
region_count(temp_path, mode)
for column in new_row:
if isinstance(new_row.at[0, column], float):
new_row.at[0, column] = round(new_row.at[0, column], 2)
return new_file


def print_output(temp_path, mode):
region_count(temp_path, mode)
print(f"\n{round(new_row.iat[0, 2], 2)}% were unaligned.")
print(f"Of all the aligned Reads most were aligned to: \
{new_row.iat[0, 4]}")
print('The probabilities of all regions is as follows [%]:')
print('\n'.join(':'.join((key, str(round(val, 2))))
for (key, val) in regions.items()))
print(f'\nAnd {round(no_V, 2)}% were not mapped to any variable region\n')
print('\n'.join(':'.join((column, str(round(new_row.at[0, column], 2))))
for column in new_row.columns[5:14]))
print(f"\nAnd {round(new_row.iat[0, -1], 2)}% were not mapped \
to any variable region\n")


def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
Expand All @@ -130,20 +106,22 @@ def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
with open(BED_path, 'r') as bed:
for line in bed:
line_list = line.split('\t')
regions[line_list[-1].strip('\n')] += 1
new_row[line_list[-1].strip('\n')] += 1
# counts all appearing variable Regions
with open(Log_path, 'r') as log:
lines = log.readlines()
all_reads = int(lines[0].strip('\n\t ').split()[0])
new_row['Number of Reads'] = int(lines[0].strip('\n\t ').split()[0])
unaligned_count = lines[-1].strip('\n\t ')
# Reads the bowtie2 stdout which is normally seen in the terminal
unaligned_count = unaligned_count.split()[0].replace('%', '')
unaligned_count = 100 - float(unaligned_count)
new_row['Unaligned Reads [%]'] = 100 - float(unaligned_count)
if file_type is None:
print_output(unaligned_count, temp_path, mode)
print_output(temp_path, mode)
# should file_type be None only a single file was given
# therefore output will be printed in the terminal rather
# than in a file
else:
create_output(path, file_name, unaligned_count,
dir_name, dir_path, temp_path, all_reads, mode)
new_file = create_row(path, file_name, dir_name, dir_path,
temp_path, mode)
if file_type is not None:
return new_row, new_file
Loading