jlab · JGroos-16 · Jun 29, 2022 · Jun 29, 2022 · Jun 30, 2022 · Jul 5, 2022
diff --git a/Indexed_bt2/annoted_ref.bed b/Indexed_bt2/annoted_ref.bed
diff --git a/Indexed_bt2/fail.bed b/Indexed_bt2/fail.bed
diff --git a/README.md b/README.md
@@ -20,12 +20,11 @@ Requirements:
     Module: os
     Module: subprocess
     Module: argparse
+    Module: itertools
     Module: tempfile
     Module: shutil
 
     To Do List:
         -Currently the used programms are refrenced by hardcoded paths
             -> not usable on PCs where these programms are saved somewhere else
-        -Currently only the single most probable region is shown. 
-            -> Modify in a way that reads spanning multiple regions are shown correctly
         -Adding other functions such as primer verification or making the programm more efficiant
diff --git a/vxdetector/Output_counter.py b/vxdetector/Output_counter.py
@@ -2,13 +2,17 @@
 
 import os
 import csv
+from itertools import (takewhile, repeat)
 
 
 def directory_navi(file_name, path, dir_name, dir_path):
     dir_path = dir_name.replace(dir_path, '', 1)
-    dir_name = dir_path.split('/')[-1]
+    if dir_name.split('/')[-1] == '':
+        dir_name = dir_name.split('/')[-2]
+    else:
+        dir_name = dir_name.split('/')[-1]
     dir_path = dir_path.replace(dir_name, '')
-    dir_path = path+'Output/'+dir_path
+    dir_path = f'{path}Output/{dir_path}'
     os.makedirs(dir_path, exist_ok=True)
     if file_name.endswith('.gz'):
         file_name = file_name.rsplit('.', 2)[0]
@@ -18,58 +22,120 @@ def directory_navi(file_name, path, dir_name, dir_path):
     return file_name, dir_name, dir_path
 
 
-def create_output(path, file_name, unaligned_count, most_probable_V,
-                  probability, dir_name, dir_path):
+def rawincount(filename):
+    f = open(filename, 'rb')
+    bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024)
+                                     for _ in repeat(None)))
+
+    return sum(buf.count(b'\n') for buf in bufgen)
+
+
+def region_count(dictionary, unaligned_count, temp_path, all_reads, mode):
+    if temp_path is not None and mode == 'paired':
+        unpaired = round((rawincount(f'{temp_path}bed.log')/all_reads), 2)
+    elif mode == 'unpaired':
+        unpaired = 'unpaired Reads'
+    no_V = rawincount(f'{temp_path}noOver.bed')
+    aligned_count = 100 - unaligned_count
+    count = sum([dictionary['V1'], dictionary['V2'], dictionary['V3'],
+                dictionary['V4'], dictionary['V5'], dictionary['V6'],
+                dictionary['V7'], dictionary['V8'], dictionary['V9'], no_V])
+    dictionary['V1'] = round((dictionary['V1'] / count) * aligned_count, 2)
+    dictionary['V2'] = round((dictionary['V2'] / count) * aligned_count, 2)
+    dictionary['V3'] = round((dictionary['V3'] / count) * aligned_count, 2)
+    dictionary['V4'] = round((dictionary['V4'] / count) * aligned_count, 2)
+    dictionary['V5'] = round((dictionary['V5'] / count) * aligned_count, 2)
+    dictionary['V6'] = round((dictionary['V6'] / count) * aligned_count, 2)
+    dictionary['V7'] = round((dictionary['V7'] / count) * aligned_count, 2)
+    dictionary['V8'] = round((dictionary['V8'] / count) * aligned_count, 2)
+    dictionary['V9'] = round((dictionary['V9'] / count) * aligned_count, 2)
+    no_V = round((no_V/count)*aligned_count, 2)
+    if aligned_count == 0 or (no_V/aligned_count) == 1:
+        most_probable_V = 'No variable Region'
+    else:
+        most_probable_V = [x[0].replace('V', '') for x in
+                           sorted(list(dictionary.items()))
+                           if ((x[1] / aligned_count) * 100) > 20]
+        most_probable_V = 'V' + ''.join(most_probable_V)
+        if most_probable_V == 'V':
+            most_probable_V = 'No variable Region'
+
+    return most_probable_V, dictionary, unpaired, no_V
+
+
+def create_output(path, file_name, unaligned_count, dictionary, dir_name,
+                  dir_path, temp_path, all_reads, mode):
     file_name, dir_name, dir_path = directory_navi(file_name, path,
                                                    dir_name, dir_path)
-    unaligned_count = unaligned_count.split(' ')[1]
-    unaligned_count = float(unaligned_count.strip('()%'))
-    new_file = dir_path + dir_name + '.csv'
+    if mode == 'paired':
+        file_name = file_name.replace('_R1_001', '')
+    new_file = f'{dir_path}{dir_name}.csv'
+    most_probable_V, dictionary, unpaired, no_V = region_count(dictionary,
+                                                               unaligned_count,
+                                                               temp_path,
+                                                               all_reads, mode)
     if os.path.exists(new_file):
         header = True
     else:
         header = False
     with open(new_file, 'a', newline='') as o:
-        fieldnames = ['Read-file', 'Unaligned Reads [%]',
-                      'Sequenced variable region', 'Probability [%]']
+        fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]',
+                      'Not properly paired', 'Sequenced variable region', 'V1',
+                      'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
+                      'Not aligned to a variable region']
         writer = csv.DictWriter(o, fieldnames=fieldnames)
         if header is False:
             writer.writeheader()
-        writer.writerow({
-            'Read-file': file_name,
-            'Unaligned Reads [%]': unaligned_count,
-            'Sequenced variable region': most_probable_V,
-            'Probability [%]': probability})
+        writer.writerow({'Read-file': file_name, 'Number of Reads': all_reads,
+                         'Unaligned Reads [%]': unaligned_count,
+                         'Not properly paired': unpaired,
+                         'Sequenced variable region': most_probable_V,
+                         'V1': dictionary['V1'], 'V2': dictionary['V2'],
+                         'V3': dictionary['V3'], 'V4': dictionary['V4'],
+                         'V5': dictionary['V5'], 'V6': dictionary['V6'],
+                         'V7': dictionary['V7'], 'V8': dictionary['V8'],
+                         'V9': dictionary['V9'],
+                         'Not aligned to a variable region': no_V})
+
+
+def print_output(dictionary, unaligned_count):
+    most_probable_V, dictionary = region_count(dictionary, unaligned_count,
+                                               temp_path=None, all_reads='',
+                                               mode='')
+    V1 = str(dictionary['V1'])
+    V2 = str(dictionary['V2'])
+    V3 = str(dictionary['V3'])
+    V4 = str(dictionary['V4'])
+    V5 = str(dictionary['V5'])
+    V6 = str(dictionary['V6'])
+    V7 = str(dictionary['V7'])
+    V8 = str(dictionary['V8'])
+    V9 = str(dictionary['V9'])
+    print(f'\n{str(unaligned_count)}% were unaligned.')
+    print(f'Of all the aligned Reads most were aligned to: {most_probable_V}')
+    print('The probabilities of all regions is as follows [%]:')
+    print(f'V1: {V1}\nV2: {V2}\nV3: {V3}\nV4: {V4}\nV5: {V5}\n \
+          V6: {V6}\nV7: {V7}\nV8: {V8}\nV9: {V9}\n')
 
 
-def count(count, temp_path, file_name, file_type, path, dir_name, dir_path):
-    dictionary = {}
-    BED_path = temp_path + 'BED.bed'
-    Log_path = temp_path + 'bowtie2.log'
+def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
+    dictionary = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0,
+                  'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0}
+    BED_path = f'{temp_path}BED.bed'
+    Log_path = f'{temp_path}bowtie2.log'
     with open(BED_path, 'r') as bed:
         for line in bed:
             line_list = line.split('\t')
-            if line_list[-1] not in dictionary:
-                dictionary[line_list[-1]] = 1
-            else:
-                dictionary[line_list[-1]] += 1
-                # counts all appearing variable Regions
-    # sorts the Dictionary with all variable regions for extracting the most
-    # probable one
-    dictionary = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
-    most_probable_V = dictionary[0]
-    most_probable_V_count = most_probable_V[1]
-    most_probable_V = most_probable_V[0].strip('\n')
-    probability = round((most_probable_V_count/count)*100, 2)
+            dictionary[line_list[-1].strip('\n')] += 1
+            # counts all appearing variable Regions
     with open(Log_path, 'r') as log:
         lines = log.readlines()
-        unaligned_count = lines[2].strip('\n\t ')
+        all_reads = int(lines[0].strip('\n\t ').split()[0])
+        unaligned_count = lines[-1].strip('\n\t ')
+    unaligned_count = unaligned_count.split()[0].replace('%', '')
+    unaligned_count = round((100 - float(unaligned_count)), 2)
     if file_type is None:
-        print(file_name + ': ')
-        print(unaligned_count)
-        print('The sequenced variable Region in this file is '
-              + most_probable_V + ' with a probability of '
-              + str(probability) + '%.')
+        print_output(dictionary, unaligned_count)
     else:
-        create_output(path, file_name, unaligned_count, most_probable_V,
-                      probability, dir_name, dir_path)
+        create_output(path, file_name, unaligned_count, dictionary,
+                      dir_name, dir_path, temp_path, all_reads, mode)
diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
@@ -1,32 +1,32 @@
 #!/usr/bin/python
 
 import argparse
+import os
 import interact_bowtie2
-import interact_samtools
 import interact_bedtools
-import os
 import Output_counter
 import files_manager
 
 
 def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
-             dir_path):
+             dir_path, mode, read2_file):
     interact_bowtie2.buildbowtie2(path)
     if file_type is not None:
         # The Programm bowtie2 is used to align the Reads to a reference
         # 16S database.
-        interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type)
+        aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file,
+                                                   path, temp_path, mode,
+                                                   file_type)
     else:
-        interact_bowtie2.mapbowtie2(file_path, path, temp_path,
-                                    file_type=' -q')
-    # convert the .sam file from bowtie2 to .bam for compability
-    count = interact_samtools.SAMtoBAM(path, temp_path)
+        aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file,
+                                                   path, temp_path, mode,
+                                                   file_type=' -q')
     # look which reads intersect with which variable Region
-    interact_bedtools.overlap(path, temp_path)
+    interact_bedtools.overlap(path, temp_path, aligned_path)
     # counts the Variable Regions that are found with bedtools and prints the
     # highest probable variable Region
-    Output_counter.count(count, temp_path, file_name, file_type, path,
-                         dir_name, dir_path)
+    Output_counter.count(temp_path, file_name, file_type, path,
+                         dir_name, dir_path, mode)
 
 
 def main():
@@ -41,7 +41,6 @@ def main():
                         help=('Filepath of the fastq file containing the '
                               'sequencences of which the variable regions '
                               'are to be verified.'))
-    # /homes/jgroos/Downloads/study_raw_data_14513_062122-034504/per_sample_FASTQ/147774/5001_S229_L001_R1_001.fastq
     args = parser.parse_args()
     path = files_manager.get_lib()
     temp_path = files_manager.tmp_dir(path, temp_path='')
@@ -51,28 +50,41 @@ def main():
     if args.fasta_file is None:
         for root, dirs, files in os.walk(args.dir_path, topdown=True):
             for file in files:
+                mode = 'unpaired'
+                read2_file = ''
                 if '_R2_' in file:
                     continue
                 if any(elements in file for elements in fastq_ext):
                     file_name = file
+                    read2_file = os.path.join(root, file.replace('_R1_',
+                                                                 '_R2_'))
+                    rev_exists = os.path.exists(read2_file)
+                    if '_R1_' in file_name and rev_exists is True:
+                        mode = 'paired'
                     dir_name = root
                     file_path = os.path.join(root, file)
                     file_type = ' -q'
                 elif any(elements in file for elements in fasta_ext):
                     file_name = file
+                    read2_file = os.path.join(root, file.replace('_R1_',
+                                                                 '_R2_'))
+                    rev_exists = os.path.exists(read2_file)
+                    if '_R1_' in file_name and rev_exists is True:
+                        mode = 'paired'
                     dir_name = root
                     file_path = os.path.join(root, file)
                     file_type = ' -f'
                 else:
                     continue
                 workflow(path, temp_path, file_path, file_type, file_name,
-                         dir_name, args.dir_path)
+                         dir_name, args.dir_path, mode, read2_file)
         if file_type is None:
             print('There were no FASTA or FASTQ files with 16S sequences '
                   'in this directory')
     else:
         workflow(path, temp_path, args.fasta_file, file_type,
-                 file_name='Your file', dir_name='', dir_path='')
+                 file_name='Your file', dir_name='', dir_path='',
+                 mode='unpaired', read2_file='')
     # quit() #keeps the tmp folder for troubleshooting
     files_manager.tmp_dir(path, temp_path)
 

diff --git a/vxdetector/files_manager.py b/vxdetector/files_manager.py
@@ -6,34 +6,46 @@
 
 
 def tmp_dir(path, temp_path):
-    """erstellt tmp verzeichnis; sollte es auch deleten"""
+    # erstellt tmp verzeichnis, fals es schon existiert
+    # wird es geloescht
     if os.path.exists(temp_path):
         shutil.rmtree(temp_path)
     else:
-        temp_path = tempfile.mkdtemp(suffix=None, prefix='tmp_files_',
-                                     dir=path) + '/'
+        temp_path = f"{tempfile.mkdtemp(prefix='tmp_files_', dir=path)}/"
+
     return temp_path
 
 
 def get_lib():
-    """findet die richtigen ordner/dateien, damit das programm funktioniert"""
-    programm_path = os.path.dirname(__file__) + '/'
-    if os.path.exists(programm_path+'Output/'):
+    # findet die richtigen ordner/dateien, damit das programm funktioniert
+    programm_path = f'{os.path.dirname(__file__)}'
+    programm_path = f"{programm_path.rsplit('/', 1)[0]}/"
+    if os.path.exists(f'{programm_path}Output/'):
         pass
     else:
-        os.mkdir(programm_path+'Output/')
-    if os.path.exists(programm_path+'Indexed_bt2/'):
-        if os.path.exists(programm_path+'Indexed_bt2/annoted_ref.bed'):
+        os.mkdir(f'{programm_path}Output/')
+    if os.path.exists(f'{programm_path}Indexed_bt2/'):
+        if os.path.exists(f'{programm_path}Indexed_bt2/annoted_ref.bed'):
+            pass
+        else:
+            print('ERROR: It seems the 16S variable region boundary'
+                  ' reference file is missing.')
+        if os.path.exists(f'{programm_path}Indexed_bt2/85_otus.fasta'):
+            pass
+        else:
+            print('ERROR: It seems the Greengenes "85_otus.fasta"'
+                  'file is missing.')
+        if os.path.exists(f'{programm_path}Indexed_bt2/85_otus_aligned.fasta'):
             pass
         else:
             print('ERROR: It seems the 16S variable region boundary reference '
                   'file is missing.')
-        if os.path.exists(programm_path+'Indexed_bt2/85_otus.fasta'):
+        if os.path.exists(f'{programm_path}Indexed_bt2/85_otus.fasta'):
             pass
         else:
             print('ERROR: It seems the Greengenes "85_otus.fasta" file is'
                   ' missing.')
-        if os.path.exists(programm_path+'Indexed_bt2/85_otus_aligned.fasta'):
+        if os.path.exists(f'{programm_path}Indexed_bt2/85_otus_aligned.fasta'):
             pass
         else:
             print('ERROR: It seems the Greengenes "85_otus_aligned.fasta" '

diff --git a/vxdetector/interact_bedtools.py b/vxdetector/interact_bedtools.py
@@ -3,14 +3,21 @@
 import os
 
 
-def overlap(path, temp_path):
+def no_overlap(path, temp_path, aligned_path):
     bedtools_filepath = '/usr/bin/bedtools'
-    BAM_filepath = temp_path + 'bam_sorted.bam'
-    S_ref = path + 'Indexed_bt2/annoted_ref.bed'
+    S_ref = f'{path}Indexed_bt2/annoted_ref.bed'
+    noOver_filepath = f'{temp_path}noOver.bed'
+    os.system(f'{bedtools_filepath} intersect -v -f 0.5 -a \
+                {aligned_path} -b {S_ref} > {noOver_filepath}')
+
+
+def overlap(path, temp_path, aligned_path):
+    bedtools_filepath = '/usr/bin/bedtools'
+    S_ref = f'{path}Indexed_bt2/annoted_ref.bed'
     # reference "genome" Created by using a aligned greengenes databank and
     # deleting all "-" while keeping track where the variable Regions are
-    BED_filepath = temp_path + 'BED.bed'
-    # Intersects the .bam file with the reference
-    cmd = bedtools_filepath + ' intersect -f 0.5 -wb -a ' + BAM_filepath + \
-        ' -b ' + S_ref + ' -bed > ' + BED_filepath
-    os.system(cmd)
+    BED_filepath = f'{temp_path}BED.bed'
+    # Intersects the aligned bowtie2 Output file with the reference
+    os.system(f'{bedtools_filepath} intersect -f 0.5 -a {S_ref} -b \
+                {aligned_path} > {BED_filepath}')
+    no_overlap(path, temp_path, aligned_path)