diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..4edd7b1
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+relative_files = True
diff --git a/.github/workflows/github_tests.yml b/.github/workflows/github_tests.yml
index 28aeb59..d29799a 100644
--- a/.github/workflows/github_tests.yml
+++ b/.github/workflows/github_tests.yml
@@ -9,7 +9,7 @@ on:
 jobs:
   build-linux:
     runs-on: ubuntu-latest
-    
+
     steps:
     - name: Checkout Repo
       uses: actions/checkout@v2
@@ -23,8 +23,13 @@ jobs:
       run: |
         $CONDA/bin/flake8 vxdetector/
     - name: run python tests
-      run: nosetests vxdetector --with-coverage --with-doctest
-    - name: Coveralls
+      run: |
+        $CONDA/bin/nosetests vxdetector --with-doctest --with-coverage
+    - name: convert coverage
+      run: |
+        $CONDA/bin/coverage lcov
+    - name: send coverage report
       uses: coverallsapp/github-action@master
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
+        path-to-lcov: "coverage.lcov"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6bc33eb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.coverage
+coverage.lcov
diff --git a/environment.yml b/environment.yml
index 8ded2b2..32ec06c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -6,4 +6,4 @@ channels:
 dependencies:
   - flake8
   - nose
-  - coverage
+  - coverage >= 6  # to ensure lcov option is available
diff --git a/vxdetector/Output_counter.py b/vxdetector/Output_counter.py
index 5fc2015..2fc4aef 100644
--- a/vxdetector/Output_counter.py
+++ b/vxdetector/Output_counter.py
@@ -18,24 +18,28 @@ def directory_navi(file_name, path, dir_name, dir_path):
     return file_name, dir_name, dir_path
 
 
-
-def create_output(path, file_name, unaligned_count, most_probable_V, probability, dir_name, dir_path):
-    file_name, dir_name, dir_path = directory_navi(file_name, path, dir_name, dir_path)
+def create_output(path, file_name, unaligned_count, most_probable_V,
+                  probability, dir_name, dir_path):
+    file_name, dir_name, dir_path = directory_navi(file_name, path,
+                                                   dir_name, dir_path)
     unaligned_count = unaligned_count.split(' ')[1]
     unaligned_count = float(unaligned_count.strip('()%'))
-    new_file = dir_path + dir_name +'.csv'
+    new_file = dir_path + dir_name + '.csv'
     if os.path.exists(new_file):
         header = True
     else:
         header = False
     with open(new_file, 'a', newline='') as o:
-        fieldnames = ['Read-file', 'Unaligned Reads [%]', 'Sequenced variable region', 'Probability [%]']
+        fieldnames = ['Read-file', 'Unaligned Reads [%]',
+                      'Sequenced variable region', 'Probability [%]']
         writer = csv.DictWriter(o, fieldnames=fieldnames)
-        if header == False:
+        if header is False:
             writer.writeheader()
-        writer.writerow({'Read-file' : file_name, 'Unaligned Reads [%]' : unaligned_count, 'Sequenced variable region' : most_probable_V, 'Probability [%]' : probability})
-
-
+        writer.writerow({
+            'Read-file': file_name,
+            'Unaligned Reads [%]': unaligned_count,
+            'Sequenced variable region': most_probable_V,
+            'Probability [%]': probability})
 
 
 def count(count, temp_path, file_name, file_type, path, dir_name, dir_path):
@@ -49,8 +53,10 @@
             dictionary[line_list[-1]] = 1
         else:
             dictionary[line_list[-1]] += 1
-    #counts all appearing variable Regions
-    dictionary = sorted(dictionary.items(), key=lambda x:x[1], reverse=True) #sorts the Dictionary with all variable regions for extracting the most probable one
+    # counts all appearing variable Regions
+    # sorts the Dictionary with all variable regions for extracting the most
+    # probable one
+    dictionary = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
     most_probable_V = dictionary[0]
     most_probable_V_count = most_probable_V[1]
     most_probable_V = most_probable_V[0].strip('\n')
@@ -58,13 +64,12 @@
     with open(Log_path, 'r') as log:
         lines = log.readlines()
     unaligned_count = lines[2].strip('\n\t ')
-    if file_type == None:
-        print( file_name + ': ')
+    if file_type is None:
+        print(file_name + ': ')
         print(unaligned_count)
-        print('The sequenced variable Region in this file is ' + most_probable_V + ' with a probability of ' + str(probability) + '%.')
+        print('The sequenced variable Region in this file is ' +
+              most_probable_V + ' with a probability of ' +
+              str(probability) + '%.')
     else:
-        create_output(path, file_name, unaligned_count, most_probable_V, probability, dir_name, dir_path)
-
-
-
-
+        create_output(path, file_name, unaligned_count, most_probable_V,
+                      probability, dir_name, dir_path)
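
The reworked CI step above runs nosetests with "--with-doctest", so doctests in the package are now executed as tests. As a minimal, hypothetical sketch (not part of this patch), the percentage parsing done in create_output() could be pulled into a doctest-covered helper; the function name and the example log line below are illustrative only, the line merely mimics the bowtie2 "aligned 0 times" summary that create_output() slices up.

def parse_unaligned_percent(log_line):
    """Extract the unaligned-read percentage from a bowtie2 summary line.

    >>> parse_unaligned_percent('1234 (5.67%) aligned 0 times')
    5.67
    """
    # same parsing as create_output(): take the second whitespace-separated
    # token and strip the parentheses and percent sign before converting
    return float(log_line.split(' ')[1].strip('()%'))
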
diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
index 3adbbdb..0b41e25 100644
--- a/vxdetector/VXdetector.py
+++ b/vxdetector/VXdetector.py
@@ -8,29 +8,47 @@
 import Output_counter
 import files_manager
 
-def workflow(path, temp_path, file_path, file_type, file_name, dir_name, dir_path):
+
+def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
+             dir_path):
     interact_bowtie2.buildbowtie2(path)
-    if file_type != None:
-        interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type) #The Programm bowtie2 is used to align the Reads to a reference 16S database.
+    if file_type is not None:
+        # The program bowtie2 is used to align the reads to a reference
+        # 16S database.
+        interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type)
     else:
-        interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type=' -q')
-    count = interact_samtools.SAMtoBAM(path, temp_path) #convert the .sam file from bowtie2 to .bam for compability
-    interact_bedtools.overlap(path, temp_path) #look which reads intersect with which variable Region
-    Output_counter.count(count, temp_path, file_name, file_type, path, dir_name, dir_path) #counts the Variable Regions that are found with bedtools and prints the highest probable variable Region
+        interact_bowtie2.mapbowtie2(file_path, path, temp_path,
+                                    file_type=' -q')
+    # convert the .sam file from bowtie2 to .bam for compatibility
+    count = interact_samtools.SAMtoBAM(path, temp_path)
+    # look which reads intersect with which variable region
+    interact_bedtools.overlap(path, temp_path)
+    # counts the variable regions that are found with bedtools and prints the
+    # most probable variable region
+    Output_counter.count(count, temp_path, file_name, file_type, path,
+                         dir_name, dir_path)
 
 
 def main():
-    parser = argparse.ArgumentParser(prog= 'VX detector', description= 'This programm tries to find which variable region of the 16S sequence was sequencend')
-    parser.add_argument('-d', '--directory', dest='dir_path', help='Directory path of the directory containing multiple fastq or fasta files.')
-    parser.add_argument('-sf', '--single-file', dest='fasta_file', default=None,help= 'Filepath of the fastq file containing the sequencences of which the variable regions are to be verified.')
-    #/homes/jgroos/Downloads/study_raw_data_14513_062122-034504/per_sample_FASTQ/147774/5001_S229_L001_R1_001.fastq
+    parser = argparse.ArgumentParser(prog='VX detector', description=(
+        'This program tries to find which variable region of the 16S '
+        'sequence was sequenced'))
+    parser.add_argument('-d', '--directory', dest='dir_path',
+                        help=('Directory path of the directory containing '
+                              'multiple fastq or fasta files.'))
+    parser.add_argument('-sf', '--single-file', dest='fasta_file',
+                        default=None,
+                        help=('Filepath of the fastq file containing the '
+                              'sequences of which the variable regions '
+                              'are to be verified.'))
+    # /homes/jgroos/Downloads/study_raw_data_14513_062122-034504/per_sample_FASTQ/147774/5001_S229_L001_R1_001.fastq
     args = parser.parse_args()
     path = files_manager.get_lib()
     temp_path = files_manager.tmp_dir(path, temp_path='')
     file_type = None
     fasta_ext = ('.fasta', '.fa', '.ffa', '.ffn', '.faa', '.frn')
     fastq_ext = ('.fq', '.fastq',)
-    if args.fasta_file == None:
+    if args.fasta_file is None:
         for root, dirs, files in os.walk(args.dir_path, topdown=True):
             for file in files:
                 if '_R2_' in file:
@@ -47,14 +65,17 @@ def main():
                     file_type = ' -f'
                 else:
                     continue
-                workflow(path, temp_path, file_path, file_type, file_name, dir_name, args.dir_path)
-        if file_type == None:
-            print('There were no FASTA or FASTQ files with 16S sequences in this directory')
+                workflow(path, temp_path, file_path, file_type, file_name,
+                         dir_name, args.dir_path)
+        if file_type is None:
+            print('There were no FASTA or FASTQ files with 16S sequences '
+                  'in this directory')
     else:
-        workflow(path, temp_path, args.fasta_file, file_type, file_name='Your file', dir_name='', dir_path='')
-    #quit() #keeps the tmp folder for troubleshooting
+        workflow(path, temp_path, args.fasta_file, file_type,
+                 file_name='Your file', dir_name='', dir_path='')
+    # quit() #keeps the tmp folder for troubleshooting
     files_manager.tmp_dir(path, temp_path)
 
 
-if __name__ == '__main__':
-    main()
+if __name__ == '__main__':
+    main()
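
main() above picks the bowtie2 input flag from the read-file extension (' -q' for FASTQ, ' -f' for FASTA), presumably via endswith()-style checks against the fasta_ext/fastq_ext tuples defined above; the exact conditions sit outside this hunk. A small sketch of that dispatch as a standalone helper, under the assumption that the same extension tuples apply; bowtie2_input_flag is a hypothetical name, not something the patch introduces.

def bowtie2_input_flag(file_name,
                       fasta_ext=('.fasta', '.fa', '.ffa', '.ffn',
                                  '.faa', '.frn'),
                       fastq_ext=('.fq', '.fastq')):
    """Return ' -q' for FASTQ files, ' -f' for FASTA files, else None."""
    if file_name.endswith(fastq_ext):
        return ' -q'
    if file_name.endswith(fasta_ext):
        return ' -f'
    return None


# e.g. bowtie2_input_flag('5001_S229_L001_R1_001.fastq') returns ' -q'
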
diff --git a/vxdetector/files_manager.py b/vxdetector/files_manager.py
index a78b157..c402d6b 100644
--- a/vxdetector/files_manager.py
+++ b/vxdetector/files_manager.py
@@ -4,16 +4,19 @@
 import tempfile
 import shutil
 
-def tmp_dir(path, temp_path): #erstellt tmp verzeichnis; sollte es auch deleten
+
+def tmp_dir(path, temp_path):
+    """Creates the tmp directory; should also delete it."""
    if os.path.exists(temp_path):
        shutil.rmtree(temp_path)
    else:
-        temp_path = tempfile.mkdtemp(suffix=None, prefix='tmp_files_', dir=path) + '/'
-
-
+        temp_path = tempfile.mkdtemp(suffix=None, prefix='tmp_files_',
+                                     dir=path) + '/'
     return temp_path
 
-def get_lib(): #findet die richtigen ordner/dateien, damit das programm funktioniert
+
+def get_lib():
+    """Finds the right folders/files needed for the program to work."""
     programm_path = os.path.dirname(__file__) + '/'
     if os.path.exists(programm_path+'Output/'):
         pass
@@ -23,17 +26,19 @@ def get_lib(): #findet die richtigen ordner/dateien, damit das programm funktion
         if os.path.exists(programm_path+'Indexed_bt2/annoted_ref.bed'):
             pass
         else:
-            print('ERROR: It seems the 16S variable region boundary reference file is missing.')
+            print('ERROR: It seems the 16S variable region boundary reference '
+                  'file is missing.')
         if os.path.exists(programm_path+'Indexed_bt2/85_otus.fasta'):
             pass
         else:
-            print('ERROR: It seems the Greengenes "85_otus.fasta" file is missing.')
+            print('ERROR: It seems the Greengenes "85_otus.fasta" file is'
+                  ' missing.')
         if os.path.exists(programm_path+'Indexed_bt2/85_otus_aligned.fasta'):
             pass
         else:
-            print('ERROR: It seems the Greengenes "85_otus_aligned.fasta" file is missing.')
+            print('ERROR: It seems the Greengenes "85_otus_aligned.fasta" '
+                  'file is missing.')
     else:
         print('ERROR: It seems the "Indexed_bt2" directory is missing.')
-    
+
     return programm_path
-
diff --git a/vxdetector/interact_bedtools.py b/vxdetector/interact_bedtools.py
index 29a64e6..0d77a91 100644
--- a/vxdetector/interact_bedtools.py
+++ b/vxdetector/interact_bedtools.py
@@ -2,11 +2,15 @@
 import os
 
+
 def overlap(path, temp_path):
-    bedtools_filepath = '/usr/bin/bedtools'
-    BAM_filepath = temp_path + 'bam_sorted.bam'
-    S_ref = path + 'Indexed_bt2/annoted_ref.bed'
-    #reference "genome" Created by using a aligned greengenes databank and deleting all "-" while keeping track where the variable Regions are
-    BED_filepath = temp_path + 'BED.bed'
-    cmd = bedtools_filepath + ' intersect -f 0.5 -wb -a ' + BAM_filepath + ' -b ' + S_ref + ' -bed > ' + BED_filepath #Intersects the .bam file with the reference
-    os.system(cmd)
+    bedtools_filepath = '/usr/bin/bedtools'
+    BAM_filepath = temp_path + 'bam_sorted.bam'
+    S_ref = path + 'Indexed_bt2/annoted_ref.bed'
+    # reference "genome" created by using an aligned Greengenes databank and
+    # deleting all "-" while keeping track of where the variable regions are
+    BED_filepath = temp_path + 'BED.bed'
+    # intersects the .bam file with the reference
+    cmd = bedtools_filepath + ' intersect -f 0.5 -wb -a ' + BAM_filepath + \
+        ' -b ' + S_ref + ' -bed > ' + BED_filepath
+    os.system(cmd)
diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index ff22552..b00868d 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -3,15 +3,20 @@
 import os
 from os.path import exists
 
+
 def buildbowtie2(path):
-    bowtie2_path = '/vol/software/bin/bowtie2-build'
-    input_ref = path + 'Indexed_bt2/85_otus.fasta' #reference genome greengenes is used
-    output_path = path + 'Indexed_bt2/bowtie2'
-    if exists(output_path + '.1.bt2'):
-        pass #only builds an index for the alignment if there isn't already one
-    else:
-        cmd = bowtie2_path + ' -f ' + input_ref + ' ' + output_path
-        os.system(cmd) #if a indexfile is missing a new index is build
+    bowtie2_path = '/vol/software/bin/bowtie2-build'
+    # the Greengenes reference genome is used
+    input_ref = path + 'Indexed_bt2/85_otus.fasta'
+    output_path = path + 'Indexed_bt2/bowtie2'
+    if exists(output_path + '.1.bt2'):
+        # only builds an index for the alignment if there isn't already one
+        pass
+    else:
+        cmd = bowtie2_path + ' -f ' + input_ref + ' ' + output_path
+        # if an index file is missing a new index is built
+        os.system(cmd)
 
+
 def mapbowtie2(fasta_file, path, temp_path, file_type):
     bowtie2_path = '/vol/software/bin/bowtie2'
@@ -19,6 +24,7 @@ def mapbowtie2(fasta_file, path, temp_path, file_type):
     index_path = path + 'Indexed_bt2/bowtie2'
     log_path = temp_path + 'bowtie2.log'
     BAM_path = temp_path + 'BAM.bam'
-    cmd = bowtie2_path + ' -x ' + index_path + file_type + ' -U ' + fasta_file + ' --fast 2> ' + log_path + ' | ' + samtools_path + ' view -b -S -F 4 -o ' + BAM_path
+    cmd = bowtie2_path + ' -x ' + index_path + file_type + ' -U ' \
+          + fasta_file + ' --fast 2> ' + log_path + ' | ' + samtools_path \
+          + ' view -b -S -F 4 -o ' + BAM_path
     os.system(cmd)
-
\ No newline at end of file
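
mapbowtie2() above still hands one long shell string to os.system() to run bowtie2 and pipe its output through samtools. Purely as an illustrative alternative (an assumption, not what the patch does), the same pipe could be built with subprocess so that paths containing spaces cannot break the command; the function name, argument list and default binary paths below are hypothetical.

import subprocess


def mapbowtie2_subprocess(fasta_file, index_path, file_type, log_path,
                          bam_path, bowtie2='/vol/software/bin/bowtie2',
                          samtools='/usr/bin/samtools'):
    """Run bowtie2 and pipe its SAM output into 'samtools view -b -S -F 4'."""
    with open(log_path, 'w') as log:
        # bowtie2 prints its alignment summary to stderr; capture it as log.
        # file_type arrives as ' -q' or ' -f', so strip the padding space.
        aligner = subprocess.Popen(
            [bowtie2, '-x', index_path, file_type.strip(), '-U', fasta_file,
             '--fast'],
            stdout=subprocess.PIPE, stderr=log)
        # keep only mapped reads (-F 4) and write them out as BAM
        subprocess.run(
            [samtools, 'view', '-b', '-S', '-F', '4', '-o', bam_path],
            stdin=aligner.stdout, check=True)
        aligner.stdout.close()
        aligner.wait()
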
diff --git a/vxdetector/interact_samtools.py b/vxdetector/interact_samtools.py
index fc9b774..4ca9478 100644
--- a/vxdetector/interact_samtools.py
+++ b/vxdetector/interact_samtools.py
@@ -3,21 +3,26 @@
 import os
 import subprocess
 
+
 def SAMtoBAM(path, temp_path):
     samtools_filepath = '/usr/bin/samtools'
     BAM_filepath = temp_path + 'BAM.bam'
     BAM_sorted_filepath = temp_path + 'bam_sorted.bam'
-    cmd = samtools_filepath + ' sort ' + BAM_filepath + ' -o ' + BAM_sorted_filepath #sorts the .bam file
+    # sorts the .bam file
+    cmd = samtools_filepath + ' sort ' + BAM_filepath + ' -o ' \
+        + BAM_sorted_filepath
     os.system(cmd)
-    cmd = [samtools_filepath,'view','-c',BAM_sorted_filepath]
+    cmd = [samtools_filepath, 'view', '-c', BAM_sorted_filepath]
     count = subprocess.run(cmd, stdout=subprocess.PIPE)
     count = int(count.stdout.decode('utf-8'))
-    cmd = samtools_filepath + ' index ' + BAM_sorted_filepath #indexes the .bam file.
+    # indexes the .bam file.
+    cmd = samtools_filepath + ' index ' + BAM_sorted_filepath
     os.system(cmd)
-
+
     return count
 
-"""
+
+"""
 def count(path, temp_path):
     samtools_filepath = '/usr/bin/samtools'
     BED_filepath = temp_path + 'BED.bed'
@@ -25,12 +30,14 @@ def count(path, temp_path):
     BAM_filepath = temp_path + 'bam_sorted.bam'
     pileup_path = temp_path + 'pileup.txt'
     pileup_test = temp_path + 'test.txt'
-    cmd = samtools_filepath + ' mpileup ' + BAM_filepath + ' -l ' + S_ref + ' -o ' + pileup_path + ' 2> /dev/null'
+    cmd = samtools_filepath + ' mpileup ' + BAM_filepath + ' -l ' + S_ref + \
+        ' -o ' + pileup_path + ' 2> /dev/null'
     os.system(cmd)
     with open(pileup_path, 'r') as pu:
         for line in pu:
             line_list = line.split('\t')
             with open(pileup_test, 'a') as t:
-                words = line_list[0] + '\t' + line_list[1] + '\t' + line_list[3] + '\n'
+                words = line_list[0] + '\t' + line_list[1] + '\t' + \
+                    line_list[3] + '\n'
                 t.write(words)
 """
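
SAMtoBAM() above reads the mapped-read count from 'samtools view -c' but never checks the exit status, so a failing samtools call would likely feed int() an empty string. A small sketch of the same call with check=True and text=True (text= was added to subprocess.run in Python 3.7); count_mapped_reads is an illustrative name, not part of the patch.

import subprocess


def count_mapped_reads(bam_sorted_path, samtools='/usr/bin/samtools'):
    """Return the number of alignments stored in a sorted BAM file."""
    result = subprocess.run([samtools, 'view', '-c', bam_sorted_path],
                            stdout=subprocess.PIPE, check=True, text=True)
    # 'samtools view -c' prints a single integer followed by a newline
    return int(result.stdout)
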