Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[run]
relative_files = True
11 changes: 8 additions & 3 deletions .github/workflows/github_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
build-linux:
runs-on: ubuntu-latest

steps:
- name: Checkout Repo
uses: actions/checkout@v2
Expand All @@ -23,8 +23,13 @@ jobs:
run: |
$CONDA/bin/flake8 vxdetector/
- name: run python tests
run: nosetests vxdetector --with-coverage --with-doctest
- name: Coveralls
run: |
$CONDA/bin/nosetests vxdetector --with-doctest --with-coverage
- name: convert coverage
run: |
$CONDA/bin/coverage lcov
- name: send coverage report
uses: coverallsapp/github-action@master
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
path-to-lcov: "coverage.lcov"
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.coverage
coverage.lcov
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ channels:
dependencies:
- flake8
- nose
- coverage
- coverage >= 6 # to ensure lcov option is available
43 changes: 24 additions & 19 deletions vxdetector/Output_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,28 @@ def directory_navi(file_name, path, dir_name, dir_path):
return file_name, dir_name, dir_path



def create_output(path, file_name, unaligned_count, most_probable_V, probability, dir_name, dir_path):
file_name, dir_name, dir_path = directory_navi(file_name, path, dir_name, dir_path)
def create_output(path, file_name, unaligned_count, most_probable_V,
probability, dir_name, dir_path):
file_name, dir_name, dir_path = directory_navi(file_name, path,
dir_name, dir_path)
unaligned_count = unaligned_count.split(' ')[1]
unaligned_count = float(unaligned_count.strip('()%'))
new_file = dir_path + dir_name +'.csv'
new_file = dir_path + dir_name + '.csv'
if os.path.exists(new_file):
header = True
else:
header = False
with open(new_file, 'a', newline='') as o:
fieldnames = ['Read-file', 'Unaligned Reads [%]', 'Sequenced variable region', 'Probability [%]']
fieldnames = ['Read-file', 'Unaligned Reads [%]',
'Sequenced variable region', 'Probability [%]']
writer = csv.DictWriter(o, fieldnames=fieldnames)
if header == False:
if header is False:
writer.writeheader()
writer.writerow({'Read-file' : file_name, 'Unaligned Reads [%]' : unaligned_count, 'Sequenced variable region' : most_probable_V, 'Probability [%]' : probability})


writer.writerow({
'Read-file': file_name,
'Unaligned Reads [%]': unaligned_count,
'Sequenced variable region': most_probable_V,
'Probability [%]': probability})


def count(count, temp_path, file_name, file_type, path, dir_name, dir_path):
Expand All @@ -49,22 +53,23 @@ def count(count, temp_path, file_name, file_type, path, dir_name, dir_path):
dictionary[line_list[-1]] = 1
else:
dictionary[line_list[-1]] += 1
#counts all appearing variable Regions
dictionary = sorted(dictionary.items(), key=lambda x:x[1], reverse=True) #sorts the Dictionary with all variable regions for extracting the most probable one
# counts all appearing variable Regions
# sorts the Dictionary with all variable regions for extracting the most
# probable one
dictionary = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
most_probable_V = dictionary[0]
most_probable_V_count = most_probable_V[1]
most_probable_V = most_probable_V[0].strip('\n')
probability = round((most_probable_V_count/count)*100, 2)
with open(Log_path, 'r') as log:
lines = log.readlines()
unaligned_count = lines[2].strip('\n\t ')
if file_type == None:
print( file_name + ': ')
if file_type is None:
print(file_name + ': ')
print(unaligned_count)
print('The sequenced variable Region in this file is ' + most_probable_V + ' with a probability of ' + str(probability) + '%.')
print('The sequenced variable Region in this file is '
+ most_probable_V + ' with a probability of '
+ str(probability) + '%.')
else:
create_output(path, file_name, unaligned_count, most_probable_V, probability, dir_name, dir_path)




create_output(path, file_name, unaligned_count, most_probable_V,
probability, dir_name, dir_path)
59 changes: 40 additions & 19 deletions vxdetector/VXdetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,47 @@
import Output_counter
import files_manager

def workflow(path, temp_path, file_path, file_type, file_name, dir_name, dir_path):

def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
dir_path):
interact_bowtie2.buildbowtie2(path)
if file_type != None:
interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type) #The Programm bowtie2 is used to align the Reads to a reference 16S database.
if file_type is not None:
# The Programm bowtie2 is used to align the Reads to a reference
# 16S database.
interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type)
else:
interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type=' -q')
count = interact_samtools.SAMtoBAM(path, temp_path) #convert the .sam file from bowtie2 to .bam for compability
interact_bedtools.overlap(path, temp_path) #look which reads intersect with which variable Region
Output_counter.count(count, temp_path, file_name, file_type, path, dir_name, dir_path) #counts the Variable Regions that are found with bedtools and prints the highest probable variable Region
interact_bowtie2.mapbowtie2(file_path, path, temp_path,
file_type=' -q')
# convert the .sam file from bowtie2 to .bam for compability
count = interact_samtools.SAMtoBAM(path, temp_path)
# look which reads intersect with which variable Region
interact_bedtools.overlap(path, temp_path)
# counts the Variable Regions that are found with bedtools and prints the
# highest probable variable Region
Output_counter.count(count, temp_path, file_name, file_type, path,
dir_name, dir_path)


def main():
parser = argparse.ArgumentParser(prog= 'VX detector', description= 'This programm tries to find which variable region of the 16S sequence was sequencend')
parser.add_argument('-d', '--directory', dest='dir_path', help='Directory path of the directory containing multiple fastq or fasta files.' )
parser.add_argument('-sf', '--single-file', dest='fasta_file', default=None,help= 'Filepath of the fastq file containing the sequencences of which the variable regions are to be verified.')
#/homes/jgroos/Downloads/study_raw_data_14513_062122-034504/per_sample_FASTQ/147774/5001_S229_L001_R1_001.fastq
parser = argparse.ArgumentParser(prog='VX detector', description=(
'This programm tries to find which variable region of the 16S '
'sequence was sequencend'))
parser.add_argument('-d', '--directory', dest='dir_path',
help=('Directory path of the directory containing '
'multiple fastq or fasta files.'))
parser.add_argument('-sf', '--single-file', dest='fasta_file',
default=None,
help=('Filepath of the fastq file containing the '
'sequencences of which the variable regions '
'are to be verified.'))
# /homes/jgroos/Downloads/study_raw_data_14513_062122-034504/per_sample_FASTQ/147774/5001_S229_L001_R1_001.fastq
args = parser.parse_args()
path = files_manager.get_lib()
temp_path = files_manager.tmp_dir(path, temp_path='')
file_type = None
fasta_ext = ('.fasta', '.fa', '.ffa', '.ffn', '.faa', '.frn')
fastq_ext = ('.fq', '.fastq',)
if args.fasta_file == None:
if args.fasta_file is None:
for root, dirs, files in os.walk(args.dir_path, topdown=True):
for file in files:
if '_R2_' in file:
Expand All @@ -47,14 +65,17 @@ def main():
file_type = ' -f'
else:
continue
workflow(path, temp_path, file_path, file_type, file_name, dir_name, args.dir_path)
if file_type == None:
print('There were no FASTA or FASTQ files with 16S sequences in this directory')
workflow(path, temp_path, file_path, file_type, file_name,
dir_name, args.dir_path)
if file_type is None:
print('There were no FASTA or FASTQ files with 16S sequences '
'in this directory')
else:
workflow(path, temp_path, args.fasta_file, file_type, file_name='Your file', dir_name='', dir_path='')
#quit() #keeps the tmp folder for troubleshooting
workflow(path, temp_path, args.fasta_file, file_type,
file_name='Your file', dir_name='', dir_path='')
# quit() #keeps the tmp folder for troubleshooting
files_manager.tmp_dir(path, temp_path)

if __name__ == '__main__':
main()

if __name__ == '__main__':
main()
25 changes: 15 additions & 10 deletions vxdetector/files_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@
import tempfile
import shutil

def tmp_dir(path, temp_path): #erstellt tmp verzeichnis; sollte es auch deleten

def tmp_dir(path, temp_path):
"""erstellt tmp verzeichnis; sollte es auch deleten"""
if os.path.exists(temp_path):
shutil.rmtree(temp_path)
else:
temp_path = tempfile.mkdtemp(suffix=None, prefix='tmp_files_', dir=path) + '/'


temp_path = tempfile.mkdtemp(suffix=None, prefix='tmp_files_',
dir=path) + '/'
return temp_path

def get_lib(): #findet die richtigen ordner/dateien, damit das programm funktioniert

def get_lib():
"""findet die richtigen ordner/dateien, damit das programm funktioniert"""
programm_path = os.path.dirname(__file__) + '/'
if os.path.exists(programm_path+'Output/'):
pass
Expand All @@ -23,17 +26,19 @@ def get_lib(): #findet die richtigen ordner/dateien, damit das programm funktion
if os.path.exists(programm_path+'Indexed_bt2/annoted_ref.bed'):
pass
else:
print('ERROR: It seems the 16S variable region boundary reference file is missing.')
print('ERROR: It seems the 16S variable region boundary reference '
'file is missing.')
if os.path.exists(programm_path+'Indexed_bt2/85_otus.fasta'):
pass
else:
print('ERROR: It seems the Greengenes "85_otus.fasta" file is missing.')
print('ERROR: It seems the Greengenes "85_otus.fasta" file is'
' missing.')
if os.path.exists(programm_path+'Indexed_bt2/85_otus_aligned.fasta'):
pass
else:
print('ERROR: It seems the Greengenes "85_otus_aligned.fasta" file is missing.')
print('ERROR: It seems the Greengenes "85_otus_aligned.fasta" '
'file is missing.')
else:
print('ERROR: It seems the "Indexed_bt2" directory is missing.')

return programm_path

18 changes: 11 additions & 7 deletions vxdetector/interact_bedtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@

import os


def overlap(path, temp_path):
bedtools_filepath = '/usr/bin/bedtools'
BAM_filepath = temp_path + 'bam_sorted.bam'
S_ref = path + 'Indexed_bt2/annoted_ref.bed'
#reference "genome" Created by using a aligned greengenes databank and deleting all "-" while keeping track where the variable Regions are
BED_filepath = temp_path + 'BED.bed'
cmd = bedtools_filepath + ' intersect -f 0.5 -wb -a ' + BAM_filepath + ' -b ' + S_ref + ' -bed > ' + BED_filepath #Intersects the .bam file with the reference
os.system(cmd)
bedtools_filepath = '/usr/bin/bedtools'
BAM_filepath = temp_path + 'bam_sorted.bam'
S_ref = path + 'Indexed_bt2/annoted_ref.bed'
# reference "genome" Created by using a aligned greengenes databank and
# deleting all "-" while keeping track where the variable Regions are
BED_filepath = temp_path + 'BED.bed'
# Intersects the .bam file with the reference
cmd = bedtools_filepath + ' intersect -f 0.5 -wb -a ' + BAM_filepath + \
' -b ' + S_ref + ' -bed > ' + BED_filepath
os.system(cmd)
26 changes: 16 additions & 10 deletions vxdetector/interact_bowtie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,28 @@
import os
from os.path import exists


def buildbowtie2(path):
bowtie2_path = '/vol/software/bin/bowtie2-build'
input_ref = path + 'Indexed_bt2/85_otus.fasta' #reference genome greengenes is used
output_path = path + 'Indexed_bt2/bowtie2'
if exists(output_path + '.1.bt2'):
pass #only builds an index for the alignment if there isn't already one
else:
cmd = bowtie2_path + ' -f ' + input_ref + ' ' + output_path
os.system(cmd) #if a indexfile is missing a new index is build
bowtie2_path = '/vol/software/bin/bowtie2-build'
# reference genome greengenes is used
input_ref = path + 'Indexed_bt2/85_otus.fasta'
output_path = path + 'Indexed_bt2/bowtie2'
if exists(output_path + '.1.bt2'):
# only builds an index for the alignment if there isn't already one
pass
else:
cmd = bowtie2_path + ' -f ' + input_ref + ' ' + output_path
# if a indexfile is missing a new index is build
os.system(cmd)


def mapbowtie2(fasta_file, path, temp_path, file_type):
bowtie2_path = '/vol/software/bin/bowtie2'
samtools_path = '/usr/bin/samtools'
index_path = path + 'Indexed_bt2/bowtie2'
log_path = temp_path + 'bowtie2.log'
BAM_path = temp_path + 'BAM.bam'
cmd = bowtie2_path + ' -x ' + index_path + file_type + ' -U ' + fasta_file + ' --fast 2> ' + log_path + ' | ' + samtools_path + ' view -b -S -F 4 -o ' + BAM_path
cmd = bowtie2_path + ' -x ' + index_path + file_type + ' -U ' \
+ fasta_file + ' --fast 2> ' + log_path + ' | ' + samtools_path \
+ ' view -b -S -F 4 -o ' + BAM_path
os.system(cmd)

21 changes: 14 additions & 7 deletions vxdetector/interact_samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,41 @@
import os
import subprocess


def SAMtoBAM(path, temp_path):
samtools_filepath = '/usr/bin/samtools'
BAM_filepath = temp_path + 'BAM.bam'
BAM_sorted_filepath = temp_path + 'bam_sorted.bam'
cmd = samtools_filepath + ' sort ' + BAM_filepath + ' -o ' + BAM_sorted_filepath #sorts the .bam file
# sorts the .bam file
cmd = samtools_filepath + ' sort ' + BAM_filepath + ' -o ' \
+ BAM_sorted_filepath
os.system(cmd)
cmd = [samtools_filepath,'view','-c',BAM_sorted_filepath]
cmd = [samtools_filepath, 'view', '-c', BAM_sorted_filepath]
count = subprocess.run(cmd, stdout=subprocess.PIPE)
count = int(count.stdout.decode('utf-8'))
cmd = samtools_filepath + ' index ' + BAM_sorted_filepath #indexes the .bam file.
# indexes the .bam file.
cmd = samtools_filepath + ' index ' + BAM_sorted_filepath
os.system(cmd)

return count

"""

"""
def count(path, temp_path):
samtools_filepath = '/usr/bin/samtools'
BED_filepath = temp_path + 'BED.bed'
S_ref = path + 'Indexed_bt2/annoted_ref.bed'
BAM_filepath = temp_path + 'bam_sorted.bam'
pileup_path = temp_path + 'pileup.txt'
pileup_test = temp_path + 'test.txt'
cmd = samtools_filepath + ' mpileup ' + BAM_filepath + ' -l ' + S_ref + ' -o ' + pileup_path + ' 2> /dev/null'
cmd = samtools_filepath + ' mpileup ' + BAM_filepath + ' -l ' + S_ref + \
' -o ' + pileup_path + ' 2> /dev/null'
os.system(cmd)
with open(pileup_path, 'r') as pu:
for line in pu:
line_list = line.split('\t')
with open(pileup_test, 'a') as t:
words = line_list[0] + '\t' + line_list[1] + '\t' + line_list[3] + '\n'
words = line_list[0] + '\t' + line_list[1] + '\t' + \
line_list[3] + '\n'
t.write(words)
"""