Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10,778 changes: 5,389 additions & 5,389 deletions Indexed_bt2/annoted_ref.bed

Large diffs are not rendered by default.

30,528 changes: 0 additions & 30,528 deletions Indexed_bt2/fail.bed

This file was deleted.

3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@ Requirements:
Module: os
Module: subprocess
Module: argparse
Module: itertools
Module: tempfile
Module: shutil

To Do List:
-Currently the used programms are refrenced by hardcoded paths
-> not usable on PCs where these programms are saved somewhere else
-Currently only the single most probable region is shown.
-> Modify in a way that reads spanning multiple regions are shown correctly
-Adding other functions such as primer verification or making the programm more efficiant
142 changes: 104 additions & 38 deletions vxdetector/Output_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@

import os
import csv
from itertools import (takewhile, repeat)


def directory_navi(file_name, path, dir_name, dir_path):
dir_path = dir_name.replace(dir_path, '', 1)
dir_name = dir_path.split('/')[-1]
if dir_name.split('/')[-1] == '':
dir_name = dir_name.split('/')[-2]
else:
dir_name = dir_name.split('/')[-1]
dir_path = dir_path.replace(dir_name, '')
dir_path = path+'Output/'+dir_path
dir_path = f'{path}Output/{dir_path}'
os.makedirs(dir_path, exist_ok=True)
if file_name.endswith('.gz'):
file_name = file_name.rsplit('.', 2)[0]
Expand All @@ -18,58 +22,120 @@ def directory_navi(file_name, path, dir_name, dir_path):
return file_name, dir_name, dir_path


def create_output(path, file_name, unaligned_count, most_probable_V,
probability, dir_name, dir_path):
def rawincount(filename):
f = open(filename, 'rb')
bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024)
for _ in repeat(None)))

return sum(buf.count(b'\n') for buf in bufgen)


def region_count(dictionary, unaligned_count, temp_path, all_reads, mode):
if temp_path is not None and mode == 'paired':
unpaired = round((rawincount(f'{temp_path}bed.log')/all_reads), 2)
elif mode == 'unpaired':
unpaired = 'unpaired Reads'
no_V = rawincount(f'{temp_path}noOver.bed')
aligned_count = 100 - unaligned_count
count = sum([dictionary['V1'], dictionary['V2'], dictionary['V3'],
dictionary['V4'], dictionary['V5'], dictionary['V6'],
dictionary['V7'], dictionary['V8'], dictionary['V9'], no_V])
dictionary['V1'] = round((dictionary['V1'] / count) * aligned_count, 2)
dictionary['V2'] = round((dictionary['V2'] / count) * aligned_count, 2)
dictionary['V3'] = round((dictionary['V3'] / count) * aligned_count, 2)
dictionary['V4'] = round((dictionary['V4'] / count) * aligned_count, 2)
dictionary['V5'] = round((dictionary['V5'] / count) * aligned_count, 2)
dictionary['V6'] = round((dictionary['V6'] / count) * aligned_count, 2)
dictionary['V7'] = round((dictionary['V7'] / count) * aligned_count, 2)
dictionary['V8'] = round((dictionary['V8'] / count) * aligned_count, 2)
dictionary['V9'] = round((dictionary['V9'] / count) * aligned_count, 2)
no_V = round((no_V/count)*aligned_count, 2)
if aligned_count == 0 or (no_V/aligned_count) == 1:
most_probable_V = 'No variable Region'
else:
most_probable_V = [x[0].replace('V', '') for x in
sorted(list(dictionary.items()))
if ((x[1] / aligned_count) * 100) > 20]
most_probable_V = 'V' + ''.join(most_probable_V)
if most_probable_V == 'V':
most_probable_V = 'No variable Region'

return most_probable_V, dictionary, unpaired, no_V


def create_output(path, file_name, unaligned_count, dictionary, dir_name,
dir_path, temp_path, all_reads, mode):
file_name, dir_name, dir_path = directory_navi(file_name, path,
dir_name, dir_path)
unaligned_count = unaligned_count.split(' ')[1]
unaligned_count = float(unaligned_count.strip('()%'))
new_file = dir_path + dir_name + '.csv'
if mode == 'paired':
file_name = file_name.replace('_R1_001', '')
new_file = f'{dir_path}{dir_name}.csv'
most_probable_V, dictionary, unpaired, no_V = region_count(dictionary,
unaligned_count,
temp_path,
all_reads, mode)
if os.path.exists(new_file):
header = True
else:
header = False
with open(new_file, 'a', newline='') as o:
fieldnames = ['Read-file', 'Unaligned Reads [%]',
'Sequenced variable region', 'Probability [%]']
fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]',
'Not properly paired', 'Sequenced variable region', 'V1',
'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
'Not aligned to a variable region']
writer = csv.DictWriter(o, fieldnames=fieldnames)
if header is False:
writer.writeheader()
writer.writerow({
'Read-file': file_name,
'Unaligned Reads [%]': unaligned_count,
'Sequenced variable region': most_probable_V,
'Probability [%]': probability})
writer.writerow({'Read-file': file_name, 'Number of Reads': all_reads,
'Unaligned Reads [%]': unaligned_count,
'Not properly paired': unpaired,
'Sequenced variable region': most_probable_V,
'V1': dictionary['V1'], 'V2': dictionary['V2'],
'V3': dictionary['V3'], 'V4': dictionary['V4'],
'V5': dictionary['V5'], 'V6': dictionary['V6'],
'V7': dictionary['V7'], 'V8': dictionary['V8'],
'V9': dictionary['V9'],
'Not aligned to a variable region': no_V})


def print_output(dictionary, unaligned_count):
most_probable_V, dictionary = region_count(dictionary, unaligned_count,
temp_path=None, all_reads='',
mode='')
V1 = str(dictionary['V1'])
V2 = str(dictionary['V2'])
V3 = str(dictionary['V3'])
V4 = str(dictionary['V4'])
V5 = str(dictionary['V5'])
V6 = str(dictionary['V6'])
V7 = str(dictionary['V7'])
V8 = str(dictionary['V8'])
V9 = str(dictionary['V9'])
print(f'\n{str(unaligned_count)}% were unaligned.')
print(f'Of all the aligned Reads most were aligned to: {most_probable_V}')
print('The probabilities of all regions is as follows [%]:')
print(f'V1: {V1}\nV2: {V2}\nV3: {V3}\nV4: {V4}\nV5: {V5}\n \
V6: {V6}\nV7: {V7}\nV8: {V8}\nV9: {V9}\n')


def count(count, temp_path, file_name, file_type, path, dir_name, dir_path):
dictionary = {}
BED_path = temp_path + 'BED.bed'
Log_path = temp_path + 'bowtie2.log'
def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
dictionary = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0,
'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0}
BED_path = f'{temp_path}BED.bed'
Log_path = f'{temp_path}bowtie2.log'
with open(BED_path, 'r') as bed:
for line in bed:
line_list = line.split('\t')
if line_list[-1] not in dictionary:
dictionary[line_list[-1]] = 1
else:
dictionary[line_list[-1]] += 1
# counts all appearing variable Regions
# sorts the Dictionary with all variable regions for extracting the most
# probable one
dictionary = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
most_probable_V = dictionary[0]
most_probable_V_count = most_probable_V[1]
most_probable_V = most_probable_V[0].strip('\n')
probability = round((most_probable_V_count/count)*100, 2)
dictionary[line_list[-1].strip('\n')] += 1
# counts all appearing variable Regions
with open(Log_path, 'r') as log:
lines = log.readlines()
unaligned_count = lines[2].strip('\n\t ')
all_reads = int(lines[0].strip('\n\t ').split()[0])
unaligned_count = lines[-1].strip('\n\t ')
unaligned_count = unaligned_count.split()[0].replace('%', '')
unaligned_count = round((100 - float(unaligned_count)), 2)
if file_type is None:
print(file_name + ': ')
print(unaligned_count)
print('The sequenced variable Region in this file is '
+ most_probable_V + ' with a probability of '
+ str(probability) + '%.')
print_output(dictionary, unaligned_count)
else:
create_output(path, file_name, unaligned_count, most_probable_V,
probability, dir_name, dir_path)
create_output(path, file_name, unaligned_count, dictionary,
dir_name, dir_path, temp_path, all_reads, mode)
40 changes: 26 additions & 14 deletions vxdetector/VXdetector.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
#!/usr/bin/python

import argparse
import os
import interact_bowtie2
import interact_samtools
import interact_bedtools
import os
import Output_counter
import files_manager


def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
dir_path):
dir_path, mode, read2_file):
interact_bowtie2.buildbowtie2(path)
if file_type is not None:
# The Programm bowtie2 is used to align the Reads to a reference
# 16S database.
interact_bowtie2.mapbowtie2(file_path, path, temp_path, file_type)
aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file,
path, temp_path, mode,
file_type)
else:
interact_bowtie2.mapbowtie2(file_path, path, temp_path,
file_type=' -q')
# convert the .sam file from bowtie2 to .bam for compability
count = interact_samtools.SAMtoBAM(path, temp_path)
aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file,
path, temp_path, mode,
file_type=' -q')
# look which reads intersect with which variable Region
interact_bedtools.overlap(path, temp_path)
interact_bedtools.overlap(path, temp_path, aligned_path)
# counts the Variable Regions that are found with bedtools and prints the
# highest probable variable Region
Output_counter.count(count, temp_path, file_name, file_type, path,
dir_name, dir_path)
Output_counter.count(temp_path, file_name, file_type, path,
dir_name, dir_path, mode)


def main():
Expand All @@ -41,7 +41,6 @@ def main():
help=('Filepath of the fastq file containing the '
'sequencences of which the variable regions '
'are to be verified.'))
# /homes/jgroos/Downloads/study_raw_data_14513_062122-034504/per_sample_FASTQ/147774/5001_S229_L001_R1_001.fastq
args = parser.parse_args()
path = files_manager.get_lib()
temp_path = files_manager.tmp_dir(path, temp_path='')
Expand All @@ -51,28 +50,41 @@ def main():
if args.fasta_file is None:
for root, dirs, files in os.walk(args.dir_path, topdown=True):
for file in files:
mode = 'unpaired'
read2_file = ''
if '_R2_' in file:
continue
if any(elements in file for elements in fastq_ext):
file_name = file
read2_file = os.path.join(root, file.replace('_R1_',
'_R2_'))
rev_exists = os.path.exists(read2_file)
if '_R1_' in file_name and rev_exists is True:
mode = 'paired'
dir_name = root
file_path = os.path.join(root, file)
file_type = ' -q'
elif any(elements in file for elements in fasta_ext):
file_name = file
read2_file = os.path.join(root, file.replace('_R1_',
'_R2_'))
rev_exists = os.path.exists(read2_file)
if '_R1_' in file_name and rev_exists is True:
mode = 'paired'
dir_name = root
file_path = os.path.join(root, file)
file_type = ' -f'
else:
continue
workflow(path, temp_path, file_path, file_type, file_name,
dir_name, args.dir_path)
dir_name, args.dir_path, mode, read2_file)
if file_type is None:
print('There were no FASTA or FASTQ files with 16S sequences '
'in this directory')
else:
workflow(path, temp_path, args.fasta_file, file_type,
file_name='Your file', dir_name='', dir_path='')
file_name='Your file', dir_name='', dir_path='',
mode='unpaired', read2_file='')
# quit() #keeps the tmp folder for troubleshooting
files_manager.tmp_dir(path, temp_path)

Expand Down
34 changes: 23 additions & 11 deletions vxdetector/files_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,46 @@


def tmp_dir(path, temp_path):
"""erstellt tmp verzeichnis; sollte es auch deleten"""
# erstellt tmp verzeichnis, fals es schon existiert
# wird es geloescht
if os.path.exists(temp_path):
shutil.rmtree(temp_path)
else:
temp_path = tempfile.mkdtemp(suffix=None, prefix='tmp_files_',
dir=path) + '/'
temp_path = f"{tempfile.mkdtemp(prefix='tmp_files_', dir=path)}/"

return temp_path


def get_lib():
"""findet die richtigen ordner/dateien, damit das programm funktioniert"""
programm_path = os.path.dirname(__file__) + '/'
if os.path.exists(programm_path+'Output/'):
# findet die richtigen ordner/dateien, damit das programm funktioniert
programm_path = f'{os.path.dirname(__file__)}'
programm_path = f"{programm_path.rsplit('/', 1)[0]}/"
if os.path.exists(f'{programm_path}Output/'):
pass
else:
os.mkdir(programm_path+'Output/')
if os.path.exists(programm_path+'Indexed_bt2/'):
if os.path.exists(programm_path+'Indexed_bt2/annoted_ref.bed'):
os.mkdir(f'{programm_path}Output/')
if os.path.exists(f'{programm_path}Indexed_bt2/'):
if os.path.exists(f'{programm_path}Indexed_bt2/annoted_ref.bed'):
pass
else:
print('ERROR: It seems the 16S variable region boundary'
' reference file is missing.')
if os.path.exists(f'{programm_path}Indexed_bt2/85_otus.fasta'):
pass
else:
print('ERROR: It seems the Greengenes "85_otus.fasta"'
'file is missing.')
if os.path.exists(f'{programm_path}Indexed_bt2/85_otus_aligned.fasta'):
pass
else:
print('ERROR: It seems the 16S variable region boundary reference '
'file is missing.')
if os.path.exists(programm_path+'Indexed_bt2/85_otus.fasta'):
if os.path.exists(f'{programm_path}Indexed_bt2/85_otus.fasta'):
pass
else:
print('ERROR: It seems the Greengenes "85_otus.fasta" file is'
' missing.')
if os.path.exists(programm_path+'Indexed_bt2/85_otus_aligned.fasta'):
if os.path.exists(f'{programm_path}Indexed_bt2/85_otus_aligned.fasta'):
pass
else:
print('ERROR: It seems the Greengenes "85_otus_aligned.fasta" '
Expand Down
23 changes: 15 additions & 8 deletions vxdetector/interact_bedtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,21 @@
import os


def overlap(path, temp_path):
def no_overlap(path, temp_path, aligned_path):
bedtools_filepath = '/usr/bin/bedtools'
BAM_filepath = temp_path + 'bam_sorted.bam'
S_ref = path + 'Indexed_bt2/annoted_ref.bed'
S_ref = f'{path}Indexed_bt2/annoted_ref.bed'
noOver_filepath = f'{temp_path}noOver.bed'
os.system(f'{bedtools_filepath} intersect -v -f 0.5 -a \
{aligned_path} -b {S_ref} > {noOver_filepath}')


def overlap(path, temp_path, aligned_path):
bedtools_filepath = '/usr/bin/bedtools'
S_ref = f'{path}Indexed_bt2/annoted_ref.bed'
# reference "genome" Created by using a aligned greengenes databank and
# deleting all "-" while keeping track where the variable Regions are
BED_filepath = temp_path + 'BED.bed'
# Intersects the .bam file with the reference
cmd = bedtools_filepath + ' intersect -f 0.5 -wb -a ' + BAM_filepath + \
' -b ' + S_ref + ' -bed > ' + BED_filepath
os.system(cmd)
BED_filepath = f'{temp_path}BED.bed'
# Intersects the aligned bowtie2 Output file with the reference
os.system(f'{bedtools_filepath} intersect -f 0.5 -a {S_ref} -b \
{aligned_path} > {BED_filepath}')
no_overlap(path, temp_path, aligned_path)
Loading