Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
.coverage
coverage.lcov
bowtie2.1.bt2
bowtie2.2.bt2
bowtie2.3.bt2
bowtie2.4.bt2
bowtie2.rev.1.bt2
bowtie2.rev.2.bt2
29 changes: 15 additions & 14 deletions Indexed_bt2/code_for_reference/create_annoted_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,18 @@
from os import path as p


def write(region, seq_chrom, annoted_ref):
region = {'V1_start': '', 'V1_end': '',
'V2_start': '', 'V2_end': '',
'V3_start': '', 'V3_end': '',
'V4_start': '', 'V4_end': '',
'V5_start': '', 'V5_end': '',
'V6_start': '', 'V6_end': '',
'V7_start': '', 'V7_end': '',
'V8_start': '', 'V8_end': '',
'V9_start': '', 'V9_end': ''}


def write(seq_chrom, annoted_ref):
with open(annoted_ref, 'a') as t:
if region['V1_start'] != '':
for counter, key in enumerate(region):
Expand All @@ -20,7 +31,7 @@ def write(region, seq_chrom, annoted_ref):
t.write(words)


def index(line, region):
def index(line):
boundary = {'V1_start': 189, 'V1_end': 471,
'V2_start': 485, 'V2_end': 1867,
'V3_start': 1915, 'V3_end': 2231,
Expand All @@ -36,29 +47,19 @@ def index(line, region):
list1 = ''.join(str(e) for e in list1)
list1 = list1.replace('-', '')
region[key] = int(list1.index('1')) + 1
return region


def main():
file_ = f'{p.dirname(p.dirname(__file__))}/85_otus_aligned.fasta'
annoted_ref = f'{p.dirname(p.dirname(__file__))}/annoted_ref.bed'
with open(file_, 'r') as f:
for line in f:
region = {'V1_start': '', 'V1_end': '',
'V2_start': '', 'V2_end': '',
'V3_start': '', 'V3_end': '',
'V4_start': '', 'V4_end': '',
'V5_start': '', 'V5_end': '',
'V6_start': '', 'V6_end': '',
'V7_start': '', 'V7_end': '',
'V8_start': '', 'V8_end': '',
'V9_start': '', 'V9_end': ''}
if line.startswith('>'):
seq_chrom = line[1:]
seq_chrom = seq_chrom.strip('\n')
else:
index(line, region)
write(region, seq_chrom, annoted_ref)
index(line)
write(seq_chrom, annoted_ref)


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ Requirements:
bowtie2
samtools
bedtools
Module: pandas
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dependencies:
- flake8
- nose
- coverage >= 6 # to ensure lcov option is available
- pandas
134 changes: 56 additions & 78 deletions vxdetector/Output_counter.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
#!/usr/bin/python

import os
import csv
import pandas as pd
from itertools import (takewhile, repeat)

regions = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0,
'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0}

new_row = pd.DataFrame({'Read-file': [''], 'Number of Reads': [0],
'Unaligned Reads [%]': [0],
'Not properly paired': ['unpaired'],
'Sequenced variable region': ['V'],
'V1': [0], 'V2': [0], 'V3': [0],
'V4': [0], 'V5': [0], 'V6': [0],
'V7': [0], 'V8': [0], 'V9': [0],
'Not aligned to a variable region': [0]})
# declares the dictionary containing all variable regions as
# a global variable. That way it can be easily accessed and modified


def directory_navi(file_name, path, dir_name, dir_path):
dir_path = dir_name.replace(dir_path, '', 1)
if dir_name.endswith('/'):
dir_name = dir_name.rstrip('/')
# only leaves the part in the directory tree between
# the file and the original given directory path
dir_name = os.path.basename(dir_name)
Expand All @@ -24,104 +33,71 @@ def directory_navi(file_name, path, dir_name, dir_path):
file_name = file_name.rsplit('.', 2)[0]
else:
file_name = file_name.rsplit('.', 1)[0]

return file_name, dir_name, dir_path


def rawincount(filename):
    """Count the newline characters (b'\\n') in *filename*.

    Reads the file in 1 MiB binary chunks via the raw (unbuffered) stream,
    so arbitrarily large files are counted without loading them into memory.

    Parameters
    ----------
    filename : str
        Path of the file whose lines are counted.

    Returns
    -------
    int
        Number of b'\\n' bytes found (i.e. newline-terminated lines).
    """
    # Fix: the original opened the file and never closed it (handle leak);
    # 'with' guarantees the descriptor is released even if reading raises.
    with open(filename, 'rb') as f:
        # takewhile stops at the first empty read, i.e. EOF.
        bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024)
                                         for _ in repeat(None)))
        return sum(buf.count(b'\n') for buf in bufgen)


def region_count(unaligned_count, temp_path, mode, all_reads):
def region_count(temp_path, mode):
if mode == 'paired':
unpaired = round(rawincount(f'{temp_path}bed.log')/all_reads, 2)
new_row['Not properly paired'] = rawincount(
f'{temp_path}bed.log') \
/ new_row['Number of Reads']
# counts the amount of error messages given by bedtools
# during bam to bed conversion each error message reprensents
# one improperly paired read
elif mode == 'unpaired':
unpaired = 'unpaired Reads'
no_V = rawincount(f'{temp_path}noOver.bed')
new_row['Not aligned to a variable region'] = rawincount(
f'{temp_path}noOver.bed')
# counts the number of reads not mapped to any variable region
aligned_count = 100 - unaligned_count
count = sum([regions['V1'], regions['V2'], regions['V3'],
regions['V4'], regions['V5'], regions['V6'],
regions['V7'], regions['V8'], regions['V9'], no_V])
for key in regions:
regions[key] = (regions[key] / count) * aligned_count
no_V = (no_V/count)*aligned_count
aligned_count = 100 - new_row.iat[0, 2]
count = sum(new_row.iloc[0, 5:])
for column in new_row.columns[5:]:
new_row[column] = (new_row[column] / count) * aligned_count
# Calculates the percantage of reads mapped to either a specific
# region or no region
if aligned_count == 0 or (no_V/aligned_count) == 1:
most_probable_V = 'No variable Region'
if aligned_count == 0 or (new_row.iat[0, 14]/aligned_count) == 1:
new_row['Sequenced variable region'] = 'No variable Region'
else:
most_probable_V = [x[0].replace('V', '') for x in
sorted(list(regions.items()))
if ((x[1] / aligned_count) * 100) > 20]
most_probable_V = 'V' + ''.join(most_probable_V)
if most_probable_V == 'V':
most_probable_V = 'No variable Region'
most_probable_V = [x.replace('V', '') for x in
new_row.columns[5:14]
if ((new_row.at[0, x] / aligned_count) * 100) > 20]
new_row['Sequenced variable region'] = 'V' + ''.join(most_probable_V)
if new_row.iat[0, 4] == 'V':
new_row['Sequenced variable region'] = 'No variable Region'
# finds the variable regions with the highest probability
# any region having a percentage of 20% or higher will be listed.
# The percentage is calculated within the regions.

return most_probable_V, unpaired, no_V


def create_output(path, file_name, unaligned_count, dir_name,
dir_path, temp_path, all_reads, mode):
def create_row(path, file_name, dir_name, dir_path, temp_path, mode):
file_name, dir_name, dir_path = directory_navi(file_name, path,
dir_name, dir_path)
if mode == 'paired':
file_name = file_name.replace('_R1_001', '')
new_row['Read-file'] = file_name.replace('_R1_001', '')
new_file = f'{dir_path}/{dir_name}.csv'
# The new file is named after the directory containing the reads
most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
mode, all_reads)
if os.path.exists(new_file):
header = True
else:
header = False
# The header will only be written if it wasnt written before
foo = {'file_name': file_name, 'all_reads': all_reads,
'unaligned_count': unaligned_count, 'unpaired': unpaired,
'no_V': no_V, 'most_probable_V': most_probable_V}
foo.update(regions)
with open(new_file, 'a', newline='') as o:
fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]',
'Not properly paired', 'Sequenced variable region', 'V1',
'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
'Not aligned to a variable region']
writer = csv.DictWriter(o, fieldnames=fieldnames)
if header is False:
writer.writeheader()
for key in foo:
if isinstance(foo[key], float):
foo[key] = round(foo[key], 2)
writer.writerow({'Read-file': foo['file_name'], 'Number of Reads':
foo['all_reads'], 'Unaligned Reads [%]':
foo['unaligned_count'], 'Not properly paired':
foo['unpaired'], 'Sequenced variable region':
foo['most_probable_V'], 'V1': foo['V1'],
'V2': foo['V2'], 'V3': foo['V3'],
'V4': foo['V4'], 'V5': foo['V5'],
'V6': foo['V6'], 'V7': foo['V7'],
'V8': foo['V8'], 'V9': foo['V9'],
'Not aligned to a variable region': foo['no_V']})


def print_output(unaligned_count, temp_path, mode):
most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
mode, all_reads='')
print(f'\n{str(unaligned_count)}% were unaligned.')
print(f'Of all the aligned Reads most were aligned to: {most_probable_V}')
region_count(temp_path, mode)
for column in new_row:
if isinstance(new_row.at[0, column], float):
new_row.at[0, column] = round(new_row.at[0, column], 2)
return new_file


def print_output(temp_path, mode):
    """Print the per-region alignment summary for a single read file.

    Calls region_count() to fill the module-level ``new_row`` DataFrame,
    then prints the unaligned percentage, the most probable variable
    region(s), the per-region percentages (columns 5..13, i.e. V1-V9),
    and the percentage not mapped to any variable region.

    Parameters
    ----------
    temp_path : str
        Directory prefix of the temporary bowtie2/bedtools output files.
    mode : str
        Either 'paired' or 'unpaired'; forwarded to region_count().
    """
    # region_count() mutates the global ``new_row`` in place; the prints
    # below read its cells by position (iat) and by column label (at).
    region_count(temp_path, mode)
    # new_row.iat[0, 2] is the 'Unaligned Reads [%]' column.
    print(f"\n{round(new_row.iat[0, 2], 2)}% were unaligned.")
    # NOTE(review): the backslash continues the f-string literal itself,
    # so any leading whitespace on the next physical line would appear in
    # the printed text — confirm the continuation line is unindented.
    print(f"Of all the aligned Reads most were aligned to: \
{new_row.iat[0, 4]}")
    print('The probabilities of all regions is as follows [%]:')
    # Columns 5..13 are the nine variable regions V1-V9.
    print('\n'.join(':'.join((column, str(round(new_row.at[0, column], 2))))
          for column in new_row.columns[5:14]))
    # The last column is 'Not aligned to a variable region'.
    print(f"\nAnd {round(new_row.iat[0, -1], 2)}% were not mapped \
to any variable region\n")


def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
Expand All @@ -130,20 +106,22 @@ def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
with open(BED_path, 'r') as bed:
for line in bed:
line_list = line.split('\t')
regions[line_list[-1].strip('\n')] += 1
new_row[line_list[-1].strip('\n')] += 1
# counts all appearing variable Regions
with open(Log_path, 'r') as log:
lines = log.readlines()
all_reads = int(lines[0].strip('\n\t ').split()[0])
new_row['Number of Reads'] = int(lines[0].strip('\n\t ').split()[0])
unaligned_count = lines[-1].strip('\n\t ')
# Reads the bowtie2 stdout which is normally seen in the terminal
unaligned_count = unaligned_count.split()[0].replace('%', '')
unaligned_count = 100 - float(unaligned_count)
new_row['Unaligned Reads [%]'] = 100 - float(unaligned_count)
if file_type is None:
print_output(unaligned_count, temp_path, mode)
print_output(temp_path, mode)
# should file_type be None only a single file was given
# therefore output will be printed in the terminal rather
# than in a file
else:
create_output(path, file_name, unaligned_count,
dir_name, dir_path, temp_path, all_reads, mode)
new_file = create_row(path, file_name, dir_name, dir_path,
temp_path, mode)
if file_type is not None:
return new_row, new_file
66 changes: 59 additions & 7 deletions vxdetector/VXdetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,41 @@
import interact_bedtools
import Output_counter
import files_manager
import pandas as pd

output = pd.DataFrame({'Read-file': [], 'Number of Reads': [],
'Unaligned Reads [%]': [],
'Not properly paired': [],
'Sequenced variable region': [],
'V1': [], 'V2': [], 'V3': [],
'V4': [], 'V5': [], 'V6': [],
'V7': [], 'V8': [], 'V9': [],
'Not aligned to a variable region': []})

def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
dir_path, mode, read2_file):

def do_statistic():
    """Prepend average and standard-deviation rows to the global output.

    Computes the column-wise mean and standard deviation over all numeric
    columns of the module-level ``output`` DataFrame, labels them
    'Average' and 'Standard deviation', fills the average row's
    'Sequenced variable region' cell with the modal region value(s), and
    rebinds ``output`` so the two statistic rows come first.
    """
    global output
    # Mean over numeric columns only; transpose the resulting Series into
    # a one-row DataFrame so it can be concatenated with ``output``.
    avarage = output.mean(numeric_only=True).round(2).to_frame().T
    avarage['Read-file'] = ['Average']
    # Column 4 is 'Sequenced variable region'; mode() may return several
    # rows when multiple regions are equally frequent.
    region = (output.iloc[:, [4]].mode().values)
    # str(r) renders each one-element row array as e.g. "['V34']"; the
    # replace() calls below strip the quotes and brackets from that repr.
    region = ' / '.join(str(r) for r in region)
    region = region.replace('\'', '')
    region = region.replace('[', '')
    region = region.replace(']', '')
    avarage['Sequenced variable region'] = region
    # Same construction for the standard deviation row.
    std_dev = output.std(numeric_only=True).round(2).to_frame().T
    std_dev['Read-file'] = ['Standard deviation']
    statistic = pd.concat([avarage, std_dev], axis=0)
    # Reorder the statistic columns to match the output layout exactly.
    statistic = statistic[['Read-file', 'Number of Reads',
                           'Unaligned Reads [%]', 'Not properly paired',
                           'Sequenced variable region', 'V1', 'V2', 'V3',
                           'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
                           'Not aligned to a variable region']]
    # Statistics rows go on top of the per-file rows.
    output = pd.concat([statistic, output], axis=0)


def workflow(path, temp_path, file_path, file_type, old_file_name,
file_name, dir_name, dir_path, mode, read2_file):
interact_bowtie2.buildbowtie2(path)
if file_type is not None:
aligned_path = interact_bowtie2.mapbowtie2(file_path, read2_file,
Expand All @@ -23,8 +54,22 @@ def workflow(path, temp_path, file_path, file_type, file_name, dir_name,
# 16S database.
interact_bedtools.overlap(path, temp_path, aligned_path)
# look which reads intersect with which variable Region
Output_counter.count(temp_path, file_name, file_type, path,
dir_name, dir_path, mode)
if file_type is not None:
global output
new_row, new_file = Output_counter.count(temp_path, file_name,
file_type, path, dir_name,
dir_path, mode)
if new_file != old_file_name and old_file_name is not None:
output.sort_values(by=['Read-file'], inplace=True)
do_statistic()
output.to_csv(old_file_name, index=False)
output = output.iloc[0:0]
output = pd.concat([output, new_row], axis=0)
old_file_name = new_file
return new_file, old_file_name
else:
Output_counter.count(temp_path, file_name, file_type, path,
dir_name, dir_path, mode)
# counts the Variable Regions that are found with bedtools and prints the
# highest probable variable Region (single file) or
# writes a new file (directory)
Expand All @@ -50,6 +95,7 @@ def main():
file_type = None
fasta_ext = ('.fasta', '.fa', '.ffa', '.ffn', '.faa', '.frn')
fastq_ext = ('.fq', '.fastq',)
old_file_name = None
if args.fasta_file is None:
# If a directory was given as input the programm will walk through
# that directory and search for forward reads.
Expand Down Expand Up @@ -82,16 +128,22 @@ def main():
file_type = ' -f'
else:
continue
workflow(path, temp_path, file_path, file_type, file_name,
dir_name, args.dir_path, mode, read2_file)
new_file, old_file_name = workflow(path, temp_path, file_path,
file_type, old_file_name,
file_name, dir_name,
args.dir_path, mode,
read2_file)
if file_type is None:
print('There were no FASTA or FASTQ files with 16S sequences '
'in this directory')
else:
workflow(path, temp_path, args.fasta_file, file_type,
workflow(path, temp_path, args.fasta_file, file_type, old_file_name,
file_name='Your file', dir_name='', dir_path='',
mode='unpaired', read2_file='')
files_manager.tmp_dir(path, temp_path)
if old_file_name is not None:
do_statistic()
output.to_csv(old_file_name, index=False)


if __name__ == '__main__':
Expand Down