Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
.coverage
coverage.lcov
bowtie2.1.bt2
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might want to be more flexible (as you don't know how many parts will be generated) by using wildcards, i.e. bowtie2.*.bt2

bowtie2.2.bt2
bowtie2.3.bt2
bowtie2.4.bt2
bowtie2.rev.1.bt2
bowtie2.rev.2.bt2
29 changes: 15 additions & 14 deletions Indexed_bt2/code_for_reference/create_annoted_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,18 @@
from os import path as p


def write(region, seq_chrom, annoted_ref):
region = {'V1_start': '', 'V1_end': '',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the reason why you "initialize" the dict with "empty" strings. Can't you check for the presence of the key in downstream code?
e.g. if region['V1_start'] != '': --> if 'V1_start' in region: and once you set the value, ensure that it is not empty?!

'V2_start': '', 'V2_end': '',
'V3_start': '', 'V3_end': '',
'V4_start': '', 'V4_end': '',
'V5_start': '', 'V5_end': '',
'V6_start': '', 'V6_end': '',
'V7_start': '', 'V7_end': '',
'V8_start': '', 'V8_end': '',
'V9_start': '', 'V9_end': ''}


def write(seq_chrom, annoted_ref):
with open(annoted_ref, 'a') as t:
if region['V1_start'] != '':
for counter, key in enumerate(region):
Expand All @@ -20,7 +31,7 @@ def write(region, seq_chrom, annoted_ref):
t.write(words)


def index(line, region):
def index(line):
boundary = {'V1_start': 189, 'V1_end': 471,
'V2_start': 485, 'V2_end': 1867,
'V3_start': 1915, 'V3_end': 2231,
Expand All @@ -36,29 +47,19 @@ def index(line, region):
list1 = ''.join(str(e) for e in list1)
list1 = list1.replace('-', '')
region[key] = int(list1.index('1')) + 1
return region


def main():
file_ = f'{p.dirname(p.dirname(__file__))}/85_otus_aligned.fasta'
annoted_ref = f'{p.dirname(p.dirname(__file__))}/annoted_ref.bed'
with open(file_, 'r') as f:
for line in f:
region = {'V1_start': '', 'V1_end': '',
'V2_start': '', 'V2_end': '',
'V3_start': '', 'V3_end': '',
'V4_start': '', 'V4_end': '',
'V5_start': '', 'V5_end': '',
'V6_start': '', 'V6_end': '',
'V7_start': '', 'V7_end': '',
'V8_start': '', 'V8_end': '',
'V9_start': '', 'V9_end': ''}
if line.startswith('>'):
seq_chrom = line[1:]
seq_chrom = seq_chrom.strip('\n')
else:
index(line, region)
write(region, seq_chrom, annoted_ref)
index(line)
write(seq_chrom, annoted_ref)


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ Requirements:
bowtie2
samtools
bedtools
Module: pandas
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ dependencies:
- flake8
- nose
- coverage >= 6 # to ensure lcov option is available
- pandas
45 changes: 45 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

"""Packaging script for VXdetector.

Run e.g. ``python setup.py sdist`` to build a source distribution.
"""

from setuptools import find_packages, setup

# Trove classifiers — must match the official PyPI list exactly, otherwise
# uploads are rejected.  NOTE: the original used
# 'Development Status :: 1 - Pre-Alpha', which is not a valid classifier;
# the valid value for a pre-alpha project is '2 - Pre-Alpha'.
classifiers = [
    'Development Status :: 2 - Pre-Alpha',
    'License :: OSI Approved :: BSD License',
    'Environment :: Console',
    'Topic :: Software Development :: Libraries :: Application Frameworks',
    'Topic :: Scientific/Engineering',
    'Topic :: Scientific/Engineering :: Bio-Informatics',
    'Programming Language :: Python',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.3',
    'Programming Language :: Python :: 3.4',
    'Operating System :: Unix',
    'Operating System :: POSIX',
    'Operating System :: MacOS :: MacOS X',
    'Operating System :: Microsoft :: Windows']


description = 'VXdetector'


setup(name='vxdetector',
      version='1.0.0',
      license='BSD',
      # description was defined above but never passed in the original —
      # wire it up so the package metadata actually carries it.
      description=description,
      classifiers=classifiers,
      author="Johannes Groos",
      author_email="Johannes.Groos@bio.uni-giessen.de",
      # NOTE(review): 'maintainer' is expected to be a person's name, not a
      # URL — confirm whether this should be the lab name instead.
      maintainer="http://jlab.bio",
      maintainer_email="stefan.m.janssen@gmail.com",
      url='https://github.com/jlab/algorithm_vxdetector/',
      test_suite='nose.collector',
      packages=find_packages(),
      install_requires=[
          # 'click >= 6',
          # 'scikit-bio >= 0.4.0',
      ],
      # extras_require={'test': ["nose", "pep8", "flake8"],
      #                 'coverage': ["coverage"]}
      )
134 changes: 56 additions & 78 deletions vxdetector/Output_counter.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
#!/usr/bin/python

import os
import csv
import pandas as pd
from itertools import (takewhile, repeat)

regions = {'V1': 0, 'V2': 0, 'V3': 0, 'V4': 0, 'V5': 0,
'V6': 0, 'V7': 0, 'V8': 0, 'V9': 0}

new_row = pd.DataFrame({'Read-file': [''], 'Number of Reads': [0],
'Unaligned Reads [%]': [0],
'Not properly paired': ['unpaired'],
'Sequenced variable region': ['V'],
'V1': [0], 'V2': [0], 'V3': [0],
'V4': [0], 'V5': [0], 'V6': [0],
'V7': [0], 'V8': [0], 'V9': [0],
'Not aligned to a variable region': [0]})
# declares the dictionary containing all variable regions as
# a global variable. That way it can be easily accessed and modified


def directory_navi(file_name, path, dir_name, dir_path):
dir_path = dir_name.replace(dir_path, '', 1)
if dir_name.endswith('/'):
dir_name = dir_name.rstrip('/')
# only leaves the part in the directory tree between
# the file and the original given directory path
dir_name = os.path.basename(dir_name)
Expand All @@ -24,104 +33,71 @@ def directory_navi(file_name, path, dir_name, dir_path):
file_name = file_name.rsplit('.', 2)[0]
else:
file_name = file_name.rsplit('.', 1)[0]

return file_name, dir_name, dir_path


def rawincount(filename):
    """Count the newline characters (b'\\n') in *filename*.

    Reads the file in 1 MiB binary chunks via the raw (unbuffered) stream,
    so arbitrarily large files are counted without loading them into memory.

    :param filename: path of the file to count lines in
    :return: number of newline bytes found (int)
    """
    # 'with' guarantees the file descriptor is closed even if a read raises;
    # the original opened the file and never closed it (resource leak).
    with open(filename, 'rb') as f:
        bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024)
                                         for _ in repeat(None)))
        return sum(buf.count(b'\n') for buf in bufgen)


def region_count(unaligned_count, temp_path, mode, all_reads):
def region_count(temp_path, mode):
if mode == 'paired':
unpaired = round(rawincount(f'{temp_path}bed.log')/all_reads, 2)
new_row['Not properly paired'] = rawincount(
f'{temp_path}bed.log') \
/ new_row['Number of Reads']
# counts the number of error messages given by bedtools
# during bam to bed conversion; each error message represents
# one improperly paired read
elif mode == 'unpaired':
unpaired = 'unpaired Reads'
no_V = rawincount(f'{temp_path}noOver.bed')
new_row['Not aligned to a variable region'] = rawincount(
f'{temp_path}noOver.bed')
# counts the number of reads not mapped to any variable region
aligned_count = 100 - unaligned_count
count = sum([regions['V1'], regions['V2'], regions['V3'],
regions['V4'], regions['V5'], regions['V6'],
regions['V7'], regions['V8'], regions['V9'], no_V])
for key in regions:
regions[key] = (regions[key] / count) * aligned_count
no_V = (no_V/count)*aligned_count
aligned_count = 100 - new_row.iat[0, 2]
count = sum(new_row.iloc[0, 5:])
for column in new_row.columns[5:]:
new_row[column] = (new_row[column] / count) * aligned_count
# Calculates the percentage of reads mapped to either a specific
# region or no region
if aligned_count == 0 or (no_V/aligned_count) == 1:
most_probable_V = 'No variable Region'
if aligned_count == 0 or (new_row.iat[0, 14]/aligned_count) == 1:
new_row['Sequenced variable region'] = 'No variable Region'
else:
most_probable_V = [x[0].replace('V', '') for x in
sorted(list(regions.items()))
if ((x[1] / aligned_count) * 100) > 20]
most_probable_V = 'V' + ''.join(most_probable_V)
if most_probable_V == 'V':
most_probable_V = 'No variable Region'
most_probable_V = [x.replace('V', '') for x in
new_row.columns[5:14]
if ((new_row.at[0, x] / aligned_count) * 100) > 20]
new_row['Sequenced variable region'] = 'V' + ''.join(most_probable_V)
if new_row.iat[0, 4] == 'V':
new_row['Sequenced variable region'] = 'No variable Region'
# finds the variable regions with the highest probability
# any region having a percentage of 20% or higher will be listed.
# The percentage is calculated within the regions.

return most_probable_V, unpaired, no_V


def create_output(path, file_name, unaligned_count, dir_name,
dir_path, temp_path, all_reads, mode):
def create_row(path, file_name, dir_name, dir_path, temp_path, mode):
file_name, dir_name, dir_path = directory_navi(file_name, path,
dir_name, dir_path)
if mode == 'paired':
file_name = file_name.replace('_R1_001', '')
new_row['Read-file'] = file_name.replace('_R1_001', '')
new_file = f'{dir_path}/{dir_name}.csv'
# The new file is named after the directory containing the reads
most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
mode, all_reads)
if os.path.exists(new_file):
header = True
else:
header = False
# The header will only be written if it wasn't written before
foo = {'file_name': file_name, 'all_reads': all_reads,
'unaligned_count': unaligned_count, 'unpaired': unpaired,
'no_V': no_V, 'most_probable_V': most_probable_V}
foo.update(regions)
with open(new_file, 'a', newline='') as o:
fieldnames = ['Read-file', 'Number of Reads', 'Unaligned Reads [%]',
'Not properly paired', 'Sequenced variable region', 'V1',
'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
'Not aligned to a variable region']
writer = csv.DictWriter(o, fieldnames=fieldnames)
if header is False:
writer.writeheader()
for key in foo:
if isinstance(foo[key], float):
foo[key] = round(foo[key], 2)
writer.writerow({'Read-file': foo['file_name'], 'Number of Reads':
foo['all_reads'], 'Unaligned Reads [%]':
foo['unaligned_count'], 'Not properly paired':
foo['unpaired'], 'Sequenced variable region':
foo['most_probable_V'], 'V1': foo['V1'],
'V2': foo['V2'], 'V3': foo['V3'],
'V4': foo['V4'], 'V5': foo['V5'],
'V6': foo['V6'], 'V7': foo['V7'],
'V8': foo['V8'], 'V9': foo['V9'],
'Not aligned to a variable region': foo['no_V']})


def print_output(unaligned_count, temp_path, mode):
most_probable_V, unpaired, no_V = region_count(unaligned_count, temp_path,
mode, all_reads='')
print(f'\n{str(unaligned_count)}% were unaligned.')
print(f'Of all the aligned Reads most were aligned to: {most_probable_V}')
region_count(temp_path, mode)
for column in new_row:
if isinstance(new_row.at[0, column], float):
new_row.at[0, column] = round(new_row.at[0, column], 2)
return new_file


def print_output(temp_path, mode):
region_count(temp_path, mode)
print(f"\n{round(new_row.iat[0, 2], 2)}% were unaligned.")
print(f"Of all the aligned Reads most were aligned to: \
{new_row.iat[0, 4]}")
print('The probabilities of all regions is as follows [%]:')
print('\n'.join(':'.join((key, str(round(val, 2))))
for (key, val) in regions.items()))
print(f'\nAnd {round(no_V, 2)}% were not mapped to any variable region\n')
print('\n'.join(':'.join((column, str(round(new_row.at[0, column], 2))))
for column in new_row.columns[5:14]))
print(f"\nAnd {round(new_row.iat[0, -1], 2)}% were not mapped \
to any variable region\n")


def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
Expand All @@ -130,20 +106,22 @@ def count(temp_path, file_name, file_type, path, dir_name, dir_path, mode):
with open(BED_path, 'r') as bed:
for line in bed:
line_list = line.split('\t')
regions[line_list[-1].strip('\n')] += 1
new_row[line_list[-1].strip('\n')] += 1
# counts all appearing variable Regions
with open(Log_path, 'r') as log:
lines = log.readlines()
all_reads = int(lines[0].strip('\n\t ').split()[0])
new_row['Number of Reads'] = int(lines[0].strip('\n\t ').split()[0])
unaligned_count = lines[-1].strip('\n\t ')
# Reads the bowtie2 stdout which is normally seen in the terminal
unaligned_count = unaligned_count.split()[0].replace('%', '')
unaligned_count = 100 - float(unaligned_count)
new_row['Unaligned Reads [%]'] = 100 - float(unaligned_count)
if file_type is None:
print_output(unaligned_count, temp_path, mode)
print_output(temp_path, mode)
# should file_type be None only a single file was given
# therefore output will be printed in the terminal rather
# than in a file
else:
create_output(path, file_name, unaligned_count,
dir_name, dir_path, temp_path, all_reads, mode)
new_file = create_row(path, file_name, dir_name, dir_path,
temp_path, mode)
if file_type is not None:
return new_row, new_file
Loading