In [1]:
from skbio import TreeNode
import gzip
from collections import Counter, deque
import time
import numpy as np
import pandas as pd
def extract_consenus(file):
    """Lazily parse a FASTA file into ``(record_id, sequence)`` tuples.

    The first line of the file is always treated as a header line: its first
    character is dropped and the remainder, stripped of surrounding
    whitespace, becomes the record id. Every later line starting with ``>``
    begins a new record; all other lines are stripped and concatenated to
    form the sequence of the current record.

    Parameters
    ----------
    file : str or path-like
        Path of the (uncompressed) FASTA file to read.

    Yields
    ------
    tuple of (str, str)
        The record id and its full sequence, in file order. An empty file
        yields nothing.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed / garbage collected).
    with open(file) as f:
        header = f.readline()
        if not header:
            return  # empty file: nothing to yield

        g_id = header[1:].strip()
        chunks = []  # pieces of the current sequence, joined once per record
        for line in f:
            if line.startswith('>'):
                yield (g_id, "".join(chunks))
                g_id = line[1:].strip()
                chunks = []
            else:
                chunks.append(line.strip())

        # Emit the final record: no following header exists to trigger it.
        yield (g_id, "".join(chunks))

def extract_sample_read_counts(file):
    """Count how many times each line at position 1 of every 4-line group occurs.

    The file is opened as gzip-compressed text. Only lines whose zero-based
    index satisfies ``idx % 4 == 1`` are counted (in a standard 4-line FASTQ
    record this is the sequence line); each is stripped of surrounding
    whitespace before counting.

    Parameters
    ----------
    file : str or path-like
        Path of the gzip-compressed file to read.

    Returns
    -------
    collections.Counter
        Mapping of stripped line text to its number of occurrences.
    """
    s_count = Counter()
    # Use a context manager so the gzip handle is always closed.
    with gzip.open(file, 'rt') as f:
        for idx, line in enumerate(f):
            if idx % 4 == 1:
                s_count[line.strip()] += 1

    return s_count

In [13]:
# Load every (header, sequence) record of the consensus FASTA into memory so
# the records can be traversed more than once.
consenus_genome = deque(
    extract_consenus("2021-08-23_21-01-26-all_stringent_only.fas")
)

# Split the records into two parallel columns.
genomes = [record[0] for record in consenus_genome]
sequences = [record[1] for record in consenus_genome]

d = dict(genome=genomes, sequences=sequences)
consensus_genome_df = pd.DataFrame(d)

# Replace underscores with spaces in the genome names.
# NOTE(review): presumably so the names match the tip names of the tree
# loaded later in the notebook -- confirm.
consensus_genome_df['genome'] = consensus_genome_df['genome'].str.replace('_', ' ')

# The following is a POC for a single sample

In [14]:
# sample 38879
# Forward (R1) reads: keep (count, sequence) pairs for every sequence that
# occurs at least twice in the file.
r1 = [
    (n_copies, read)
    for read, n_copies in extract_sample_read_counts(
        "SEARCH-38879__E0001197__P24__210924_A01535_0019_BHT7MHDSX2__S736_L004_R1_001.fastq.gz"
    ).items()
    if n_copies >= 2
]

# Reverse (R2) reads: same filter as for R1.
r2 = [
    (n_copies, seq)
    for seq, n_copies in extract_sample_read_counts(
        "SEARCH-38879__E0001197__P24__210924_A01535_0019_BHT7MHDSX2__S736_L004_R2_001.fastq.gz"
    ).items()
    if n_copies >= 2
]

# Tree file that the amplicons are inserted into later in the notebook.
tree = TreeNode.read(
    "2021-08-23_21-01-26-all_stringent_refs_hist.trimmed.aln.rooted.treefile"
)

## Get the genomes containing each amplicon and stage the feature-table data

In [16]:
t1 = time.process_time()
# Amplicons already searched -> set of genome names containing them (the set
# is empty when the amplicon matched no genome). Prevents repeating the
# expensive substring search for an amplicon that shows up again.
observed = {}
tips_per_read = []   # (amplicon, {genomes containing it}) for amplicons with >= 1 hit
f_table_staged = []  # (sample_id, amplicon, count) rows for the feature table

# TODO: will need to iterate over all samples
# TODO: will need to create function to get the r1 and r2 reads per sample
for count, read in r1 + r2:
    # Just to test: the lower the threshold => more amplicons => much slower.
    if count <= 10000:
        continue

    # Add the sample/amplicon pair to the feature table.
    f_table_staged.append(("sample_id_38879", read, count))

    # Amplicon was already searched (e.g. present in both R1 and R2, or in
    # another sample): do not search again or add a duplicate entry.
    if read in observed:
        continue

    # Literal substring search; regex=False avoids interpreting the read as
    # a regular expression.
    found = consensus_genome_df.loc[
        consensus_genome_df.sequences.str.contains(read, regex=False)
    ]
    found_genomes = set(found.genome)
    observed[read] = found_genomes

    # Currently discarding the amplicon if it's not found in at least 1 genome.
    # Should we keep it???
    if found_genomes:
        tips_per_read.append((read, found_genomes))
t2 = time.process_time()
print(t2-t1)

13.959846821


In [17]:
# Display the (amplicon, set of genome names) pairs collected by the search loop.
tips_per_read

[('CCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAA',
  {'Consensus SEARCH-18283  D101855  M13  210426 A00953 0286 BH7KLNDRXY  002.trimmed.sorted.pileup.consensus threshold 0.5 quality 20',
   'Consensus SEARCH-20991  D101800  K03  210608 A00953 0321 BH7L5LDSX2  001.trimmed.sorted.pileup.consensus threshold 0.5 quality 20',
   'Consensus SEARCH-10202  D103683  P07  210301 A00953 0244 AHYHYWDSXY  002.trimmed.sorted.pileup.consensus threshold 0.5 quality 20',
   'Consensus SEARCH-15074  D104940  H18  210323 A00953 0264 AH2VVJDSX2  004.trimmed.sorted.pileup.consensus threshold 0.5 quality 20',
   'Consensus SEARCH-11059  D103683  I23  210301 A00953 0244 AHYHYWDSXY  002.trimmed.sorted.pileup.consensus threshold 0.5 quality 20',
   'Consensus SEARCH-15115  D104940  E24  210323 A00953 0264 AH2VVJDSX2  004.trimmed.sorted.pileup.consensus threshold 0.5 quality 20',
   'Consensus SEARCH-39150 

## Insert amplicons into the tree at the lowest common ancestor (LCA) of the genomes they were found in

In [21]:
t1 = time.process_time()
# NOTE(review): `cur_tips` is not defined in any cell visible here --
# presumably a collection of names already present in the tree; confirm it
# is created before this cell on a fresh kernel.
for read, tips in tips_per_read:
    if read in cur_tips:
        continue  # skip amplicons listed in cur_tips
    # Attach the amplicon as a new child of the lowest common ancestor of
    # all genomes that contain it.
    node = tree.lca(list(tips))
    node.append(TreeNode(name=read))
t2 = time.process_time()
print(t2 - t1)

146.282803792
