In [9]:
import pandas as pd
import numpy as np
from IPython.display import display
import csv
from collections import defaultdict
# Load the 100 bp shear table. The file is tab-separated and has no header
# row, so pandas labels the columns with integers starting at 0.
df = pd.read_csv(
    "../data/references/miniGWG_darth/shear100/sheared_bayes.txt",
    sep="\t",
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,k__Bacteria;p__Proteobacteria;c__Alphaproteoba...,0,1,48,20,12,241,2,92971,93299
1,k__Bacteria;p__Deinococcus-Thermus;c__Deinococ...,25,0,0,0,0,25,0,88888,89027
2,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...,0,20,45,41,29,928,0,124865,125928
3,k__Bacteria;p__Firmicutes;c__Erysipelotrichia;...,56,837,0,0,304,2,0,94217,96069
4,k__Viruses;p__ssRNA_viruses;c__;o__;f__Virgavi...,0,0,0,0,0,0,0,129,129


In [10]:
# Per-level uniqueness rate for the 100 bp shear table.
# Column 0 of `df` is the taxonomy string; columns 1-8 are paired, in order,
# with the eight names in `levels`; column 9 is used as the denominator
# (the total number of possible hits).
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
# The denominator is identical for every level, so compute it once.
total_hits = df[9].sum()
# A single vectorized column-wise sum replaces the per-column Python loop.
hits_per_level = df.iloc[:, 1:9].sum()
# Fraction of all possible hits attributed to each level, as a float array.
uniqueness_rate_per_level = (hits_per_level / total_hits).values.astype(float)
display(list(zip(levels, uniqueness_rate_per_level)))

[('kingdom', 0.00024427088981178973),
 ('phylum', 0.00045203286459036219),
 ('class', 0.00037099261803100767),
 ('order', 0.0013770542337594339),
 ('family', 0.0045528001044508127),
 ('genus', 0.010950355120068945),
 ('species', 0.0039106679441064454),
 ('strain', 0.87604409173555464)]

In [11]:
# Overall uniqueness rate for the 100 bp shear table: hits summed over
# columns 1-8, divided by the total in column 9.
total_hits = df.loc[:, 9].sum()
# Per-column totals for columns 1-8 ...
colsums = df.iloc[:, 1:9].sum(axis=0)
# ... collapsed into one grand total.
num_hits = colsums.sum()
num_hits / total_hits

0.89790226551037344

The kmer size is too large. For this experiment, we sheared 100 bps with a step size of 50. They just aren't unique enough at the strain level. Almost 87% of these reads are unique at the strain level. If we look at all levels, the uniqueness only rises to about 89%, with the highest uniqueness outside the strain level being at the genus level with about 1%.

Is it feasible for us to decrease the size of the kmers to 32bps? Or maybe even to 16bps?

In [25]:
# Get the size of the files
# IPython shell capture: `name = !cmd` runs the command and stores its output
# lines in `name` (shown as a list of strings when displayed). This syntax
# only works inside IPython/Jupyter, not in plain Python.
files_in_darth = !ls -alh "../data/references/miniGWG_darth/"
display(files_in_darth)

['total 285G',
 'drwxrwx--- 2 hillm096 knightslab  605 Jun 27 14:26 .',
 'drwxrwx--- 6 hillm096 knightslab  484 Jun 27 14:25 ..',
 '-rw-r----- 1 hillm096 knightslab  19G Jun 26 14:18 embalmer_otutable.fixed.txt',
 '-rw-r----- 1 hillm096 knightslab  11M Jun 23 10:34 embalmer_otutable.txt',
 '-rw-r----- 1 hillm096 knightslab  77G Jun 23 00:07 embalmer_results.b6',
 '-rw-r----- 1 hillm096 knightslab  77G Jun 26 12:01 embalmer_results.fixed_sample.b6',
 '-rw-r----- 1 hillm096 knightslab 2.3G Jun 26 14:18 embalmer_taxatable.fixed.txt',
 '-rw-r----- 1 hillm096 knightslab 1.5M Jun 23 10:34 embalmer_taxatable.txt',
 '-rw-r----- 1 hillm096 knightslab  16M Jun 22 15:09 miniGWG_darth_15.8.tax',
 '-rw-r----- 1 hillm096 knightslab    0 Jun 25 15:48 miniGWG_darth_15.fna',
 '-rw-r----- 1 hillm096 knightslab 7.9G Jun 22 16:23 miniGWG_darth_15.gg.utr',
 '-rw-r----- 1 hillm096 knightslab 1.5M Jun 22 16:23 miniGWG_darth_15.gg.utr.gg.log',
 '-rw-r----- 1 hillm096 knightslab  51G Jun 22 14:03 miniGWG_darth

So it would be a ~200GB sheared file with a 32bps and a 16bps sliding window. Is this feasible with embalmer?

In [14]:
# Load the 32 bp shear table (tab-separated, no header row) and compute the
# per-level uniqueness rate the same way as for the 100 bp table.
df_32 = pd.read_csv("../data/references/miniGWG_darth/shear32/sheared_bayes.txt", sep="\t", header=None)
# Columns 1-8 of `df_32` are paired, in order, with the eight names in
# `levels`; column 9 is used as the denominator (total possible hits).
levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
# The denominator is identical for every level, so compute it once.
total_hits = df_32[9].sum()
# A single vectorized column-wise sum replaces the per-column Python loop.
hits_per_level = df_32.iloc[:, 1:9].sum()
# Fraction of all possible hits attributed to each level, as a float array.
uniqueness_rate_per_level = (hits_per_level / total_hits).values.astype(float)
display(list(zip(levels, uniqueness_rate_per_level)))

[('kingdom', 0.0013420657585268855),
 ('phylum', 0.0011408068226012689),
 ('class', 0.001393227253969136),
 ('order', 0.001940149641640013),
 ('family', 0.0064696347476648252),
 ('genus', 0.014771946859077095),
 ('species', 0.0041695190973051882),
 ('strain', 0.68654350753079096)]

In [16]:
# Overall uniqueness rate for the 32 bp shear table: hits summed over
# columns 1-8, divided by the total in column 9.
total_hits = df_32.loc[:, 9].sum()
# Per-column totals for columns 1-8 ...
colsums = df_32.iloc[:, 1:9].sum(axis=0)
# ... collapsed into one grand total.
num_hits = colsums.sum()
num_hits / total_hits

0.71777085771157545

In [None]:
# Group values from the alignment file by the annotation in each row's last
# column: for every row whose last field contains 't__' (presumably the
# strain-level prefix -- TODO confirm), append the integer in column index 8.
hits_dict = defaultdict(list)
# newline="" is what the csv module recommends for files handed to csv.reader.
with open("../data/references/miniGWG_darth/shear32/embalmer_align.b6", newline="") as inf:
    csv_inf = csv.reader(inf, delimiter="\t")
    for row in csv_inf:
        # csv.reader yields [] for blank lines; guard so row[-1] cannot raise.
        if row and 't__' in row[-1]:
            hits_dict[row[-1]].append(int(row[8]))

# NOTE(review): the original loop body was empty (a SyntaxError). Writing one
# key per line is inferred from the output file name -- confirm the intent.
with open("./hits_dict.keys", "w") as outf:
    for key in hits_dict:
        outf.write(key + "\n")

In [None]:
# Number of distinct keys collected in hits_dict.
len(hits_dict)