In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Plot styling for every figure in the notebook.
sns.set_style("darkgrid")
# Seed NumPy's global RNG; the random draws further down use np.random directly.
np.random.seed(930525)
# Limit how many columns/rows pandas renders when a frame is displayed.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

# Report each distinct warning only the first time it is raised.
warnings.simplefilter('once')

# Render figures inline and print the versions of the imported packages.
%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.10.1
pandas  1.1.4
numpy   1.19.5



In [23]:
def shannon_index(p):
    """Return the Shannon index ``-sum(p_i * ln(p_i))`` of the proportions ``p``.

    Parameters
    ----------
    p : array-like of float
        Proportions; the caller is responsible for normalising them so that
        they sum to 1.

    Returns
    -------
    float
        The Shannon index in nats (natural logarithm).

    Notes
    -----
    Entries that are exactly zero are skipped, following the convention
    ``0 * ln(0) = 0``. Without this, a single zero proportion would turn the
    whole result into ``nan``. Negative entries are not filtered and still
    yield ``nan``.
    """
    p = np.asarray(p, dtype=float)
    # Drop exact zeros only; every other value is summed as before.
    present = p[p != 0]
    return -np.sum(present * np.log(present))

In [24]:
# Sanity check: five equally abundant species should give ln(5).
counts = np.full(5, 3)
v = counts / counts.sum()

shannon_index(v)

1.6094379124341005

In [4]:
import numpy as np
from scipy.optimize import minimize, LinearConstraint

In [5]:
def minimize_shannon_index(num_reads: int, num_species: int, s: float, min_bound: int = 1_000):
    """Search for species proportions whose Shannon index is close to ``s``.

    Starting from a random normalised vector (drawn from NumPy's global RNG),
    this minimises ``(s - shannon_index(x)) ** 2`` subject to the proportions
    summing to 1 and each proportion staying within its bounds.

    Parameters
    ----------
    num_reads : int
        Total number of reads; only used to convert ``min_bound`` into a
        minimum proportion.
    num_species : int
        Number of proportions to solve for.
    s : float
        Target Shannon index.
    min_bound : int, default 1_000
        Minimum number of reads per species. Each proportion is bounded below
        by ``min_bound / num_reads`` and above by what is left once every
        other species has received that minimum.

    Returns
    -------
    The result object returned by ``scipy.optimize.minimize``; the
    proportions are in ``res.x``. The result is returned unchecked, so
    callers should inspect it if convergence matters.

    Raises
    ------
    ValueError
        If ``num_species`` or ``num_reads`` is not positive, or if the minimum
        reads per species cannot fit into ``num_reads``.
    """
    if num_species < 1:
        raise ValueError(f"num_species must be at least 1, got {num_species}")
    if num_reads <= 0:
        raise ValueError(f"num_reads must be positive, got {num_reads}")
    if num_species * min_bound > num_reads:
        # Otherwise the upper bound would fall below the lower bound.
        raise ValueError(
            f"cannot give {num_species} species at least {min_bound} reads each "
            f"out of {num_reads} reads"
        )

    def objective_function(x):
        return np.square(s - shannon_index(x))

    # Random starting point, normalised to sum to 1.
    x0 = np.random.random(num_species)
    x0 = x0 / x0.sum()

    # Proportions must sum to exactly 1.
    constraint = LinearConstraint(np.ones(num_species), lb=1, ub=1)

    lower = min_bound / num_reads
    upper = 1 - ((num_species - 1) * lower)
    bounds = [(lower, upper)] * num_species

    res = minimize(
        objective_function,
        x0=x0,
        constraints=constraint,
        bounds=bounds,
    )
    return res

In [6]:
# Single trial run of the optimiser: 20 species, 10 million reads,
# target Shannon index 1.663.
num_reads = 10_000_000
num_species = 20
s = 1.663

res = minimize_shannon_index(num_reads, num_species, s)

In [7]:
# Shannon index actually reached by the optimiser; compare with the target `s`.
shannon_index(res.x)

1.661826487041822

In [15]:
# Preview of the target Shannon indices; the middle entry is the mean of 2.149 and 1.985.
[1.663, (2.149 + 1.985) / 2, 2.665]

[1.663, 2.067, 2.665]

In [8]:
# columns name, assembly_accession, proportion, num_reads, dataset, file
# NOTE(review): the header row written to simulation.truth.csv lists different
# columns than the line above — confirm which one is current.
# Dataset names, paired position-by-position with their target Shannon indices.
names = ["stool", "skin", "saliva"]
l_shannon_index = [1.663, (2.149 + 1.985) / 2, 2.665]
# NOTE(review): the simulation loop iterates over num_replicates*2 replicates — confirm intended.
num_replicates = 3
# Every dataset is simulated once per species count.
l_num_species = [20, 50, 100]
# Total reads per simulated sample (rebinds the value set in the earlier trial cell).
num_reads = 10_000_000

In [9]:
from glob import glob

import os

In [10]:
# Glob pattern matching the per-sample .b6 files.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_paths = "/mnt/btrfs/data/type_1/assembly_wgs_500_alignment/b6_capitalist_split_by_sample/*.b6"

In [11]:
d_sra_to_filepath = {}

# Map each value of the 'Run' column to its 'closest_assembly_accession'.
df_merged_in_gtdb = pd.read_csv("../data/assembly_accessions_500.csv", index_col=0)
d_sra_to_closest_assembly_accession = dict(zip(df_merged_in_gtdb['Run'].values, df_merged_in_gtdb['closest_assembly_accession'].values))

# glob returns paths in arbitrary, filesystem-dependent order. Sorting fixes
# the insertion order of the dict, so the seeded random choice over its keys
# further down picks the same species on every machine.
for file in sorted(glob(file_paths)):
    bn = os.path.basename(file)
    # The key is the part of the file name before the first ".".
    sra = bn.split(".")[0]
    # Keep only files whose key has a known closest assembly.
    if sra in d_sra_to_closest_assembly_accession:
        d_sra_to_filepath[sra] = file

In [12]:
import csv

In [13]:
# Bare expression: only displays the csv.writer object; it has no effect on the analysis.
csv.writer

<function _csv.writer>

In [14]:
import csv
import os

output_csv = "../data/simulation.truth.csv"

with open(output_csv, "w") as out_file:
    out_csv = csv.writer(out_file)
    out_csv.writerow(["file_location", "num_lines", "basename", "run", "file_location", "output_location", "closest_assembly_accession"])
    for name, s_i in zip(names, l_shannon_index):
        for num_species in l_num_species:
            for replicate in range(num_replicates*2):
                basename = f"{name}.{replicate}.{num_species}"
                output_location = f"../data/simulation.{basename}.b6"
                res = minimize_shannon_index(num_reads, num_species, s_i)
                l_read_distribution = (res.x * num_reads).astype(int)
                l_species = np.random.choice(np.array(list(d_sra_to_filepath.keys())), num_species, replace=False)
                for num_lines, species in zip(l_read_distribution, l_species):
                    file_location = d_sra_to_filepath[species]
                    !shuf -n {num_lines} -r {file_location} >> {output_location}
                    out_csv.writerow([os.path.abspath(file_location), num_lines, basename, species, os.path.abspath(file_location), os.path.abspath(output_location), d_sra_to_closest_assembly_accession[species]])
#                     break
#                 break
#             break
#         break
    




In [None]:
percent_coverage = [.2, .3, .2, .3, .01]

thresholds = [.1, .2, .3, .4, .5, .6, .7, .8]

# TODO: the loop below is a pseudocode sketch, not valid Python. It is kept as
# a comment so the notebook still runs top to bottom; `samples` and
# `precision` are not defined anywhere in the cells shown.
# for threshold in thresholds:
#     fetch samples percent_coverage < threshold:
#         precision(samples)