In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.11.1
numpy   1.19.5
pandas  1.1.4



In [2]:
import csv

def get_taxamap(inf_path) -> dict:
    taxa_map = dict()
    with open(inf_path) as inf:
        csv_reader = csv.reader(inf, delimiter="\t")
        for row in csv_reader:
            taxa_map[row[0]] = row[1]
    return taxa_map

  and should_run_async(code)


In [3]:
taxamap = get_taxamap("/mnt/btrfs/data/shogun/gtdb_95/r95.gtdb.tax")
reverse_taxamap = dict(zip(taxamap.values(), taxamap.keys()))

In [4]:
df_rerun = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_35_rerun/results-211221/results-211221/taxatable.species.txt", sep="\t")

df_rerun_t1 = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_35_rerun/results-211221/results-211221/results_type_1/predictions.csv", index_col=0)

df_rerun_t1["#OTU ID"] = df_rerun_t1["assembly_accession"].map(taxamap)

df_rerun_merge = pd.merge(df_rerun, df_rerun_t1[["#OTU ID", "ungapped_genome_length", "predictions"]], how="inner", on="#OTU ID")

df_rerun_merge_present = df_rerun_merge[df_rerun_merge["predictions"]].drop(columns=["predictions"])

df_rerun_merge_present = df_rerun_merge_present.set_index("#OTU ID")

df_rerun_merge_present = df_rerun_merge_present.loc[df_rerun_merge_present.apply(lambda x: np.any(x > 1000), axis=1), :]

correction_factor = (df_rerun_merge_present["ungapped_genome_length"] / df_rerun_merge_present["ungapped_genome_length"].mean())

df_rerun_merge_present = df_rerun_merge_present.drop(columns="ungapped_genome_length")

# drop low abundance samples
mask = df_rerun_merge_present.sum(axis=0) > 23500
df_rerun_merge_present = df_rerun_merge_present.loc[:, mask]

# correct for genome length
df_rerun_merge_present = df_rerun_merge_present.apply(lambda x: x / correction_factor, axis=0)

df_rerun_merge_present = df_rerun_merge_present.reset_index()

In [5]:
df = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_035/results-211221/results-211221/taxatable.species.txt", sep="\t")

df_t1 = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_035/results-211221/results-211221/results_type_1/predictions.csv", index_col=0)

df_t1["#OTU ID"] = df_t1["assembly_accession"].map(taxamap)

df_merge = pd.merge(df, df_t1[["#OTU ID", "ungapped_genome_length", "predictions"]], how="inner", on="#OTU ID")

df_merge_present = df_merge[df_merge["predictions"]].drop(columns=["predictions"])

df_merge_present = df_merge_present.set_index("#OTU ID")

df_merge_present = df_merge_present[df_merge_present.apply(lambda x: np.any(x > 1000), axis=1)]

correction_factor = (df_merge_present["ungapped_genome_length"] / df_merge_present["ungapped_genome_length"].mean())

df_merge_present = df_merge_present.drop(columns="ungapped_genome_length")

# drop low abundance samples
mask = df_merge_present.sum(axis=0) > 23500
df_merge_present = df_merge_present.loc[:, mask]

# correct for genome length
df_merge_present = df_merge_present.apply(lambda x: x / correction_factor, axis=0)

df_merge_present = df_merge_present.reset_index()

  and should_run_async(code)
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
s_all_taxa = set(df_rerun_merge_present["#OTU ID"]).union(set(df_merge_present["#OTU ID"]))

df_rerun_merge_present = df_rerun_merge_present.T
df_merge_present = df_merge_present.T

df_rerun_merge_present.columns = df_rerun_merge_present.iloc[0, :]
df_merge_present.columns = df_merge_present.iloc[0, :]


for taxa in s_all_taxa:
    if taxa:
        if not taxa in df_merge_present.columns:
            df_merge_present[taxa] = [taxa] + [0]*(df_merge_present.shape[0] - 1)
        if not taxa in df_rerun_merge_present.columns:
            df_rerun_merge_present[taxa] = [taxa] + [0]*(df_rerun_merge_present.shape[0] - 1)
df_rerun_merge_present = df_rerun_merge_present.T
df_merge_present = df_merge_present.T

df_rerun_merge_present = df_rerun_merge_present.reset_index(drop=True)
df_merge_present = df_merge_present.reset_index(drop=True)

df_rerun_merge_present = df_rerun_merge_present.sort_values("#OTU ID")
df_merge_present = df_merge_present.sort_values("#OTU ID")

  and should_run_async(code)


In [7]:
samples_rerun = [".".join(column.split(".")[:3]) if len(column.split(".")) > 2 and column.split(".")[3].startswith("S") else "" for column in df_rerun_merge_present.columns]

In [8]:
samples = [".".join(column.split(".")[:3]) if len(column.split(".")) > 2 and column.split(".")[3].startswith("S") else "" for column in df_merge_present.columns ]

In [9]:
d_rerun_samples = dict(zip(df_rerun_merge_present.columns, samples_rerun))

d_samples = dict(zip(samples, df_merge_present.columns))

In [10]:
for k_rerun, v_rerun in d_rerun_samples.items():
    if not k_rerun.startswith("#"):
        if v_rerun in d_samples:
            df_merge_present[d_samples[v_rerun]] = df_rerun_merge_present[k_rerun]
            df_merge_present.rename(columns = {d_samples[v_rerun]:k_rerun}, inplace = True)

In [11]:
# df_rerun_merge_present = df_rerun_merge_present.T

# df_rerun_merge_present["sample_num"] = samples_rerun

# df_rerun_merge_present = df_rerun_merge_present.T

In [12]:

# for sample_num in df_rerun_merge_present["sample_num"].values:
#     df_rerun_merge_present["sample_num"]

In [13]:
# df_rerun_merge_present.reset_index(drop=True)

In [14]:
df_merge_present = df_merge_present.iloc[(df_merge_present.set_index("#OTU ID").max(axis=1) > 1000).values]

In [15]:
df_merge_present

  and should_run_async(code)


Unnamed: 0,#OTU ID,MCT.f.0556.S358.R1.001.fa,MCT.f.0437.S306.R1.001.fa,MCT.f.0108.S291.R1.001.fa,MCT.f.0565.S67.R1.001.fa,MCT.f.0336.S208.R1.001.fa,MCT.f.0334.S316.R1.001.fa,MCT.f.0434.S477.R1.001.fa,MCT.f.0057.S371.R1.001.fa,MCT.f.0154.S465.R1.001.fa,...,Blank.B12.040.S408.R1.001.fa,MCT.f.0404.S475.R1.001.fa,MCT.f.0616.S278.R1.001.fa,MCT.f.0382.S441.R1.001.fa,MCT.f.0050.S372.R1.001.fa,MCT.f.0053.S40.L001.R1.001.fa,MCT.f.0036.S227.R1.001.fa,MCT.f.0495.S202.R1.001.fa,MCT.f.0579.S60.L001.R1.001.fa,MCT.f.0187.S438.R1.001.fa
0,k__Archaea;p__Thermoplasmatota;c__Thermoplasma...,0,0,0,0,374.753,844.36,0,0,0,...,0,0,0,0,0,4.59804,0,0,3801.05,0
1,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,22.9346,0,1.27415,0,0,0,0,0,1.27415,...,2.54829,0,0,0,0,4.34559,2.54829,0,2319.1,0
2,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,0,1.32335,0,0,1.32335,1.32335,0,1.32335,1.32335,...,0,0,0,0,1.32335,3528.83,3.97006,0,1083.63,0
3,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,2.45791,3.68687,11.0606,4.91583,14.7475,1.22896,0,0,7.37374,...,0,0,0,0,0,412.5,0,0,449.874,0
5,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,113.556,126811,2.87484,94.8699,53.1846,83.3705,7405.6,41.6852,317.67,...,12.9368,160.991,2.87484,60.3717,8.62453,0,31.6233,2.87484,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,k__Bacteria;p__Verrucomicrobiota;c__Lentisphae...,0,0,0,0,71.6319,37.549,0,0,0.577677,...,0,0,0,35.2383,0,0,0,0,0,53.7239
642,k__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
643,k__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,1279.68,54.1187,45.0989,11729.1,687.758,4684.65,6.76483,0,63.1384,...,31.5692,0,52.9912,183.778,0,0,1.12747,1380.03,0,0
644,k__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,41.7064,14567.2,13389.7,227.931,4.84959,62.0747,1724.51,0.969917,1.93983,...,32.9772,18.4284,0,1.93983,1.93983,0,0,8.72926,0,1.93983


In [16]:
np.array([value.startswith("k__Arch") for value in df_merge_present["#OTU ID"].values]).sum()

  and should_run_async(code)


1

In [17]:
df_merge_present_otu = df_merge_present.copy()
df_merge_present_otu["#OTU ID"] = df_merge_present_otu["#OTU ID"].map(reverse_taxamap)

In [18]:
df_merge_present.shape

(477, 445)

In [19]:
df_merge_present.to_csv("../data/species.gtdb98.txt", index=False, sep="\t")
df_merge_present_otu.to_csv("../data/otu.gtdb98.txt", index=False, sep="\t")