In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

numpy   1.19.5
seaborn 0.11.1
pandas  1.1.4



In [2]:
import csv

def get_taxamap(inf_path) -> dict:
    taxa_map = dict()
    with open(inf_path) as inf:
        csv_reader = csv.reader(inf, delimiter="\t")
        for row in csv_reader:
            taxa_map[row[0]] = row[1]
    return taxa_map

  and should_run_async(code)


In [3]:
taxamap = get_taxamap("/mnt/btrfs/data/shogun/gtdb_95/r95.gtdb.tax")
reverse_taxamap = dict(zip(taxamap.values(), taxamap.keys()))

In [4]:
df_rerun = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_35_rerun/results-211221/results-211221/taxatable.species.txt", sep="\t")

df_rerun_t1 = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_35_rerun/results-211221/results-211221/results_type_1/predictions.csv", index_col=0)

df_rerun_t1["#OTU ID"] = df_rerun_t1["assembly_accession"].map(taxamap)

df_rerun_merge = pd.merge(df_rerun, df_rerun_t1[["#OTU ID", "ungapped_genome_length", "predictions"]], how="inner", on="#OTU ID")

df_rerun_merge_present = df_rerun_merge[df_rerun_merge["predictions"]].drop(columns=["predictions"])

df_rerun_merge_present = df_rerun_merge_present.set_index("#OTU ID")

df_rerun_merge_present = df_rerun_merge_present.loc[df_rerun_merge_present.apply(lambda x: np.any(x > 1000), axis=1), :]

correction_factor = (df_rerun_merge_present["ungapped_genome_length"] / df_rerun_merge_present["ungapped_genome_length"].mean())

df_rerun_merge_present = df_rerun_merge_present.drop(columns="ungapped_genome_length")

# drop low abundance samples
mask = df_rerun_merge_present.sum(axis=0) > 23500
df_rerun_merge_present = df_rerun_merge_present.loc[:, mask]

# correct for genome length
df_rerun_merge_present = df_rerun_merge_present.apply(lambda x: x / correction_factor, axis=0)
    
df_rerun_merge_present = df_rerun_merge_present.reset_index()

In [5]:
df_rerun_merge_present.shape

  and should_run_async(code)


(293, 69)

In [6]:
df = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_035/results-211221/results-211221/taxatable.species.txt", sep="\t")

df_t1 = pd.read_csv("/mnt/btrfs/data/foodworks/Knights_Project_35_rerun/results-211221/results-211221/results_type_1/predictions.csv", index_col=0)

df_t1["#OTU ID"] = df_t1["assembly_accession"].map(taxamap)

df_merge = pd.merge(df, df_t1[["#OTU ID", "ungapped_genome_length", "predictions"]], how="inner", on="#OTU ID")

df_merge_present = df_merge[df_merge["predictions"]].drop(columns=["predictions"])

df_merge_present = df_merge_present.set_index("#OTU ID")

df_merge_present = df_merge_present[df_merge_present.apply(lambda x: np.any(x > 1000), axis=1)]

correction_factor = (df_merge_present["ungapped_genome_length"] / df_merge_present["ungapped_genome_length"].mean())

df_merge_present = df_merge_present.drop(columns="ungapped_genome_length")

# drop low abundance samples
mask = df_merge_present.sum(axis=0) > 23500
df_merge_present = df_merge_present.loc[:, mask]

# correct for genome length
df_merge_present = df_merge_present.apply(lambda x: x / correction_factor, axis=0)

df_merge_present = df_merge_present.reset_index()

In [7]:
s_all_taxa = set(df_rerun_merge_present["#OTU ID"]).union(set(df_merge_present["#OTU ID"]))

df_rerun_merge_present = df_rerun_merge_present.T
df_merge_present = df_merge_present.T

df_rerun_merge_present.columns = df_rerun_merge_present.iloc[0, :]
df_merge_present.columns = df_merge_present.iloc[0, :]


for taxa in s_all_taxa:
    if taxa:
        if not taxa in df_merge_present.columns:
            df_merge_present[taxa] = [taxa] + [0]*(df_merge_present.shape[0] - 1)
        if not taxa in df_rerun_merge_present.columns:
            df_rerun_merge_present[taxa] = [taxa] + [0]*(df_rerun_merge_present.shape[0] - 1)
df_rerun_merge_present = df_rerun_merge_present.T
df_merge_present = df_merge_present.T

df_rerun_merge_present = df_rerun_merge_present.reset_index(drop=True)
df_merge_present = df_merge_present.reset_index(drop=True)

df_rerun_merge_present = df_rerun_merge_present.sort_values("#OTU ID")
df_merge_present = df_merge_present.sort_values("#OTU ID")

  and should_run_async(code)


In [8]:
samples_rerun = [".".join(column.split(".")[:3]) if len(column.split(".")) > 2 and column.split(".")[3].startswith("S") else "" for column in df_rerun_merge_present.columns]

In [9]:
samples = [".".join(column.split(".")[:3]) if len(column.split(".")) > 2 and column.split(".")[3].startswith("S") else "" for column in df_merge_present.columns ]

In [10]:
d_rerun_samples = dict(zip(df_rerun_merge_present.columns, samples_rerun))

d_samples = dict(zip(samples, df_merge_present.columns))

In [11]:
for k_rerun, v_rerun in d_rerun_samples.items():
    if not k_rerun.startswith("#"):
        if v_rerun in d_samples:
            df_merge_present[d_samples[v_rerun]] = df_rerun_merge_present[k_rerun]
            df_merge_present.rename(columns = {d_samples[v_rerun]:k_rerun}, inplace = True)

In [12]:
# df_rerun_merge_present = df_rerun_merge_present.T

# df_rerun_merge_present["sample_num"] = samples_rerun

# df_rerun_merge_present = df_rerun_merge_present.T

In [13]:

# for sample_num in df_rerun_merge_present["sample_num"].values:
#     df_rerun_merge_present["sample_num"]

In [14]:
# df_rerun_merge_present.reset_index(drop=True)

In [15]:
df_merge_present = df_merge_present.iloc[(df_merge_present.set_index("#OTU ID").max(axis=1) > 1000).values]

In [16]:
df_merge_present

  and should_run_async(code)


Unnamed: 0,#OTU ID,MCT.f.0556.S358.R1.001.fa,MCT.f.0437.S306.R1.001.fa,MCT.f.0108.S291.R1.001.fa,MCT.f.0565.S67.R1.001.fa,MCT.f.0336.S208.R1.001.fa,MCT.f.0334.S316.R1.001.fa,MCT.f.0434.S477.R1.001.fa,MCT.f.0057.S371.R1.001.fa,MCT.f.0154.S465.R1.001.fa,...,MCT.f.0547.S481.R1.001.fa,MCT.f.0035.S110.R1.001.fa,MCT.f.0470.S483.R1.001.fa,Blank.B12.040.S408.R1.001.fa,MCT.f.0404.S475.R1.001.fa,MCT.f.0616.S278.R1.001.fa,MCT.f.0382.S441.R1.001.fa,MCT.f.0050.S372.R1.001.fa,MCT.f.0495.S202.R1.001.fa,MCT.f.0187.S438.R1.001.fa
0,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,121.082,135215,3.06536,101.157,56.7092,88.8955,7896.37,44.4478,338.723,...,93.4936,26.0556,45.9804,13.7941,171.66,3.06536,64.3726,9.19609,3.06536,0
1,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,55.0442,70.978,7.24265,1.44853,1.44853,2.89706,7.24265,4.34559,10.1397,...,14.4853,1.44853,1.44853,0,0,0,7.24265,0,0,0
2,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,3040.06,849.982,715.703,1483.78,695.562,1333.38,388.064,1674.45,774.786,...,919.806,333.01,6.71392,44.3118,73.8531,87.2809,45.6546,95.3376,322.268,0
3,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,4365.85,1492.2,1.38423,1741.36,1533.72,3461.95,71.9798,885.905,8.30536,...,2551.13,539.849,1054.78,31.8372,319.756,37.3741,0,292.072,0,0
5,k__Bacteria;p__Actinobacteriota;c__Coriobacter...,0,0,0,0,0,0,0,0,7.10029,...,0,0,0,5.32522,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,k__Bacteria;p__Verrucomicrobiota;c__Lentisphae...,0,0,0,0,1152.77,723.917,0,0,0,...,0,0,0.610901,0,0,0,0.610901,0,0,1.2218
289,k__Bacteria;p__Verrucomicrobiota;c__Lentisphae...,0,0,0,0,76.379,40.0374,0,0,0.61596,...,0,0,89.9301,0,0,0,37.5735,0,0,57.2843
290,k__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
291,k__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,1364.49,57.7051,48.0876,12506.4,733.336,4995.1,7.21314,0,67.3227,...,573.445,0,4.80876,33.6613,0,56.503,195.957,0,1471.48,0


In [17]:
np.array([value.startswith("k__Arch") for value in df_merge_present["#OTU ID"].values]).sum()

  and should_run_async(code)


0

In [18]:
df_merge_present_otu = df_merge_present.copy()
df_merge_present_otu["#OTU ID"] = df_merge_present_otu["#OTU ID"].map(reverse_taxamap)

In [19]:
df_merge_present.shape

(269, 439)

In [20]:
df_merge_present.to_csv("../data/species.gtdb98.txt", index=False, sep="\t")
df_merge_present_otu.to_csv("../data/otu.gtdb98.txt", index=False, sep="\t")