In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Seed NumPy's legacy global RNG so any random draws in this notebook repeat.
np.random.seed(seed=930525)

# Figure styling for every seaborn/matplotlib plot drawn below.
sns.set_style(style="darkgrid")

# Cap how much of a DataFrame pandas renders: at most 200 rows and 20 columns.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 20)

# Show each distinct warning only the first time it is raised.
warnings.simplefilter(action="once")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `watermark` extension and print the versions of the imported
# packages, so the environment that produced the outputs is recorded.
%load_ext watermark
%watermark --iversions

numpy   1.19.5
seaborn 0.11.1
pandas  1.1.4



In [2]:
import csv

def get_taxamap(inf_path) -> dict:
    """Read a tab-separated file into a ``{first column: second column}`` dict.

    Parameters
    ----------
    inf_path : str or path-like
        Path of a tab-delimited text file. Only the first two fields of each
        row are used; any further fields are ignored.

    Returns
    -------
    dict
        Maps the first field of every row to its second field. If a key
        occurs more than once, the value from the last such row is kept.

    Raises
    ------
    IndexError
        If a non-blank row has fewer than two fields.
    """
    taxa_map = {}
    # newline="" is what the csv module documents for files given to csv.reader.
    with open(inf_path, newline="") as inf:
        for row in csv.reader(inf, delimiter="\t"):
            # csv.reader yields [] for blank lines (e.g. a trailing empty
            # line); skip them instead of failing on row[0].
            if not row:
                continue
            taxa_map[row[0]] = row[1]
    return taxa_map

  and should_run_async(code)


In [3]:
# First-column -> second-column lookup parsed from the GTDB r95 taxonomy file.
taxamap = get_taxamap("/mnt/btrfs/data/shogun/gtdb_95/r95.gtdb.tax")

# Inverse lookup (second column -> first column). When several keys of
# `taxamap` share one value, the key that appears last in `taxamap` wins.
reverse_taxamap = {value: key for key, value in taxamap.items()}

In [4]:
# Read the two tab-separated tables: `df` from taxonomy_s.GTDB98.txt and
# `df_clr` from taxonomy_clr_s.GTDB98.txt (same reader settings for both).
df, df_clr = (
    pd.read_csv(table_path, sep="\t")
    for table_path in (
        "../data/taxonomy_s.GTDB98.txt",
        "../data/taxonomy_clr_s.GTDB98.txt",
    )
)

In [5]:
# Bare expression: the notebook renders `df_clr` (read from
# taxonomy_clr_s.GTDB98.txt) so the "#taxonomy" column and the per-sample
# columns can be inspected.
df_clr

Unnamed: 0,#taxonomy,MCT.f.0013,MCT.f.0008,MCT.f.0012,MCT.f.0011,MCT.f.0009,MCT.f.0002,MCT.f.0007,MCT.f.0010,MCT.f.0016,...,MCT.f.0627,MCT.f.0622,MCT.f.0614,MCT.f.0620,MCT.f.0624,MCT.f.0625,MCT.f.0613,MCT.f.0623,MCT.f.0621,MCT.f.0616
0,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,6.043717,5.903111,6.088174,6.679279,6.237046,5.969751,6.408245,7.817042,7.462695,...,6.935302,7.438767,6.761265,6.692167,6.875378,6.270205,6.485274,7.240003,8.997453,7.128707
1,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,-0.371380,0.131670,-0.915346,0.132853,0.608912,0.130356,-0.442234,1.081536,0.240920,...,4.279896,4.689895,3.928051,4.430404,3.815107,5.773768,3.227178,4.637313,6.337194,-6.761317
2,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,3.508120,4.120654,4.296869,4.689358,4.561528,4.124617,4.515937,5.593241,4.640500,...,9.465130,9.839567,9.702603,8.954364,8.132595,10.315542,9.373507,9.558510,10.630389,10.482113
3,k__Bacteria;p__Actinobacteriota;c__Actinomycet...,3.094356,3.239614,3.503494,3.945055,3.442125,3.701899,3.371808,4.739163,4.214352,...,9.376238,9.816089,8.925264,9.212883,9.047997,10.481366,8.784006,9.798282,11.421079,9.635592
4,k__Bacteria;p__Actinobacteriota;c__Coriobacter...,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,k__Bacteria;p__Proteobacteria;c__Gammaproteoba...,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
264,k__Bacteria;p__Verrucomicrobiota;c__Lentisphae...,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
265,k__Bacteria;p__Verrucomicrobiota;c__Lentisphae...,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
266,k__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,-0.594524,0.131670,0.588731,1.519147,-0.084236,0.749395,-12.722820,-0.384801,-0.452227,...,8.398933,8.248808,9.052015,8.019463,9.618232,8.505349,8.776254,9.182379,10.678398,10.048278


In [6]:
def _taxonomy_to_otu_id(table, lookup):
    """Translate the ``#taxonomy`` column through ``lookup`` and rename it.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame with a ``#taxonomy`` column.
    lookup : dict
        Maps each ``#taxonomy`` value to the identifier that replaces it.

    Returns
    -------
    pandas.DataFrame
        New frame (``table`` itself is not modified) in which the column
        holds the mapped values and is named ``#OTU ID``; column order and
        all other columns are unchanged.

    Notes
    -----
    ``Series.map`` turns values missing from ``lookup`` into NaN. That would
    otherwise pass silently into the files written later, so a warning
    reports how many rows were affected.
    """
    taxonomy = table["#taxonomy"]
    otu_ids = taxonomy.map(lookup)

    # Rows that had a taxonomy string but no entry in the lookup.
    n_unmapped = int((otu_ids.isna() & taxonomy.notna()).sum())
    if n_unmapped:
        warnings.warn(
            f"{n_unmapped} '#taxonomy' value(s) are missing from the lookup "
            "and were mapped to NaN in '#OTU ID'",
            stacklevel=2,
        )

    return table.assign(**{"#taxonomy": otu_ids}).rename(
        columns={"#taxonomy": "#OTU ID"}
    )


# NOTE(review): the lookup is built from an r95 GTDB file while the tables are
# named GTDB98 -- confirm the two use the same taxonomy strings.
df = _taxonomy_to_otu_id(df, reverse_taxamap)
df_clr = _taxonomy_to_otu_id(df_clr, reverse_taxamap)

  and should_run_async(code)


In [7]:
# Persist both ID-keyed tables as tab-separated text. index=False leaves the
# positional row index out of the files.
df.to_csv(
    path_or_buf="../data/otu.GTDB98.txt",
    index=False,
    sep="\t",
)
df_clr.to_csv(
    path_or_buf="../data/otu.GTDB98.clr.txt",
    index=False,
    sep="\t",
)

In [8]:
# Bare expression: render `df_clr` again to check that the first data column
# is now "#OTU ID".
df_clr

Unnamed: 0,#OTU ID,MCT.f.0013,MCT.f.0008,MCT.f.0012,MCT.f.0011,MCT.f.0009,MCT.f.0002,MCT.f.0007,MCT.f.0010,MCT.f.0016,...,MCT.f.0627,MCT.f.0622,MCT.f.0614,MCT.f.0620,MCT.f.0624,MCT.f.0625,MCT.f.0613,MCT.f.0623,MCT.f.0621,MCT.f.0616
0,GCF_000010425.1,6.043717,5.903111,6.088174,6.679279,6.237046,5.969751,6.408245,7.817042,7.462695,...,6.935302,7.438767,6.761265,6.692167,6.875378,6.270205,6.485274,7.240003,8.997453,7.128707
1,GCF_001025135.1,-0.371380,0.131670,-0.915346,0.132853,0.608912,0.130356,-0.442234,1.081536,0.240920,...,4.279896,4.689895,3.928051,4.430404,3.815107,5.773768,3.227178,4.637313,6.337194,-6.761317
2,GCF_000196555.1,3.508120,4.120654,4.296869,4.689358,4.561528,4.124617,4.515937,5.593241,4.640500,...,9.465130,9.839567,9.702603,8.954364,8.132595,10.315542,9.373507,9.558510,10.630389,10.482113
3,GCF_001025215.1,3.094356,3.239614,3.503494,3.945055,3.442125,3.701899,3.371808,4.739163,4.214352,...,9.376238,9.816089,8.925264,9.212883,9.047997,10.481366,8.784006,9.798282,11.421079,9.635592
4,GCF_000156175.1,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,GCF_900200035.1,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
264,GCF_002998355.1,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
265,GCF_003096415.1,-13.162792,-13.129745,-12.790467,-12.435416,-13.212120,-12.997528,-12.722820,-12.665387,-13.425960,...,-10.160175,-8.571521,-7.947070,-8.361008,-8.465479,-8.047263,-8.647943,-8.336420,-7.147365,-6.761317
266,GCF_000020225.1,-0.594524,0.131670,0.588731,1.519147,-0.084236,0.749395,-12.722820,-0.384801,-0.452227,...,8.398933,8.248808,9.052015,8.019463,9.618232,8.505349,8.776254,9.182379,10.678398,10.048278


In [9]:
# Key the table by "#OTU ID" so that only the numeric sample columns remain.
# The guard keeps the cell re-runnable: a second run would otherwise raise
# KeyError because the column has already been moved into the index.
if df.index.name != "#OTU ID":
    df = df.set_index("#OTU ID")

# Divide every row by its own total, so each row sums to 1 across the sample
# columns. Vectorised equivalent of `df.apply(lambda x: x / x.sum(), axis=1)`.
# NOTE(review): this normalises per OTU (row), not per sample (column), and the
# result is only displayed, never assigned -- confirm that both are intended.
df.div(df.sum(axis=1), axis=0)

  and should_run_async(code)


Unnamed: 0_level_0,MCT.f.0013,MCT.f.0008,MCT.f.0012,MCT.f.0011,MCT.f.0009,MCT.f.0002,MCT.f.0007,MCT.f.0010,MCT.f.0016,MCT.f.0003,...,MCT.f.0627,MCT.f.0622,MCT.f.0614,MCT.f.0620,MCT.f.0624,MCT.f.0625,MCT.f.0613,MCT.f.0623,MCT.f.0621,MCT.f.0616
#OTU ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_000010425.1,0.004218,0.003546,0.003039,0.003848,0.005376,0.003321,0.003911,0.015109,0.022680,0.034786,...,0.000511,0.000173,0.000047,0.000066,0.000088,0.000032,0.000072,0.000112,0.000197,0.000021
GCF_001025135.1,0.000051,0.000082,0.000021,0.000041,0.000144,0.000072,0.000031,0.000133,0.000123,0.000246,...,0.000267,0.000082,0.000021,0.000051,0.000031,0.000144,0.000021,0.000062,0.000103,0.000000
GCF_000196555.1,0.000441,0.000788,0.000669,0.000695,0.001329,0.000693,0.000779,0.002159,0.001782,0.006031,...,0.008469,0.002515,0.001174,0.000841,0.000410,0.002396,0.001703,0.001501,0.001335,0.000782
GCF_001025215.1,0.000310,0.000347,0.000322,0.000351,0.000462,0.000483,0.000264,0.000978,0.001238,0.003731,...,0.008244,0.002614,0.000574,0.001158,0.001090,0.003009,0.001005,0.002030,0.003132,0.000357
GCF_000156175.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCF_900200035.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
GCF_002998355.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
GCF_003096415.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
GCF_000020225.1,0.000012,0.000025,0.000028,0.000050,0.000022,0.000040,0.000000,0.000009,0.000019,0.000062,...,0.004979,0.000875,0.001046,0.000564,0.003095,0.000669,0.001600,0.001759,0.002391,0.000866
