In [1]:
import warnings
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
from scipy.stats import spearmanr

# Seaborn plot theme.
sns.set_style(style="darkgrid")

# Fixed seed for NumPy's global random generator.
np.random.seed(seed=930525)

# Upper bounds on the number of columns / rows pandas prints for a frame.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 200)

# Apply the "once" action to all warnings.
warnings.simplefilter(action="once")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `watermark` extension and print the versions of the imported packages.
%load_ext watermark
%watermark --iversions

numpy   1.18.4
pandas  1.0.4
seaborn 0.10.1



In [2]:
# Input tables (tab-separated), given as paths relative to the working directory.
tax_table = "data/tax.capitalist.98.tsv"    # loaded below into df_tax
ko_table = "data/genefamilies.ko.cpm.tsv"   # loaded below into df_ko

In [3]:
# Load the KO table: the first line is the header and the first column
# becomes the row index.
df_ko = pd.read_csv(
    ko_table,
    sep="\t",
    index_col=0,
    header=0,
)

In [4]:
# Load the taxon table: the first line is the header and the first column
# becomes the row index.
df_tax = pd.read_csv(
    tax_table,
    sep="\t",
    index_col=0,
    header=0,
)

In [5]:
# Display the column labels of the KO table.
df_ko.columns

Index(['ASTER.02.S34.001.fa_Abundance-CPM', 'BIRO.74.S19.001.fa_Abundance-CPM',
       'BODA.52.S9.001.fa_Abundance-CPM', 'BUMA.05.S4.001.fa_Abundance-CPM',
       'CAAU.07.S2.001.fa_Abundance-CPM', 'CAPMA.83.S12.001.fa_Abundance-CPM',
       'CHMA.61.S3.001.fa_Abundance-CPM', 'COCL.11.S7.001.fa_Abundance-CPM',
       'DEFI.14.S6.001.fa_Abundance-CPM', 'DEIS.70.S26.001.fa_Abundance-CPM',
       'DETH.41.S27.001.fa_Abundance-CPM', 'DIAL.15.S23.001.fa_Abundance-CPM',
       'DIGMA.71.S21.001.fa_Abundance-CPM', 'DOEC.81.S30.001.fa_Abundance-CPM',
       'EKJO.64.S32.001.fa_Abundance-CPM', 'FAFE.16.S10.001.fa_Abundance-CPM',
       'FOGI.63.S13.001.fa_Abundance-CPM', 'GIFI.21.S22.001.fa_Abundance-CPM',
       'LAES.91.S29.001.fa_Abundance-CPM', 'MEEN.93.S11.001.fa_Abundance-CPM',
       'MERI.28.S20.001.fa_Abundance-CPM', 'MOEM.48.S16.001.fa_Abundance-CPM',
       'MUNA.98.S28.001.fa_Abundance-CPM', 'OPSE.53.S5.001.fa_Abundance-CPM',
       'PAMA.46.S18.001.fa_Abundance-CPM', 'PRPI.42.S31.

In [6]:
# Row labels of the KO table as loaded; the row masks in the next cell are
# built from these labels.
kos = df_ko.index

In [7]:
# Row selectors built from the gene-family labels in `kos`
# (one boolean per label, in the same order as `kos`):
#   ungrouped_mask - True where the label contains no "|" character
#   k_mask         - True where the label begins with "K"
ungrouped_mask = np.array(["|" not in label for label in kos])
k_mask = np.array([label.startswith("K") for label in kos])

In [8]:
# Keep only the rows whose label passed both checks. The rows are selected by
# label rather than by applying the positional masks directly: the masks are
# sized for the table as loaded, so they would no longer fit once df_ko has
# been filtered in place. Selecting by label gives the same rows on the first
# run and keeps the cell safe to re-run.
kept_labels = kos[ungrouped_mask & k_mask]
df_ko = df_ko.loc[df_ko.index.isin(kept_labels)]

df_ko.index

Index(['K00001', 'K00002', 'K00003', 'K00004', 'K00005', 'K00006', 'K00008',
       'K00009', 'K00010', 'K00012',
       ...
       'K19576', 'K19577', 'K19585', 'K19587', 'K19589', 'K19591', 'K19609',
       'K19610', 'K19611', 'K19648'],
      dtype='object', name='# Gene Family', length=7220)

In [9]:
# Put the columns of both tables into sorted label order.
# NOTE(review): the correlation cell further down presumably pairs KO and taxon
# values by column position, so this ordering has to place the same sample in
# the same position in both tables - confirm the two label sets sort alike.
df_ko, df_tax = (table.sort_index(axis=1) for table in (df_ko, df_tax))

In [10]:
# Keep only taxa that are both prevalent and abundant enough. Rows of df_tax are
# taxa and columns are samples, so each statistic is computed along axis=1
# instead of transposing the table back and forth.
# NOTE: df_tax is overwritten, so re-running this cell filters the already
# filtered table again (and the abundance denominator below changes with it).

# Prevalence: fraction of samples in which the taxon is detected (value > 0).
# The cut-off applied is 80% (>= .8).
# NOTE(review): an earlier comment here said 95%, but the code has always
# applied .8 - confirm which cut-off is intended.
prevalence = (df_tax > 0).sum(axis=1) / df_tax.shape[1]
# Despite the "low" in its name, this mask is True for the rows to KEEP.
mask_low_prevalence = prevalence >= .8

# Abundance: the taxon's median across samples, as a share of the sum of all
# per-taxon medians, must reach .0001 (0.01%).
median_per_taxon = df_tax.median(axis=1)
# As above, True marks the rows to keep.
mask_low_abundance = (median_per_taxon / median_per_taxon.sum()) >= .0001

df_tax = df_tax.loc[mask_low_prevalence & mask_low_abundance]

In [11]:
# (rows, columns) of the taxon table after the prevalence/abundance filter.
df_tax.shape

(355, 35)

In [12]:
# Keep only KOs (rows of df_ko) that are both prevalent and abundant enough,
# using the same criteria as the taxon filter above. Columns are samples, so
# each statistic is computed along axis=1 instead of transposing the table
# back and forth.
# NOTE: df_ko is overwritten, so re-running this cell filters the already
# filtered table again (and the abundance denominator below changes with it).

# Prevalence: fraction of samples in which the KO is detected (value > 0).
# The cut-off applied is 80% (>= .8).
# NOTE(review): an earlier comment here said 95% (and "species"), but the code
# has always applied .8 to KOs - confirm which cut-off is intended.
prevalence = (df_ko > 0).sum(axis=1) / df_ko.shape[1]
# Despite the "low" in its name, this mask is True for the rows to KEEP.
mask_low_prevalence = prevalence >= .8

# Abundance: the KO's median across samples, as a share of the sum of all
# per-KO medians, must reach .0001 (0.01%).
median_per_ko = df_ko.median(axis=1)
# As above, True marks the rows to keep.
mask_low_abundance = (median_per_ko / median_per_ko.sum()) >= .0001

df_ko = df_ko.loc[mask_low_prevalence & mask_low_abundance]

In [13]:
# (rows, columns) of the KO table after the prevalence/abundance filter.
df_ko.shape

(1325, 35)

In [None]:
# Spearman correlation of every KO (row of df_ko) with every taxon (row of
# df_tax). `rows` collects one (KO label, per-taxon results) pair per KO; the
# second item is what DataFrame.apply builds from the spearmanr result of each
# taxon row.
# spearmanr is called once per KO/taxon pair, i.e. df_ko.shape[0] *
# df_tax.shape[0] times in total, so this cell takes a long time to finish.
# NOTE(review): the two profiles are presumably paired by column position, not
# by column label - confirm both tables hold their samples in the same order.
rows = []

for ko_id, ko_profile in df_ko.iterrows():
    taxon_correlations = df_tax.apply(
        lambda taxon_profile: spearmanr(taxon_profile, ko_profile), axis=1
    )
    rows.append((ko_id, taxon_correlations))

In [15]:
# Number of KOs the correlation loop has appended to `rows`.
# NOTE(review): the saved output (149) is smaller than df_ko's 1325 rows and the
# loop cell has no execution count, which suggests the loop was interrupted
# before it finished - re-run and confirm before relying on `rows`.
len(rows)

149