Data from Holman et al. (2021)

[Animals, protists and bacteria share marine biogeographic patterns](https://www.nature.com/articles/s41559-021-01439-7#Sec8)

[Data from Zenodo](https://zenodo.org/records/4564075)

![image.png](attachment:image.png)

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import glob
import ipyparallel as ipp
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import shutil

from collections import Counter
from itertools import combinations
from scipy.stats import entropy

import sys
sys.path.append('/home/isaac/proj/IMEMEBA-BCI')
import BCI
import Project

# Raise the pandas display limits (columns, rows) used when DataFrames are
# rendered in the cells below.
for option, limit in (('display.max_columns', 100), ('display.max_rows', 50)):
    pd.set_option(option, limit)


# Read the asv table

The first column of the ASV table must contain the ASV names, and these must exactly match the names of the
ASVs in the FASTA file.

In [43]:
# Input files (from the Zenodo download): the raw ASV count table and the
# matching sequence file.
asv_table = "./_zenodo_data/rawdata/COI_DADA.csv"
fasta_data = "./_zenodo_data/rawdata/DADA2.COI.OTUs.gz"

# Output files written by later cells: the ASV table with unused samples
# removed, and the sample-to-site map (created once the table is loaded).
cleaned_asv_table = "./asv_table-cleaned.csv"
sitemap = './sitemap.txt'

# The first CSV column (the OTU ids) becomes the row index.
metadata = pd.read_csv(asv_table, index_col=0)
metadata

Unnamed: 0_level_0,L.AQ1,L.BR1,L.BR2,L.BR3,L.C1,L.C2,L.C3,L.C4,L.C5,L.C6,L.C7,L.C8,L.C9,L.CN1,L.CN2,L.CN3,L.DC1,L.DC2,L.DC3,L.DU1,L.DU2,L.DU3,L.EL1,L.EL2,L.EL3,L.HB1,L.HB2,L.HB3,L.HN1,L.HN2,L.HN3,L.KN1,L.KN2,L.KN3,L.MB1,L.MB2,L.MB3,L.MN1,L.MN2,L.MN3,L.NK1,L.NK2,L.NK3,L.PA1,L.PA2,L.PA3,L.PCR1,L.PE1,L.PE2,L.PE3,L.RB1,L.RB2,L.RB3,L.RN1,L.RN2,L.RN3,L.SM1,L.SM2,L.SM3,L.SN1,L.SN2,L.SN3,L.SY1,L.SY2,L.SY3,L.TB1,L.TB2,L.TB3,Unnamed: 69
OTUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1
OTU_1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10557,13553,21689,22858,40972,30060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,37929,47834,22757,25940,27125,24402,17105,25596,36467,0,8,0,
OTU_10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14619,18671,29087,7817,14340,10318,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,140,40,
OTU_100,0,121,110,100,0,0,0,0,0,0,0,0,0,607,963,736,0,0,0,0,0,0,0,0,0,61,51,140,522,940,616,209,278,176,345,212,253,497,419,443,512,843,890,64,101,91,0,52,92,61,0,0,0,0,0,0,39,56,27,68,75,52,14,17,34,331,1163,201,
OTU_1000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,5,0,0,14,9,270,212,232,122,107,103,21,38,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
OTU_10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU_9995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,3,4,0,
OTU_9996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,
OTU_9997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
OTU_9998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,9,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,


In [44]:
# Remove control, other, and unidentified sites
drop_cols = ["L.C1", "L.C2", "L.C3", "L.C4", "L.C5", "L.C6", "L.C7", "L.C8", "L.C9",
            "L.PCR1", "Unnamed: 69", "L.DC1", "L.DC2", "L.DC3", "L.AQ1"]
# `metadata` is reassigned here, so a plain drop() raises KeyError if this cell
# is run a second time (the columns are already gone). errors="ignore" makes
# the cell safe to re-run without reloading the table.
metadata = metadata.drop(columns=drop_cols, errors="ignore")
# Persist the cleaned table; this is the file handed to the BCI project below.
metadata.to_csv(cleaned_asv_table)
metadata

Unnamed: 0_level_0,L.BR1,L.BR2,L.BR3,L.CN1,L.CN2,L.CN3,L.DU1,L.DU2,L.DU3,L.EL1,L.EL2,L.EL3,L.HB1,L.HB2,L.HB3,L.HN1,L.HN2,L.HN3,L.KN1,L.KN2,L.KN3,L.MB1,L.MB2,L.MB3,L.MN1,L.MN2,L.MN3,L.NK1,L.NK2,L.NK3,L.PA1,L.PA2,L.PA3,L.PE1,L.PE2,L.PE3,L.RB1,L.RB2,L.RB3,L.RN1,L.RN2,L.RN3,L.SM1,L.SM2,L.SM3,L.SN1,L.SN2,L.SN3,L.SY1,L.SY2,L.SY3,L.TB1,L.TB2,L.TB3
OTUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
OTU_1,0,0,0,0,0,0,0,0,0,0,0,0,10557,13553,21689,22858,40972,30060,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,37929,47834,22757,25940,27125,24402,17105,25596,36467,0,8,0
OTU_10,0,0,0,0,0,0,0,0,0,0,0,0,14619,18671,29087,7817,14340,10318,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,140,40
OTU_100,121,110,100,607,963,736,0,0,0,0,0,0,61,51,140,522,940,616,209,278,176,345,212,253,497,419,443,512,843,890,64,101,91,52,92,61,0,0,0,0,0,0,39,56,27,68,75,52,14,17,34,331,1163,201
OTU_1000,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,5,0,0,14,9,270,212,232,122,107,103,21,38,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_10000,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU_9995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,3,4,0
OTU_9996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0
OTU_9997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
OTU_9998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,9,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Create sitemap file

The sitemap file is tab-delimited text with one sample/site pair per line, formatted like this:

    sample1    site1
    sample2    site1
    sample3    site2
    sample4    site2
    ...
    samplen    siten


In [42]:
# In the metadata remove the 'L.' prefix and the trailing integer to isolate
# site codes (e.g. 'L.BR1' -> 'BR'). All trailing digits are stripped, so a
# sample numbered 10 or above still maps to the right site.
site_of = {col: col[2:].rstrip("0123456789") for col in metadata.columns}

# Sort the unique site codes so the sitemap rows come out in the same order on
# every run (iterating a bare set of strings is not order-stable across sessions).
sites = sorted(set(site_of.values()))
print(len(sites), sorted(sites))

with open(sitemap, 'w') as outfile:
    outfile.write("#sample\tsite\n")
    for site in sites:
        for sid in metadata.columns:
            # Compare the derived site code for equality; a substring test
            # (`site in sid`) could assign a sample to the wrong site when one
            # code is contained in another sample name.
            if site_of[sid] == site:
                outfile.write(f"{sid}\t{site}\n")


18 ['BR', 'CN', 'DU', 'EL', 'HB', 'HN', 'KN', 'MB', 'MN', 'NK', 'PA', 'PE', 'RB', 'RN', 'SM', 'SN', 'SY', 'TB']


## Load the data into a BCI project

In [49]:
# Build the project from the three files prepared above: the cleaned ASV table,
# the FASTA file, and the sample-to-site map.
# NOTE(review): the arguments are passed positionally; confirm the order
# against the Project.Project signature.
pjt = Project.Project(cleaned_asv_table, fasta_data, sitemap)


In [None]:
# Run the project. With verbose=True the saved output shows each sample name
# followed by a list of integers.
# NOTE(review): this cell has no execution count and its saved output stops
# after the L.CN3 label; presumably the run was interrupted or still in
# progress when the notebook was saved. Confirm before relying on it.
pjt.run(verbose=True)

L.BR1
[906, 856, 822, 786, 755, 741, 726, 713, 698, 687, 669, 647, 630, 604, 578, 546, 497, 469, 438, 409, 372, 339, 299, 264, 235, 196, 173, 141, 117, 102]
L.BR2
[759, 714, 684, 655, 633, 616, 607, 596, 584, 576, 559, 543, 528, 506, 484, 459, 424, 402, 370, 342, 317, 293, 258, 228, 206, 177, 155, 132, 110, 96]
L.BR3
[764, 721, 691, 662, 635, 619, 608, 598, 586, 575, 560, 543, 528, 504, 485, 454, 413, 390, 363, 330, 306, 271, 249, 221, 187, 155, 140, 118, 98, 86]
L.CN1
[2271, 2205, 2158, 2127, 2094, 2065, 2040, 2015, 1969, 1928, 1877, 1819, 1754, 1660, 1569, 1464, 1340, 1224, 1118, 1011, 889, 788, 708, 613, 494, 437, 375, 313, 270, 213]
L.CN2
[2585, 2494, 2431, 2392, 2353, 2320, 2276, 2245, 2201, 2158, 2107, 2033, 1957, 1878, 1775, 1671, 1518, 1391, 1245, 1138, 1003, 887, 784, 674, 560, 472, 406, 354, 291, 236]
L.CN3
