## Loading and parsing the Cyprus data OTU table

The mapping tables assign each ASV to an OTU at clustering thresholds from 1% to 10% (one column per threshold, `lineage1` through `lineage10`).

The 3% clustering level is almost identical to what I got from vsearch.


In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import iBioGen

from collections import Counter
from itertools import combinations
from IPython.display import display

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Input files, three per taxon group (Acari, Coleoptera, Collembola), all
# relative to ./CyprusMetabarcoding. Roles below are judged from the file
# names only -- TODO confirm against the data provider's description:
#   *_fasta_file : ASV sequences in FASTA format
#   *_ASVs       : ASV table across the 44 locations
#   *_OTUs       : ASV mapping table (44 samples, with vouchers)
Acari_fasta_file = "./CyprusMetabarcoding/Acari_237ASVs_sequences_44locations.fasta"
Acari_ASVs = "./CyprusMetabarcoding/Acari_237ASVs_table_44locations.txt"
Acari_OTUs = "./CyprusMetabarcoding/Acari_mapping_44samples_withVouchers_237ASVs.txt"
Coleo_fasta_file = "./CyprusMetabarcoding/Coleoptera_353ASVs_sequences_44locations.fasta"
Coleo_ASVs = "./CyprusMetabarcoding/Coleoptera_353ASVs_table_44locations.txt"
Coleo_OTUs = "./CyprusMetabarcoding/Coleoptera_mapping_44samples_withVouchers_353ASVs.txt"
Collem_fasta_file = "./CyprusMetabarcoding/Collembola_317ASVs_sequences_44locations.fasta"
Collem_ASVs = "./CyprusMetabarcoding/Collembola_317ASVs_table_44locations.txt"
Collem_OTUs = "./CyprusMetabarcoding/Collembola_mapping_44samples_withVouchers_317ASVs.txt"


In [5]:
# Read the tab-separated ASV mapping table for each taxon group:
#   ac_map -> Acari, cp_map -> Coleoptera, cb_map -> Collembola.
ac_map = pd.read_csv(Acari_OTUs, sep="\t")
cp_map = pd.read_csv(Coleo_OTUs, sep="\t")
cb_map = pd.read_csv(Collem_OTUs, sep="\t")

# Preview the Collembola mapping table. The original cell referenced
# `col_map`, a name that is never assigned in this notebook, so it raised
# NameError on a fresh kernel.
cb_map

Unnamed: 0,ASV,size,ASV_size,lineage1,lineage2,lineage3,lineage4,lineage5,lineage6,lineage7,lineage8,lineage9,lineage10
0,ASVCb001ptp001,,ASVCb001ptp001,Cb1,Cb1,Cb1,Cb1,Cb1,Cb1,Cb1,Cb1,Cb1,Cb1
1,ASVCb008ptp004,,ASVCb008ptp004,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2
2,ASVcb1304,size=174,ASVcb1304;size=174,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2
3,ASVcb1315,size=172,ASVcb1315;size=172,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2
4,ASVcb1338,size=166,ASVcb1338;size=166,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2,Cb2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,ASVcb811,size=364,ASVcb811;size=364,Cb161,Cb112,Cb98,Cb82,Cb75,Cb72,Cb68,Cb62,Cb58,Cb56
313,ASVcb867,size=331,ASVcb867;size=331,Cb163,Cb114,Cb6,Cb6,Cb6,Cb6,Cb6,Cb3,Cb3,Cb3
314,ASVcb895,size=319,ASVcb895;size=319,Cb164,Cb115,Cb100,Cb84,Cb77,Cb74,Cb70,Cb64,Cb60,Cb58
315,ASVcb939,size=294,ASVcb939;size=294,Cb165,Cb116,Cb101,Cb85,Cb78,Cb75,Cb71,Cb65,Cb61,Cb59


### My vsearch 3% clustering and Victor's 3% clustering are basically identical

Number of OTUs from my vsearch 3% clustering (Acari, Coleoptera, Collembola):

    140
    154
    93

In [6]:
# Number of distinct OTU labels in the "lineage3" column of each mapping
# table, printed one per line in the order Acari, Coleoptera, Collembola.
for taxon_map in (ac_map, cp_map, cb_map):
    distinct_otus = set(taxon_map["lineage3"])
    print(len(distinct_otus))


139
154
93


## Numbers of OTUs at different clustering thresholds

In [11]:
# One output row per mapping table (Acari, Coleoptera, Collembola): the
# number of rows in the table, then the number of distinct labels in each of
# the columns lineage1 .. lineage10.
for taxon_map in (ac_map, cp_map, cb_map):
    per_threshold = [
        len(set(taxon_map[f"lineage{pct}"])) for pct in range(1, 11)
    ]
    # Each count is followed by two spaces, including the last one, so the
    # printed row matches the original cell's output exactly.
    row = f"{len(taxon_map)}: " + "".join(f"{n_otus}  " for n_otus in per_threshold)
    print(row)

237: 174  153  139  132  129  128  126  125  121  119  
353: 185  160  154  150  146  145  141  141  139  138  
317: 153  105  93  79  72  69  66  62  58  57  
