In [1]:
import pandas as pd
import numpy as np

In [2]:
# Each pair is (raw sample identifier, short column label). zip() transposes
# the pairs so that `samples` receives every first element and `newcol` every
# second element, in the order listed here.
samples, newcol = (list(column) for column in zip(
    ('insitu_6_2', 'insitu1'),
    ('insitu_8_1', 'insitu2'),
    ('insitu_9_1', 'insitu3'),
    ('insitu_10_1', 'insitu4'),
    ('insitu_11_1', 'insitu5'),
    ('insitu_12_1', 'insitu6'),
    ('onboard_1_2', 'onboard1'),
    ('onboard_2_2', 'onboard2'),
    ('onboard_7_1', 'onboard3'),
    ('onboard_8_1', 'onboard4'),
    ('onboard_9_1', 'onboard5'),
    ('onboard_10_1', 'onboard6'),
    ('onboard_11_1', 'onboard7'),
))

In [3]:
def Calculate_abundance_and_count(sample_names=None, column_names=None, data_dir='Data'):
    """Build per-sample count and relative-abundance tables keyed by subject_id.

    For each sample the tab-separated file ``<data_dir>/<sample>_chosen`` is
    read (first row is the header; only the columns at positions 1 and 2 are
    loaded, one of which must be named ``subject_id``). Rows are counted per
    ``subject_id``; the relative abundance is that count divided by the total
    number of rows read from the sample's file.

    Parameters
    ----------
    sample_names : sequence of str, optional
        Sample identifiers used to build the file names. Defaults to the
        module-level ``samples`` list.
    column_names : sequence of str, optional
        Output column label for each sample, in the same order. Defaults to
        the module-level ``newcol`` list.
    data_dir : str, optional
        Directory that holds the ``*_chosen`` files (default ``'Data'``).

    Returns
    -------
    (dfcount, dfabundance) : tuple of pandas.DataFrame
        One row per subject_id seen in any sample and one column per sample.
        A subject_id absent from a sample is NaN in that sample's column.

    Raises
    ------
    ValueError
        If ``sample_names`` and ``column_names`` differ in length. This is
        checked before any file is read.
    """
    if sample_names is None:
        sample_names = samples
    if column_names is None:
        column_names = newcol
    sample_names = list(sample_names)
    column_names = list(column_names)
    if len(sample_names) != len(column_names):
        raise ValueError('got {} sample names but {} column names'.format(
            len(sample_names), len(column_names)))

    count_columns = []
    abundance_columns = []
    for sample in sample_names:
        path = '{}/{}_chosen'.format(data_dir, sample)
        dfsample = pd.read_csv(path, header=0, usecols=[1, 2], sep="\t")

        # size() counts rows per subject_id directly, so the other loaded
        # column is never aggregated.
        count = dfsample.groupby('subject_id').size()
        count_columns.append(count)
        abundance_columns.append(count / len(dfsample))    # relative abundance

    if not count_columns:
        # pd.concat rejects an empty list; return empty tables instead.
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once after the loop rather than growing a frame per sample.
    dfcount = pd.concat(count_columns, axis=1, sort=False, join='outer')
    dfabundance = pd.concat(abundance_columns, axis=1, sort=False, join='outer')

    dfcount.columns = column_names
    dfabundance.columns = column_names

    # Leave the index unnamed so the first column of a CSV written from these
    # tables keeps an empty header.
    dfcount.index.name = None
    dfabundance.index.name = None

    return dfcount, dfabundance


In [4]:
# Build both tables, then print the row count of each on its own line.
dfcount, dfabundance = Calculate_abundance_and_count()
print(len(dfcount), len(dfabundance), sep='\n')

1198
1198


In [5]:
def Add_taxonomy(df, taxmap_path="/nfs_share/motoki/metatra/slv_DB/taxmap_slv_ssu_123.txt"):
    """Append ``taxonomy`` and ``taxonomy2`` columns to a subject_id-indexed table.

    The taxonomy map is read as a tab-separated file with no header row and
    exactly five columns, which are named id, s_start, s_end, taxonomy and
    taxonomy2. Each row's lookup key is built as ``"<id>.<s_start>.<s_end>"``
    and matched against ``df.index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds keys of the form ``"<id>.<s_start>.<s_end>"``.
    taxmap_path : str, optional
        Location of the taxonomy map file. Defaults to the path this notebook
        was originally run against.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` in their original order and the
        two taxonomy columns appended. Keys missing from the map get NaN.
        ``df`` itself is not modified.
    """
    taxmap = pd.read_csv(taxmap_path, header=None, sep='\t')
    taxmap.columns = ["id", "s_start", "s_end", "taxonomy", "taxonomy2"]

    subject_id = (taxmap["id"] + '.'
                  + taxmap["s_start"].astype(str) + '.'
                  + taxmap["s_end"].astype(str))
    taxonomy = taxmap[["taxonomy", "taxonomy2"]].set_index(subject_id)

    # pd.concat(..., join_axes=[df.index]) was removed in pandas 1.0. Aligning
    # the taxonomy table to df.index first gives the same left-aligned result.
    # Note: reindex raises ValueError if the map contains duplicate keys.
    dftaxonomy = pd.concat([df, taxonomy.reindex(df.index)], axis=1)

    return dftaxonomy

In [6]:
# Attach the taxonomy columns to each table; every call reads the taxonomy map again.
dfcount_taxa = Add_taxonomy(df=dfcount)
dfabundance_taxa = Add_taxonomy(df=dfabundance)

In [7]:
# Write both annotated tables to disk as tab-separated text.
dfcount_taxa.to_csv(path_or_buf='Data/count', sep='\t')
dfabundance_taxa.to_csv(path_or_buf='Data/abundance', sep='\t')

In [8]:
# Preview the first five rows of the annotated count table.
dfcount_taxa.head(n=5)

Unnamed: 0,insitu1,insitu2,insitu3,insitu4,insitu5,insitu6,onboard1,onboard2,onboard3,onboard4,onboard5,onboard6,onboard7,taxonomy,taxonomy2
AB239761.1.1412,4.0,3.0,1.0,3.0,4.0,1.0,3.0,8.0,4.0,4.0,12.0,,3.0,Bacteria;Proteobacteria;Gammaproteobacteria;Th...,uncultured bacterium
AB239762.1.1382,2.0,1.0,4.0,4.0,1.0,2.0,,,2.0,1.0,1.0,,2.0,Bacteria;Bacteroidetes;Flavobacteriia;Flavobac...,uncultured bacterium
AB278150.1.1475,1.0,,3.0,2.0,1.0,,1.0,,,,5.0,1.0,1.0,Bacteria;Proteobacteria;Epsilonproteobacteria;...,uncultured bacterium
AB440165.1.1431,29.0,169.0,191.0,190.0,103.0,165.0,28.0,27.0,49.0,41.0,80.0,16.0,27.0,Bacteria;Proteobacteria;Epsilonproteobacteria;...,uncultured bacterium
AB440166.1.1430,1.0,2.0,4.0,2.0,2.0,2.0,2.0,3.0,,4.0,,,1.0,Bacteria;Proteobacteria;Epsilonproteobacteria;...,uncultured bacterium


In [9]:
# Preview the first five rows of the annotated relative-abundance table.
dfabundance_taxa.head(n=5)

Unnamed: 0,insitu1,insitu2,insitu3,insitu4,insitu5,insitu6,onboard1,onboard2,onboard3,onboard4,onboard5,onboard6,onboard7,taxonomy,taxonomy2
AB239761.1.1412,0.000519,0.000109,3.9e-05,7.8e-05,0.000148,3.9e-05,0.000183,0.000523,0.000105,0.000169,0.000203,,0.000178,Bacteria;Proteobacteria;Gammaproteobacteria;Th...,uncultured bacterium
AB239762.1.1382,0.000259,3.6e-05,0.000157,0.000105,3.7e-05,7.9e-05,,,5.2e-05,4.2e-05,1.7e-05,,0.000118,Bacteria;Bacteroidetes;Flavobacteriia;Flavobac...,uncultured bacterium
AB278150.1.1475,0.00013,,0.000118,5.2e-05,3.7e-05,,6.1e-05,,,,8.5e-05,6.2e-05,5.9e-05,Bacteria;Proteobacteria;Epsilonproteobacteria;...,uncultured bacterium
AB440165.1.1431,0.003759,0.006115,0.007495,0.004965,0.003813,0.00649,0.00171,0.001764,0.001281,0.001735,0.001356,0.000997,0.001598,Bacteria;Proteobacteria;Epsilonproteobacteria;...,uncultured bacterium
AB440166.1.1430,0.00013,7.2e-05,0.000157,5.2e-05,7.4e-05,7.9e-05,0.000122,0.000196,,0.000169,,,5.9e-05,Bacteria;Proteobacteria;Epsilonproteobacteria;...,uncultured bacterium
