Calculate the number of cells per ml of each GEN (genus) for the 20171102, 20181030, 20190402, and 20190709 metagenome timepoints. Starting from the number of reads recruited to each GEN's CDS regions, use the CDS fraction to estimate the number of reads expected for the whole genome, calculate abundance, normalize the abundance by genome size, and recalculate the relative abundance. Finally, multiply by the total number of cells per ml to get the number of cells per ml for each GEN.

### Inputs
/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/Summary_files/All_GoM_SAGs_1cell_20kb_decon_531normalized_predresp_rate_GTDBclass.csv
/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GORG_recruitment/summaries/ALL_*_contf_pe_bbmerge_reads_annotated_read_count_by_genus.csv
/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/Summary_files/GTDB_GoM_SAG_classification_key.csv
### Outputs
/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GoM_Metagenome_analysis/DNA_metagenome_calculated_Genus_abundance.csv
/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GoM_Metagenome_analysis/GoM_metagenome_calculated_Genus_cells_per_ml.csv


In [80]:
import pandas as pd
from pandas import DataFrame
import os
import sys
import csv
from pathlib import Path
import seaborn as sns
sns.set()
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
plt.show()
from collections import defaultdict
import os.path as op
import glob

# Set the session's working directory. Every read_csv/to_csv call visible in this
# notebook uses an absolute path, so none of the visible cells rely on this directory.
os.chdir('/mnt/scgc/simon/microg2p/analyses/GORG_recruitment/210121_summary_tables')
# Display the matplotlib version used for this run.
matplotlib.__version__

'3.3.3'

#### I need to think a little bit about how to adjust for the number of reads that are shared among multiple genera. Right now I am thinking that the best method is going to be to:
1. calculate the total number of reads that have been assigned
2. calculate the number of exclusive reads (reads assigned to only a single genus)
3. calculate the proportion of reads that were exclusively assigned
4. use that proportion of reads to adjust down the "total" number of reads per metagenome.

In [81]:
# Build the per-SAG table used to derive the average genome size and CDS fraction per GEN.
fields = [
    'name',
    'final_assembly_length',
    'checkM_estimated_completeness',
    'GTDB_classification',
    'hypothetical_cds_fraction',
    'total_CDS_count',
    'average_cds_length',
]
SAG_data = (
    pd.read_csv(
        '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/Summary_files/All_GoM_SAGs_1cell_20kb_decon_531normalized_predresp_rate_GTDBclass.csv',
        usecols=fields,
    )
    .assign(
        # Total coding length of the assembly = number of CDS x mean CDS length.
        CDS_length=lambda sags: sags['total_CDS_count'] * sags['average_cds_length'],
        # SAGs without a GTDB classification are pooled under 'Unclassified'.
        GTDB_classification=lambda sags: sags['GTDB_classification'].fillna('Unclassified'),
    )
    .rename(columns={'GTDB_classification': 'genus'})
    .assign(
        CDS_fraction=lambda sags: sags['CDS_length'] / sags['final_assembly_length'],
        # A completeness of 0 cannot be used to scale the assembly, so treat it as missing.
        checkM_estimated_completeness=lambda sags: sags['checkM_estimated_completeness'].replace(0, np.nan),
    )
    # Keep only SAGs that have a usable completeness estimate.
    # (A further filter on the 'group72.5' column was previously tried here and is disabled.)
    .dropna(subset=['checkM_estimated_completeness'])
    .assign(
        # Scale the assembly length up to a full genome using the completeness percentage.
        Estimated_genome_size=lambda sags: sags['final_assembly_length'] * (100 / sags['checkM_estimated_completeness']),
        Estimated_total_CDS_len=lambda sags: sags['CDS_fraction'] * sags['Estimated_genome_size'],
        # Each SAG represents one cell.
        Cell_count=1,
    )
)

SAG_data.head(6)

Unnamed: 0,name,final_assembly_length,checkM_estimated_completeness,total_CDS_count,hypothetical_cds_fraction,average_cds_length,genus,CDS_length,CDS_fraction,Estimated_genome_size,Estimated_total_CDS_len,Cell_count
0,AH-135-A01,378833,22.64,390.0,0.241026,865.892308,Pelagibacter,337698.0,0.891417,1673291.0,1491599.0,1
1,AH-135-A02,1800737,86.23,1622.0,0.332306,1016.482737,Hel1-33-131,1648735.0,0.915589,2088295.0,1912020.0,1
2,AH-135-A03,856845,74.13,919.0,0.228509,863.133841,IMCC9063,793220.0,0.925745,1155868.0,1070039.0,1
3,AH-135-A04,1481697,83.69,1467.0,0.194274,925.822086,Thioglobus,1358181.0,0.916639,1770459.0,1622871.0,1
4,AH-135-A05,382026,19.18,400.0,0.275,840.0125,SCGC-AAA076-P13,336005.0,0.879534,1991794.0,1751851.0,1
5,AH-135-A06,1127973,48.49,1113.0,0.253369,919.765499,Unclassified,1023699.0,0.907556,2326197.0,2111155.0,1


In [82]:

# Per-genus means, laid out with one column per genus and one row per metric.
# aggfunc='mean' is the string form of the np.mean aggregation used previously
# (same result, without the FutureWarning newer pandas emits for NumPy callables).
CDS_frac = pd.pivot_table(SAG_data, values='CDS_fraction',
                          columns='genus', aggfunc='mean')
genome_size = pd.pivot_table(SAG_data, values='Estimated_genome_size',
                             columns='genus', aggfunc='mean')

# Stack the two one-row tables. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent and keeps the same row/column order.
# Genera present only in `genome_size` get NaN in the 'CDS_fraction' row.
CDS_frac = pd.concat([CDS_frac, genome_size])

# Mean coding length per cell = mean estimated genome size x mean CDS fraction.
CDS_frac.loc['CDS_Length_per_cell'] = CDS_frac.loc['Estimated_genome_size'] * CDS_frac.loc['CDS_fraction']

CDS_frac

# There are 7 GENs that only have SAGs with an estimated completeness of 0. I want to try to double-check this,
# but all I do is drop rows where the estimated completeness is 0, and I lose 7 GENs.
# GENs 149, 144

Unnamed: 0,2-01-FULL-41-14,AAA164-E04,AAA536-G10,AG-337-I02,AG-339-G14,AG-414-E02,AG-422-B15,ASP10-02a,Akkermansiaceae,Algibacter_B,...,Vicingaceae,Winogradskyella,Yoonia,Crocinitomix,HIMB11,Psychromonas,Sulfurimonas,Sulfurospirillum_A,UBA4311,ZODW24
CDS_fraction,0.9033395,0.8584001,0.8961937,0.9100622,0.9219721,0.9180559,0.9167689,0.8663398,0.8698882,0.8618457,...,0.8814981,0.8688375,0.8583902,,,,,,,
Estimated_genome_size,1254385.0,5870149.0,2697551.0,2095484.0,1781418.0,1513866.0,1519782.0,3164268.0,4139108.0,4364427.0,...,2516277.0,3568247.0,4369587.0,3235959.0,3604166.0,3887524.0,2043172.0,2911366.0,1069556.0,3014321.0
CDS_Length_per_cell,1133136.0,5038937.0,2417528.0,1907021.0,1642418.0,1389814.0,1393289.0,2741331.0,3600562.0,3761462.0,...,2218093.0,3100227.0,3750811.0,,,,,,,


In [83]:
# Flip the metric-by-genus table so that each genus becomes a row, expose the genus
# as a regular string column, and order the rows alphabetically by genus.
CDS_frac_Tran = (
    CDS_frac.transpose()
    .rename_axis('genus')
    .reset_index()
    .astype({'genus': str})
    .sort_values(by=['genus'])
)
CDS_frac_Tran.dtypes

genus                     object
CDS_fraction             float64
Estimated_genome_size    float64
CDS_Length_per_cell      float64
dtype: object

In [84]:
# Total number of reads recruited per metagenome; the numeric suffix of each
# variable matches the sample label used in the recruitment summary file names.
# These were calculated by counting the number of lines that begin with "C" in the gorg out files.
recruit190709=43267551
recruit190402=46925864
recruit181030=45815574
recruit171102=39446211

In [85]:
#
# Load the per-genus read counts for the 20171102 metagenome.
DNA_df171102 = pd.read_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GORG_recruitment/summaries/ALL_20171102_contf_pe_bbmerge_reads_annotated_read_count_by_genus.csv'
)

# Sum the exclusively-assigned reads per genus and label the column with the sample.
DNA_df171102_pivot = (
    DNA_df171102
    .pivot_table(index='genus', values='exclusive', aggfunc=np.sum)
    .rename(columns={'exclusive': '171102_exclusive_reads'})
    .reset_index()
)

# Fraction of all recruited reads that were assigned to only a single genus.
exclusive_frac1711 = DNA_df171102['exclusive'].sum() / recruit171102

# Outer merge: keep genera that appear in either the SAG-derived table or the metagenome.
abundance_df = pd.merge(CDS_frac_Tran, DNA_df171102_pivot, on='genus', how='outer')
abundance_df

Unnamed: 0,genus,CDS_fraction,Estimated_genome_size,CDS_Length_per_cell,171102_exclusive_reads
0,2-01-FULL-41-14,0.903340,1.254385e+06,1.133136e+06,
1,AAA164-E04,0.858400,5.870149e+06,5.038937e+06,42293.0
2,AAA536-G10,0.896194,2.697551e+06,2.417528e+06,420970.0
3,AG-337-I02,0.910062,2.095484e+06,1.907021e+06,42331.0
4,AG-339-G14,0.921972,1.781418e+06,1.642418e+06,
...,...,...,...,...,...
297,Vibrio,0.881286,4.803806e+06,4.233525e+06,26254.0
298,Vicingaceae,0.881498,2.516277e+06,2.218093e+06,17297.0
299,Winogradskyella,0.868838,3.568247e+06,3.100227e+06,143214.0
300,Yoonia,0.858390,4.369587e+06,3.750811e+06,24396.0


In [86]:
# Load the per-genus read counts for the 20181030 metagenome.
DNA_df181030 = pd.read_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GORG_recruitment/summaries/ALL_20181030_contf_pe_bbmerge_reads_annotated_read_count_by_genus.csv'
)

# Sum the exclusively-assigned reads per genus and label the column with the sample.
DNA_df181030_pivot = (
    DNA_df181030
    .pivot_table(index='genus', values='exclusive', aggfunc=np.sum)
    .rename(columns={'exclusive': '181030_exclusive_reads'})
    .reset_index()
)

# Fraction of all recruited reads that were assigned to only a single genus.
exclusive_frac1810 = DNA_df181030['exclusive'].sum() / recruit181030

# Add this sample's counts to the running table, keeping genera seen on either side.
abundance_df = pd.merge(abundance_df, DNA_df181030_pivot, on='genus', how='outer')

In [87]:
# Load the per-genus read counts for the 20190402 metagenome.
DNA_df190402 = pd.read_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GORG_recruitment/summaries/ALL_20190402_contf_pe_bbmerge_reads_annotated_read_count_by_genus.csv'
)

# Sum the exclusively-assigned reads per genus and label the column with the sample.
DNA_df190402_pivot = (
    DNA_df190402
    .pivot_table(index='genus', values='exclusive', aggfunc=np.sum)
    .rename(columns={'exclusive': '190402_exclusive_reads'})
    .reset_index()
)

# Fraction of all recruited reads that were assigned to only a single genus.
exclusive_frac1904 = DNA_df190402['exclusive'].sum() / recruit190402

# Add this sample's counts to the running table, keeping genera seen on either side.
abundance_df = pd.merge(abundance_df, DNA_df190402_pivot, on='genus', how='outer')

In [88]:
# Load the per-genus read counts for the 20190709 metagenome.
DNA_df190709 = pd.read_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GORG_recruitment/summaries/ALL_20190709_contf_pe_bbmerge_reads_annotated_read_count_by_genus.csv'
)

# Sum the exclusively-assigned reads per genus and label the column with the sample.
DNA_df190709_pivot = (
    DNA_df190709
    .pivot_table(index='genus', values='exclusive', aggfunc=np.sum)
    .rename(columns={'exclusive': '190709_exclusive_reads'})
    .reset_index()
)

# Fraction of all recruited reads that were assigned to only a single genus.
exclusive_frac1907 = DNA_df190709['exclusive'].sum() / recruit190709

# Add this sample's counts to the running table, keeping genera seen on either side.
abundance_df = pd.merge(abundance_df, DNA_df190709_pivot, on='genus', how='outer')

# Discard genera that recruited no exclusive reads in any of the four metagenomes.
exclusive_read_columns = [
    f'{sample}_exclusive_reads'
    for sample in ('171102', '181030', '190402', '190709')
]
abundance_df = abundance_df.dropna(subset=exclusive_read_columns, how='all')
abundance_df.tail(10)

Unnamed: 0,genus,CDS_fraction,Estimated_genome_size,CDS_Length_per_cell,171102_exclusive_reads,181030_exclusive_reads,190402_exclusive_reads,190709_exclusive_reads
288,UBA952,0.894483,3414891.0,3054563.0,1187566.0,538232.0,73407.0,515683.0
291,UBA985,0.86332,3725848.0,3216600.0,25262.0,25877.0,17817.0,54322.0
292,UBA9926,0.842537,3669067.0,3091324.0,514528.0,436862.0,211771.0,565066.0
294,Ulvibacter,0.87309,2830388.0,2471184.0,23933.0,21301.0,63241.0,97607.0
295,Unclassified,0.844824,6885372.0,5816928.0,683055.0,785201.0,738683.0,1107366.0
296,Verrucomicrobiales,0.848688,5093903.0,4323135.0,11456.0,26618.0,2745.0,7417.0
297,Vibrio,0.881286,4803806.0,4233525.0,26254.0,40702.0,26483.0,24951.0
298,Vicingaceae,0.881498,2516277.0,2218093.0,17297.0,16154.0,185896.0,68880.0
299,Winogradskyella,0.868838,3568247.0,3100227.0,143214.0,96104.0,385347.0,203049.0
300,Yoonia,0.85839,4369587.0,3750811.0,24396.0,28712.0,568691.0,27499.0


In [89]:
# 
# Use the reads recruited to CDS regions to extrapolate the number of reads per genome:
# dividing by the CDS fraction scales the CDS-only count up to the whole genome.
# Open question from the original analysis: how should abundance be derived here given
# that a size-based normalization has already been applied? The per-genome read counts
# are normalized by genome size in a later step.
for short_label, full_label in (
    ('171102', '20171102'),
    ('181030', '20181030'),
    ('190402', '20190402'),
    ('190709', '20190709'),
):
    abundance_df[f'extrapolated_reads_per_genome_{full_label}'] = (
        abundance_df[f'{short_label}_exclusive_reads'] / abundance_df['CDS_fraction']
    )

# The raw read totals below come from a seqkit stats count of the number of reads in the
# /mnt/scgc/simon/microg2p/Data/DNA_seq/merged_DNA_seq_runs_201014_201029/*.fastq.gz files.
# Each total is scaled by the fraction of recruited reads that were assigned exclusively
# to one genus, so that it is proportional to the exclusive read counts used above.
reads20171102 = 68439965 * exclusive_frac1711
reads20181030 = 76515787 * exclusive_frac1810
reads20190402 = 73902569 * exclusive_frac1904
reads20190709 = 70871633 * exclusive_frac1907

abundance_df.tail(10)

Unnamed: 0,genus,CDS_fraction,Estimated_genome_size,CDS_Length_per_cell,171102_exclusive_reads,181030_exclusive_reads,190402_exclusive_reads,190709_exclusive_reads,extrapolated_reads_per_genome_20171102,extrapolated_reads_per_genome_20181030,extrapolated_reads_per_genome_20190402,extrapolated_reads_per_genome_20190709
288,UBA952,0.894483,3414891.0,3054563.0,1187566.0,538232.0,73407.0,515683.0,1327656.0,601723.923003,82066.372895,576515.0
291,UBA985,0.86332,3725848.0,3216600.0,25262.0,25877.0,17817.0,54322.0,29261.45,29973.815202,20637.765794,62922.19
292,UBA9926,0.842537,3669067.0,3091324.0,514528.0,436862.0,211771.0,565066.0,610689.0,518507.833612,251349.218819,670672.1
294,Ulvibacter,0.87309,2830388.0,2471184.0,23933.0,21301.0,63241.0,97607.0,27411.84,24397.258287,72433.548253,111794.9
295,Unclassified,0.844824,6885372.0,5816928.0,683055.0,785201.0,738683.0,1107366.0,808517.4,929425.424121,874363.074634,1310765.0
296,Verrucomicrobiales,0.848688,5093903.0,4323135.0,11456.0,26618.0,2745.0,7417.0,13498.48,31363.701719,3234.403833,8739.371
297,Vibrio,0.881286,4803806.0,4233525.0,26254.0,40702.0,26483.0,24951.0,29790.57,46184.80303,30050.418619,28312.05
298,Vicingaceae,0.881498,2516277.0,2218093.0,17297.0,16154.0,185896.0,68880.0,19622.28,18325.620549,210886.440358,78139.7
299,Winogradskyella,0.868838,3568247.0,3100227.0,143214.0,96104.0,385347.0,203049.0,164834.0,110612.163474,443520.200598,233701.9
300,Yoonia,0.85839,4369587.0,3750811.0,24396.0,28712.0,568691.0,27499.0,28420.64,33448.656931,662508.71268,32035.55


In [90]:
# Add the rest of the unrecruited reads to the 'Unclassified' genus.
#
# The row is located by genus name rather than by a hard-coded index label, so the
# adjustment cannot silently land on a different genus if the inputs (and therefore
# the row labels) change.
unclassified_labels = abundance_df.index[abundance_df['genus'] == 'Unclassified']
if len(unclassified_labels) != 1:
    raise ValueError(
        "Expected exactly one 'Unclassified' row in abundance_df, found %d" % len(unclassified_labels)
    )
unclassified_label = unclassified_labels[0]

# NOTE(review): reads<date> was scaled by the exclusive-read fraction, whereas recruit<date>
# is the unscaled count of recruited reads -- confirm that their difference is the intended
# estimate of unrecruited reads.
for short_label, full_label, adjusted_total_reads, recruited_reads in (
    ('171102', '20171102', reads20171102, recruit171102),
    ('181030', '20181030', reads20181030, recruit181030),
    ('190402', '20190402', reads20190402, recruit190402),
    ('190709', '20190709', reads20190709, recruit190709),
):
    # Recompute the base value from the exclusive reads instead of incrementing the stored
    # value, so that re-running this cell does not add the unrecruited reads a second time.
    base_extrapolated_reads = (
        abundance_df.at[unclassified_label, f'{short_label}_exclusive_reads']
        / abundance_df.at[unclassified_label, 'CDS_fraction']
    )
    abundance_df.at[unclassified_label, f'extrapolated_reads_per_genome_{full_label}'] = (
        base_extrapolated_reads + (adjusted_total_reads - recruited_reads)
    )

abundance_df.tail(10)

Unnamed: 0,genus,CDS_fraction,Estimated_genome_size,CDS_Length_per_cell,171102_exclusive_reads,181030_exclusive_reads,190402_exclusive_reads,190709_exclusive_reads,extrapolated_reads_per_genome_20171102,extrapolated_reads_per_genome_20181030,extrapolated_reads_per_genome_20190402,extrapolated_reads_per_genome_20190709
288,UBA952,0.894483,3414891.0,3054563.0,1187566.0,538232.0,73407.0,515683.0,1327656.0,601723.9,82066.37,576515.0
291,UBA985,0.86332,3725848.0,3216600.0,25262.0,25877.0,17817.0,54322.0,29261.45,29973.82,20637.77,62922.19
292,UBA9926,0.842537,3669067.0,3091324.0,514528.0,436862.0,211771.0,565066.0,610689.0,518507.8,251349.2,670672.1
294,Ulvibacter,0.87309,2830388.0,2471184.0,23933.0,21301.0,63241.0,97607.0,27411.84,24397.26,72433.55,111794.9
295,Unclassified,0.844824,6885372.0,5816928.0,683055.0,785201.0,738683.0,1107366.0,15847610.0,20754610.0,13228160.0,15111570.0
296,Verrucomicrobiales,0.848688,5093903.0,4323135.0,11456.0,26618.0,2745.0,7417.0,13498.48,31363.7,3234.404,8739.371
297,Vibrio,0.881286,4803806.0,4233525.0,26254.0,40702.0,26483.0,24951.0,29790.57,46184.8,30050.42,28312.05
298,Vicingaceae,0.881498,2516277.0,2218093.0,17297.0,16154.0,185896.0,68880.0,19622.28,18325.62,210886.4,78139.7
299,Winogradskyella,0.868838,3568247.0,3100227.0,143214.0,96104.0,385347.0,203049.0,164834.0,110612.2,443520.2,233701.9
300,Yoonia,0.85839,4369587.0,3750811.0,24396.0,28712.0,568691.0,27499.0,28420.64,33448.66,662508.7,32035.55


In [91]:
# For each metagenome:
#   1. express the extrapolated reads of each genus as a share of the adjusted read total,
#   2. divide that share by the estimated genome size of the genus,
#   3. rescale the size-normalized values so that they sum to 1 across genera.
for sample_date, adjusted_total_reads in (
    ('20171102', reads20171102),
    ('20181030', reads20181030),
    ('20190402', reads20190402),
    ('20190709', reads20190709),
):
    read_share_col = f'total_genome_size_abundance_{sample_date}'
    size_normalized_col = f'genome_size_normalized_abundance_{sample_date}'
    relative_abundance_col = f'{sample_date}_genome_size_normalized_metagenome_abundance'

    abundance_df[read_share_col] = (
        abundance_df[f'extrapolated_reads_per_genome_{sample_date}'] / adjusted_total_reads
    )
    abundance_df[size_normalized_col] = (
        abundance_df[read_share_col] / abundance_df['Estimated_genome_size']
    )
    abundance_df[relative_abundance_col] = (
        abundance_df[size_normalized_col] / abundance_df[size_normalized_col].sum()
    )

# Persist the full abundance table (all intermediate columns included).
abundance_df.to_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GoM_Metagenome_analysis/DNA_metagenome_calculated_Genus_abundance.csv',
    index=False,
)
abundance_df

Unnamed: 0,genus,CDS_fraction,Estimated_genome_size,CDS_Length_per_cell,171102_exclusive_reads,181030_exclusive_reads,190402_exclusive_reads,190709_exclusive_reads,extrapolated_reads_per_genome_20171102,extrapolated_reads_per_genome_20181030,...,20171102_genome_size_normalized_metagenome_abundance,total_genome_size_abundance_20181030,genome_size_normalized_abundance_20181030,20181030_genome_size_normalized_metagenome_abundance,total_genome_size_abundance_20190402,genome_size_normalized_abundance_20190402,20190402_genome_size_normalized_metagenome_abundance,total_genome_size_abundance_20190709,genome_size_normalized_abundance_20190709,20190709_genome_size_normalized_metagenome_abundance
1,AAA164-E04,0.858400,5.870149e+06,5.038937e+06,42293.0,149588.0,21811.0,26397.0,49269.565093,174263.724566,...,0.000543,0.002655,4.522559e-10,0.001322,0.000429,7.301818e-11,0.000225,0.000539,9.179528e-11,0.000294
2,AAA536-G10,0.896194,2.697551e+06,2.417528e+06,420970.0,633456.0,408075.0,261677.0,469731.050430,706829.351929,...,0.011273,0.010768,3.991824e-09,0.011666,0.007681,2.847494e-09,0.008782,0.005116,1.896700e-09,0.006069
3,AG-337-I02,0.910062,2.095484e+06,1.907021e+06,42331.0,291185.0,24033.0,11680.0,46514.402078,319961.639674,...,0.001437,0.004874,2.326162e-09,0.006798,0.000445,2.125920e-10,0.000656,0.000225,1.073228e-10,0.000343
5,AG-414-E02,0.918056,1.513866e+06,1.389814e+06,1734.0,9446.0,6534.0,1089.0,1888.773891,10289.133897,...,0.000081,0.000157,1.035423e-10,0.000303,0.000120,7.930796e-11,0.000245,0.000021,1.373017e-11,0.000044
6,AG-422-B15,0.916769,1.519782e+06,1.393289e+06,26114.0,71506.0,223826.0,57327.0,28484.824253,77997.849546,...,0.001213,0.001188,7.818576e-10,0.002285,0.004119,2.709965e-09,0.008358,0.001096,7.209791e-10,0.002307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Verrucomicrobiales,0.848688,5.093903e+06,4.323135e+06,11456.0,26618.0,2745.0,7417.0,13498.480986,31363.701719,...,0.000172,0.000478,9.380004e-11,0.000274,0.000055,1.071119e-11,0.000033,0.000153,3.006313e-11,0.000096
297,Vibrio,0.881286,4.803806e+06,4.233525e+06,26254.0,40702.0,26483.0,24951.0,29790.570949,46184.803030,...,0.000401,0.000704,1.464671e-10,0.000428,0.000507,1.055260e-10,0.000325,0.000496,1.032739e-10,0.000330
298,Vicingaceae,0.881498,2.516277e+06,2.218093e+06,17297.0,16154.0,185896.0,68880.0,19622.276751,18325.620549,...,0.000505,0.000279,1.109498e-10,0.000324,0.003557,1.413789e-09,0.004361,0.001369,5.441492e-10,0.001741
299,Winogradskyella,0.868838,3.568247e+06,3.100227e+06,143214.0,96104.0,385347.0,203049.0,164834.037915,110612.163474,...,0.002990,0.001685,4.722526e-10,0.001380,0.007482,2.096779e-09,0.006467,0.004095,1.147657e-09,0.003672


In [92]:
# Keep only the genus name and the final relative-abundance column of each metagenome.
abundance = ['genus'] + [
    f'{sample_date}_genome_size_normalized_metagenome_abundance'
    for sample_date in ('20171102', '20181030', '20190402', '20190709')
]
# astype returns a new frame, so df_abundance is independent of abundance_df.
df_abundance = abundance_df.loc[:, abundance].astype({'genus': str})
df_abundance

Unnamed: 0,genus,20171102_genome_size_normalized_metagenome_abundance,20181030_genome_size_normalized_metagenome_abundance,20190402_genome_size_normalized_metagenome_abundance,20190709_genome_size_normalized_metagenome_abundance
1,AAA164-E04,0.000543,0.001322,0.000225,0.000294
2,AAA536-G10,0.011273,0.011666,0.008782,0.006069
3,AG-337-I02,0.001437,0.006798,0.000656,0.000343
5,AG-414-E02,0.000081,0.000303,0.000245,0.000044
6,AG-422-B15,0.001213,0.002285,0.008358,0.002307
...,...,...,...,...,...
296,Verrucomicrobiales,0.000172,0.000274,0.000033,0.000096
297,Vibrio,0.000401,0.000428,0.000325,0.000330
298,Vicingaceae,0.000505,0.000324,0.004361,0.001741
299,Winogradskyella,0.002990,0.001380,0.006467,0.003672


In [93]:
# Multiply each metagenome's relative abundances by that sample's total cells per ml
# to obtain the number of cells per ml for each genus.
total_cells_per_ml = {
    '20171102': 1.55E+06,
    '20181030': 1.45E+06,
    '20190402': 1.64E+06,
    '20190709': 2.92E+06,
}
for sample_date, cell_density in total_cells_per_ml.items():
    df_abundance[f'{sample_date}_genome_size_normalized_metagenome_cells/ml'] = (
        df_abundance[f'{sample_date}_genome_size_normalized_metagenome_abundance'] * cell_density
    )

df_abundance

Unnamed: 0,genus,20171102_genome_size_normalized_metagenome_abundance,20181030_genome_size_normalized_metagenome_abundance,20190402_genome_size_normalized_metagenome_abundance,20190709_genome_size_normalized_metagenome_abundance,20171102_genome_size_normalized_metagenome_cells/ml,20181030_genome_size_normalized_metagenome_cells/ml,20190402_genome_size_normalized_metagenome_cells/ml,20190709_genome_size_normalized_metagenome_cells/ml
1,AAA164-E04,0.000543,0.001322,0.000225,0.000294,842.193966,1916.455808,369.342772,857.676906
2,AAA536-G10,0.011273,0.011666,0.008782,0.006069,17472.787073,16915.541760,14403.280557,17721.566151
3,AG-337-I02,0.001437,0.006798,0.000656,0.000343,2227.335509,9857.221878,1075.339238,1002.756455
5,AG-414-E02,0.000081,0.000303,0.000245,0.000044,125.191537,438.765267,401.157950,128.286002
6,AG-422-B15,0.001213,0.002285,0.008358,0.002307,1880.679178,3313.158567,13707.625915,6736.371699
...,...,...,...,...,...,...,...,...,...
296,Verrucomicrobiales,0.000172,0.000274,0.000033,0.000096,265.899026,397.482106,54.179686,280.890811
297,Vibrio,0.000401,0.000428,0.000325,0.000330,622.265686,620.661123,533.774653,964.925624
298,Vicingaceae,0.000505,0.000324,0.004361,0.001741,782.480532,470.155212,7151.270318,5084.185705
299,Winogradskyella,0.002990,0.001380,0.006467,0.003672,4635.264065,2001.192608,10605.994111,10722.976925


In [94]:
# TODO: I need to redo the key file and re-run the last two cells after I have done that.

# Load the genus key: the full GTDB-Tk classification, the number of cells, and the
# size rank recorded for each GTDB genus.
columns = ['GTDB_genus', 'full_GTDBtk_classification', '#_of_cells', 'genus_size_rank']
# The path previously began with '//mnt'; a leading double slash is implementation-defined
# under POSIX, so it is normalized to the single-slash form used by every other path here.
GoM_GEN_key = pd.read_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/Summary_files/GTDB_GoM_SAG_classification_key.csv',
    usecols=columns,
)
# Force string genus names so they compare cleanly with the string 'genus' column they are merged against.
GoM_GEN_key['GTDB_genus'] = GoM_GEN_key['GTDB_genus'].astype(str)

GoM_GEN_key

Unnamed: 0,GTDB_genus,full_GTDBtk_classification,#_of_cells,genus_size_rank
0,Unclassified,Unclassified,1790,
1,Pelagibacter,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,628,1.0
2,SW10,d__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,326,2.0
3,SCGC-AAA076-P13,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,178,3.0
4,D2472,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,173,4.0
...,...,...,...,...
298,UBA9214,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,1,298.0
299,UBA974,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,1,299.0
300,UBP15,d__Bacteria;p__UBP15;c__;o__;f__;g__;s__,1,300.0
301,Vibrio,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,1,301.0


In [95]:
# Attach the classification key to every genus in the abundance table. A left join keeps
# all abundance rows, including genera that have no entry in the key.
df_abundance = pd.merge(
    df_abundance,
    GoM_GEN_key,
    how='left',
    left_on='genus',
    right_on='GTDB_genus',
)
# Write the per-genus cells-per-ml table.
df_abundance.to_csv(
    '/mnt/scgc/simon/microg2p/analyses/20210325_GoM_recluster/20210325_GoM_recluster_analysis/GoM_Metagenome_analysis/GoM_metagenome_calculated_Genus_cells_per_ml.csv',
    index=False,
)
df_abundance

Unnamed: 0,genus,20171102_genome_size_normalized_metagenome_abundance,20181030_genome_size_normalized_metagenome_abundance,20190402_genome_size_normalized_metagenome_abundance,20190709_genome_size_normalized_metagenome_abundance,20171102_genome_size_normalized_metagenome_cells/ml,20181030_genome_size_normalized_metagenome_cells/ml,20190402_genome_size_normalized_metagenome_cells/ml,20190709_genome_size_normalized_metagenome_cells/ml,GTDB_genus,full_GTDBtk_classification,#_of_cells,genus_size_rank
0,AAA164-E04,0.000543,0.001322,0.000225,0.000294,842.193966,1916.455808,369.342772,857.676906,AAA164-E04,d__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,37.0,25.0
1,AAA536-G10,0.011273,0.011666,0.008782,0.006069,17472.787073,16915.541760,14403.280557,17721.566151,AAA536-G10,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,48.0,17.0
2,AG-337-I02,0.001437,0.006798,0.000656,0.000343,2227.335509,9857.221878,1075.339238,1002.756455,AG-337-I02,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,14.0,51.0
3,AG-414-E02,0.000081,0.000303,0.000245,0.000044,125.191537,438.765267,401.157950,128.286002,AG-414-E02,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,1.0,176.0
4,AG-422-B15,0.001213,0.002285,0.008358,0.002307,1880.679178,3313.158567,13707.625915,6736.371699,AG-422-B15,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,14.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,Verrucomicrobiales,0.000172,0.000274,0.000033,0.000096,265.899026,397.482106,54.179686,280.890811,Verrucomicrobiales,d__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,5.0,107.0
164,Vibrio,0.000401,0.000428,0.000325,0.000330,622.265686,620.661123,533.774653,964.925624,Vibrio,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,1.0,301.0
165,Vicingaceae,0.000505,0.000324,0.004361,0.001741,782.480532,470.155212,7151.270318,5084.185705,Vicingaceae,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,3.0,138.0
166,Winogradskyella,0.002990,0.001380,0.006467,0.003672,4635.264065,2001.192608,10605.994111,10722.976925,Winogradskyella,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,9.0,77.0
