In [1]:
# Generate read counts on MAGs
# Jackson M. Tsuji, 2021
# This is a outside-atlas replacement for combine_coverages_MAGs

In [2]:
import pandas as pd

In [3]:
contig_counts = pd.read_csv('read_counts_contigs.tsv.gz', sep='\t')
contig_counts.head()

Unnamed: 0,count,contig-id,rep-id
0,538,MAG001_40,Jun2018_L221_05m_A
1,2,MAG001_73,Jun2018_L221_05m_A
2,12,MAG001_121,Jun2018_L221_05m_A
3,4,MAG001_176,Jun2018_L221_05m_A
4,11,MAG002_1,Jun2018_L221_05m_A


In [6]:
# Get MAG ID
contig_counts['MAG'] = contig_counts['contig-id'].str.split('_', expand=True)[0]

contig_counts.head()

Unnamed: 0,count,contig-id,rep-id,MAG
0,538,MAG001_40,Jun2018_L221_05m_A,MAG001
1,2,MAG001_73,Jun2018_L221_05m_A,MAG001
2,12,MAG001_121,Jun2018_L221_05m_A,MAG001
3,4,MAG001_176,Jun2018_L221_05m_A,MAG001
4,11,MAG002_1,Jun2018_L221_05m_A,MAG002


In [8]:
# Get MAG counts
mag_counts = contig_counts.groupby(['MAG','rep-id']).agg({'count':'sum'})\
  .reset_index()

mag_counts.head()

Unnamed: 0,MAG,rep-id,count
0,MAG001,Jun2018_L221_05m_A,556
1,MAG001,Jun2018_L221_05m_B,498
2,MAG001,Jun2018_L221_05m_C,440
3,MAG001,Jun2018_L304_06m_A,809
4,MAG001,Jun2018_L304_06m_B,1062


In [21]:
# Match format with ATLAS
combined_coverages_MAGs = mag_counts\
  .rename(columns={'rep-id':'Sample'})\
  .pivot(index='MAG', columns='Sample', values='count')\
  .fillna(0)\
  .reset_index()\
  .rename(columns={'MAG':'Sample'}) # This is an odd naming structure that ATLAS uses

combined_coverages_MAGs.head()

Sample,Sample.1,Jun2018_L221_05m_A,Jun2018_L221_05m_B,Jun2018_L221_05m_C,Jun2018_L304_06m_A,Jun2018_L304_06m_B,Jun2018_L304_06m_C
0,MAG001,556.0,498.0,440.0,809.0,1062.0,1084.0
1,MAG002,370.0,276.0,226.0,499.0,446.0,454.0
2,MAG003,96.0,84.0,82.0,8884.0,9842.0,9804.0
3,MAG004,551.0,502.0,484.0,474.0,708.0,638.0
4,MAG005,0.0,0.0,0.0,128.0,178.0,120.0


In [22]:
combined_coverages_MAGs.to_csv('raw_counts_genomes.tsv', sep='\t', index=False)