# Extract accessions from Excel table:
Read the supplementary Excel table into a DataFrame:

In [1]:
import natsort

import pandas as pd

# Load the supplementary spreadsheet; the openpyxl engine is requested
# explicitly for reading the .xlsx file.
table = pd.read_excel(
    'Supplementary_Table_1.xlsx',
    engine='openpyxl',
)

Select the rows for the BioProject of interest and count them by host, collector, and collection date:

In [2]:
# BioProject accession whose rows are extracted from the table.
prj = 'PRJNA612766'

# Keep only the rows of that BioProject, restricted to the metadata
# columns used in the rest of the notebook.
acc_df = table.loc[
    table['BioProject'] == prj,
    ['Run', 'Sample Name', 'Collection_Date', 'Host', 'collected_by'],
]

# Overview: number of rows per host / collector / collection date.
acc_df.groupby(['Host', 'collected_by', 'Collection_Date']).size()

Host          collected_by                         Collection_Date
Homo sapiens  Renmin Hospital of Wuhan University  12-Feb-2020         31
                                                   15-Jan-2020          2
                                                   30-Jan-2020         96
Plasmid       Aisi Fu                              12-Feb-2020        112
dtype: int64

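Get the accessions of interest: keep the *Homo sapiens* samples, drop samples whose names start with `PC` or `NC` as well as the `respiratory viruses` samples, then collect the run accessions for each sample: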

In [3]:
# Build one row per sample, holding the list of all run accessions for it.
acc_of_interest_df = (
    acc_df
    # keep only rows whose host is human
    .loc[lambda df: df['Host'] == 'Homo sapiens']
    # discard samples whose names begin with "PC" or "NC"
    .loc[lambda df: ~df['Sample Name'].str.startswith('PC')]
    .loc[lambda df: ~df['Sample Name'].str.startswith('NC')]
    # the sample ID is the part of the sample name before the first hyphen
    .assign(sample=lambda df: df['Sample Name'].str.split('-').str[0])
    .loc[lambda df: df['sample'] != 'respiratory viruses']
    # gather every run accession belonging to the same sample into a list
    .groupby(['sample', 'collected_by', 'Collection_Date'], as_index=False)
    .aggregate(accessions=pd.NamedAgg('Run', list))
    # natural sort so that e.g. A2 comes before A10
    .sort_values('sample', key=natsort.natsort_keygen())
    # keep the recorded date only for samples whose ID starts with "R";
    # every other sample is labelled 'early in epidemic'
    .assign(
        collection_date=lambda df: df['Collection_Date'].where(
            df['sample'].str.startswith('R'),
            'early in epidemic',
        )
    )
    .drop(columns='Collection_Date')
)

acc_of_interest_df

Unnamed: 0,sample,collected_by,accessions,collection_date
0,A1,Renmin Hospital of Wuhan University,"[SRR11313282, SRR11313334]",early in epidemic
4,A2,Renmin Hospital of Wuhan University,"[SRR11313271, SRR11313461]",early in epidemic
5,A3,Renmin Hospital of Wuhan University,"[SRR11313439, SRR11313450]",early in epidemic
6,A4,Renmin Hospital of Wuhan University,"[SRR11313417, SRR11313428]",early in epidemic
7,A5,Renmin Hospital of Wuhan University,"[SRR11313472, SRR11313501]",early in epidemic
...,...,...,...,...
56,R12,Renmin Hospital of Wuhan University,"[SRR11313492, SRR11313493]",12-Feb-2020
57,R13,Renmin Hospital of Wuhan University,"[SRR11313489, SRR11313491]",12-Feb-2020
58,R14,Renmin Hospital of Wuhan University,"[SRR11313487, SRR11313488]",12-Feb-2020
59,R15,Renmin Hospital of Wuhan University,"[SRR11313485, SRR11313486]",12-Feb-2020


Write the samples as YAML to `for_config.yml`, for use in the configuration file:

In [4]:
# Serialize every sample as a YAML mapping entry keyed by
# `<sample>_<BioProject>`, preceded by a comment banner naming the BioProject.
lines = ['# -----------------------------------------------------------------',
         f"# BioProject {prj}",
         '# -----------------------------------------------------------------',
         ]
for _, row in acc_of_interest_df.iterrows():
    lines += [f"{row['sample']}_{prj}:",
              # relies on the Python list repr (e.g. ['SRR1', 'SRR2']) being
              # readable as a YAML flow sequence
              f"  accessions: {row['accessions']}",
              f"  collected_by: {row['collected_by']}",
              f"  collection_date: {row['collection_date']}",
              ]

# Every line is indented by two spaces -- presumably so the block nests under
# a parent key once placed in the config file (TODO confirm).
# Each line, including the last, is newline-terminated so that content
# appended after this block does not run into its final line; the encoding is
# fixed so the output does not depend on the platform locale.
with open('for_config.yml', 'w', encoding='utf-8') as f:
    f.writelines(f"  {line}\n" for line in lines)