# Panaroo Preparation

In [1]:
# Show the kernel's working directory; the relative data paths built later
# (via path.pardir) are resolved against it.
pwd()

'/home/hnasrulloh/Works/pangenomes/notebooks'

In [2]:
from os import path
from collections import defaultdict

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats

In [3]:
# Data root sits one level above the directory the notebook runs from.
DATA_DIR = path.join(path.pardir, 'data')

# Sub-directories of the data root used throughout this notebook.
SAMPLES_DIR, QC_DIR = (path.join(DATA_DIR, sub) for sub in ('samples', 'qc'))

In [4]:
# Tab-separated table with one row per genome and its gene count.
NGENES_TXT = path.join(QC_DIR, 'ngenes.txt')

# Genome IDs are read as text: a numeric parse would collapse IDs such as
# '1773.200' into 1773.2. The columns are then renamed to the names used
# by the rest of the notebook.
genes = pd.read_csv(
    NGENES_TXT,
    sep='\t',
    dtype={'sample': str, 'no_genes': np.float64},
).set_axis(['genome_id', 'total_genes'], axis=1)
genes

Unnamed: 0,genome_id,total_genes
0,1267355.3,4365.0
1,1267356.3,4345.0
2,1267358.3,4358.0
3,1267359.3,4331.0
4,1267361.3,4359.0
...,...,...
506,1773.203,4312.0
507,1773.204,4269.0
508,652616.4,4372.0
509,83331.31,4346.0


In [5]:
# The allowed gene-count range is anchored on two specific genomes: their own
# counts become the (inclusive) upper and lower bounds.
upper_outlier = genes.loc[genes['genome_id'] == '1402602.3', 'total_genes'].to_list()[0]
lower_outlier = genes.loc[genes['genome_id'] == '1773.201', 'total_genes'].to_list()[0]

# `between` is inclusive on both ends, so the two anchor genomes are kept.
genes = genes[genes['total_genes'].between(lower_outlier, upper_outlier)]
genes

Unnamed: 0,genome_id,total_genes
0,1267355.3,4365.0
1,1267356.3,4345.0
2,1267358.3,4358.0
3,1267359.3,4331.0
4,1267361.3,4359.0
...,...,...
505,1773.202,4307.0
506,1773.203,4312.0
508,652616.4,4372.0
509,83331.31,4346.0


In [6]:
# Genome IDs to drop explicitly (presumably flagged by a separate MDS
# analysis -- TODO confirm the source of this list).
mds_outliers = ['1324246.3']

is_mds_outlier = genes['genome_id'].isin(mds_outliers)
genes = genes.loc[~is_mds_outlier]
genes

Unnamed: 0,genome_id,total_genes
0,1267355.3,4365.0
1,1267356.3,4345.0
2,1267358.3,4358.0
3,1267359.3,4331.0
4,1267361.3,4359.0
...,...,...
505,1773.202,4307.0
506,1773.203,4312.0
508,652616.4,4372.0
509,83331.31,4346.0


In [7]:
AMR_BEFORE_MASH = path.join(DATA_DIR, 'amr_before_mash.xlsx')
AMR_READY = path.join(DATA_DIR, 'amr_ready.xlsx')

# Load every AMR record as text, then keep only the rows whose genome is
# still present in the filtered `genes` table.
df_amr = (
    pd.read_excel(AMR_BEFORE_MASH, dtype=str)
    .loc[lambda amr: amr['genome_id'].isin(genes['genome_id'])]
)

# Persist the filtered records without the positional index.
df_amr.to_excel(AMR_READY, index=False)

df_amr

Unnamed: 0,genome_id,genome_name,antibiotic,phenotype,measurement,measurement_unit,laboratory_typing_method
0,1438833.3,Mycobacterium tuberculosis 1010SM,rifampin,R,40,mg/L,Agar proportion
1,1438833.3,Mycobacterium tuberculosis 1010SM,streptomycin,R,4,mg/L,Agar proportion
2,1438835.3,Mycobacterium tuberculosis 1173CS,ethambutol,S,2,mg/L,Agar proportion
3,1438835.3,Mycobacterium tuberculosis 1173CS,rifampin,R,40,mg/L,Agar proportion
4,1438837.3,Mycobacterium tuberculosis 1429BH,ethambutol,S,2,mg/L,Agar proportion
...,...,...,...,...,...,...,...
1226,1773.201,Mycobacterium tuberculosis strain MAL010119,isoniazid,S,0.1,mg/L,Broth dilution
1227,1773.201,Mycobacterium tuberculosis strain MAL010119,rifampin,S,0.1,mg/L,Broth dilution
1228,1773.201,Mycobacterium tuberculosis strain MAL010119,streptomycin,S,2,mg/L,Agar dilution
1229,1773.202,Mycobacterium tuberculosis strain MAL020139,isoniazid,R,1,mg/L,Agar dilution


In [8]:
GENOMES_BEFORE_MASH = path.join(DATA_DIR, 'genomes_before_mash.xlsx')
GENOMES_READY = path.join(DATA_DIR, 'genomes_ready.xlsx')

# Load the genome metadata as text, then keep only the genomes that are
# still present in the filtered `genes` table.
df_genomes = (
    pd.read_excel(GENOMES_BEFORE_MASH, dtype=str)
    .loc[lambda meta: meta['genome_id'].isin(genes['genome_id'])]
)

# Persist the filtered metadata without the positional index.
df_genomes.to_excel(GENOMES_READY, index=False)

df_genomes

Unnamed: 0,genome_id,genome_name,strain,taxon_id,genome_length,gc_content,fine_consistency,checkm_completeness,checkm_contamination,completion_date,...,sequences,patric_cds,refseq_cds,isolation_site,isolation_source,isolation_comments,collection_date,geographic_location,host_name,comments
0,1438833.3,Mycobacterium tuberculosis 1010SM,1010SM,1438833,4406079,65.6,99.3,100,0,2014-04-29T00:00:00Z,...,,4372,4006,,neg,,2013,Romania,"Human, Homo sapiens",TB-ARC - Romania
1,1438835.3,Mycobacterium tuberculosis 1173CS,1173CS,1438835,4383014,65.6,99.2,100,0,2014-04-29T00:00:00Z,...,,4333,3982,,Bodily fluid,,2012,Romania,"Human, Homo sapiens",TB-ARC - Romania
2,1438837.3,Mycobacterium tuberculosis 1429BH,1429BH,1438837,4417210,65.6,99.3,100,0,2014-04-29T00:00:00Z,...,,4384,4022,,Bodily fluid,,2013,Romania,"Human, Homo sapiens",TB-ARC - Romania
3,1438838.3,Mycobacterium tuberculosis 1430BH,1430BH,1438838,4418670,65.6,99.3,100,0,2014-04-29T00:00:00Z,...,,4383,4023,,Bodily fluid,,2013,Romania,"Human, Homo sapiens",TB-ARC - Romania
4,1438842.3,Mycobacterium tuberculosis 1755BN,1755BN,1438842,4394400,65.6,99.3,100,0,2014-04-29T00:00:00Z,...,,4360,3987,,Bodily fluid,,2013,Romania,"Human, Homo sapiens",TB-ARC - Romania
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,1773.199,Mycobacterium tuberculosis strain MAL010075,MAL010075,1773,4421633,65.6,99.3,100,0,2014-04-29T00:00:00Z,...,,4320,4006,,,,2007,Mali,"Human, Homo sapiens",TB-ARC - Mali
505,1773.200,Mycobacterium tuberculosis strain MAL010077,MAL010077,1773,4421308,65.6,99.3,100,0,2014-04-29T00:00:00Z,...,,4304,4006,,,,,Mali,"Human, Homo sapiens",TB-ARC - Mali
506,1773.201,Mycobacterium tuberculosis strain MAL010119,MAL010119,1773,4395080,65.6,98.9,100,0,2014-04-29T00:00:00Z,...,,4297,3994,,,,2010,Mali,"Human, Homo sapiens",TB-ARC - Mali
507,1773.202,Mycobacterium tuberculosis strain MAL020139,MAL020139,1773,4413724,65.6,99.4,100,0,2014-04-29T00:00:00Z,...,,4307,4003,,,,2007,Mali,"Human, Homo sapiens",TB-ARC - Mali


**REMEMBER** to add the reference genome!
- Genome ID: **83332.12**
- Genome Name: **Mycobacterium tuberculosis H37Rv**

In [9]:
PANAROO_READY = path.join(SAMPLES_DIR, '_panaroo_ready.txt')
IDS_READY = path.join(SAMPLES_DIR, '_id_ready.txt')

# Build the full ID list *before* touching the output files, so a failure
# here cannot leave previously written files truncated.
# The reference genome (83332.12) goes first, followed by every genome that
# survived the filtering steps.
ids = ['83332.12']
ids.extend(genes.genome_id.to_list())

# `with` guarantees both files are flushed and closed even if a write fails.
with open(PANAROO_READY, 'w') as f_panaroo, open(IDS_READY, 'w') as f_id:
    for gid in ids:
        # One line per genome: "<id>.PATRIC.gff <id>.fna"
        f_panaroo.write(f'{gid}.PATRIC.gff {gid}.fna\n')
        # Plain list of genome IDs, one per line.
        f_id.write(f'{gid}\n')

In [10]:
# Tabulate, per antibiotic, how many AMR records are resistant (R) and how
# many are susceptible (S). Any other phenotype value is counted in `dist`
# but not shown in the table.
dist = defaultdict(int)
for d in df_amr[['antibiotic', 'phenotype']].itertuples():
    dist[f'{d.antibiotic}:{d.phenotype}'] += 1

# Take the antibiotic names straight from the column instead of parsing them
# back out of the composite keys (which would break on a name containing ':').
antibiotics = {f'{a}' for a in df_amr['antibiotic']}

# Set iteration order over strings changes between kernel restarts; sorting
# makes the row order of the table reproducible.
antibiotic_rows = sorted(antibiotics)

# `.get` avoids inserting zero-count entries into the defaultdict on lookup.
counts = [(dist.get(f'{a}:R', 0), dist.get(f'{a}:S', 0)) for a in antibiotic_rows]
pd.DataFrame(counts, index=antibiotic_rows, columns=['resistant', 'susceptible'])

Unnamed: 0,resistant,susceptible
ethambutol,40,97
isoniazid,235,172
streptomycin,82,258
rifampin,82,188
