# GWAS Preparation

In [1]:
# Show the current working directory; the relative DATA_DIR path built from
# path.pardir further down is resolved against it.
# NOTE(review): no `pwd` is defined or imported in this notebook, so this
# presumably relies on IPython resolving it as a magic — confirm.
pwd()

'/home/hnasrulloh/Works/pangenomes/notebooks'

In [2]:
from os import path
import random

import pandas as pd

# Seed the global `random` generator so the random.sample() call in the
# KITSUNE preparation section picks the same subset on every fresh run.
random.seed(1)

In [3]:
# Root data folder, one level above the directory the notebook runs from.
DATA_DIR = path.join(path.pardir, 'data')

# Sub-folders of DATA_DIR referenced by the rest of the notebook.
SAMPLES_DIR, GWAS_DIR = (
    path.join(DATA_DIR, subfolder) for subfolder in ('samples', 'gwas')
)

## Generating _pheno_ TSV files

In [4]:
# Load the AMR spreadsheet, asking pandas for string values in every column,
# and keep only the three columns needed for the phenotype files.
AMR_EXCEL = path.join(DATA_DIR, 'amr_ready.xlsx')
df = pd.read_excel(AMR_EXCEL, dtype=str)

df = df.loc[:, ['genome_id', 'antibiotic', 'phenotype']]

# Encode the phenotype label as 1 ('R') or 0 ('S').
# Series.map() yields NaN for any label that is not a key of `pheno`; a single
# NaN would silently turn the whole column into floats and the TSV would then
# contain '1.0' / '0.0'. The nullable 'Int64' dtype keeps the values as 1 / 0
# and writes unmapped rows as empty cells instead.
pheno = { 'R': 1, 'S': 0 }
df['phenotype'] = df['phenotype'].map(pheno).astype('Int64')

# Match the sample names used in gene_presence_absence.Rtab
samples = df.genome_id.map(lambda name: name + '.PATRIC')
df.insert(0, 'samples', samples)
df = df.drop(['genome_id'], axis=1)

# Write one two-column TSV per antibiotic: `samples` plus a phenotype column
# named after the antibiotic.
# NOTE(review): GWAS_DIR is defined in the configuration cell, yet these files
# are written to DATA_DIR — confirm this is the intended location.
for antibiotic in ['ethambutol', 'rifampin', 'isoniazid']:
    df_ab = (
        df.loc[df['antibiotic'] == antibiotic, ['samples', 'phenotype']]
        .rename(columns={'phenotype': antibiotic})
    )

    ANTIBIOTIC_TSV = path.join(DATA_DIR, f'pheno_{antibiotic}.tsv')
    df_ab.to_csv(ANTIBIOTIC_TSV, index=False, sep='\t')

## KITSUNE preparation

In [5]:
# Read the genome ids, one per line. Blank lines (for example a trailing empty
# line) are skipped; otherwise they would end up as a bogus '.fna' entry.
GENOME_IDS = path.join(SAMPLES_DIR, '_id_ready.txt')
with open(GENOME_IDS, 'r') as f:
    ids = [line.strip() for line in f if line.strip()]

# Turn each id into its FASTA file name.
ids = [f'{i}.fna' for i in ids]

# Full list: every genome file name, one per line.
KITSUNE_LIST = path.join(SAMPLES_DIR, '_kitsune.txt')
with open(KITSUNE_LIST, 'w') as f:
    f.writelines(f'{i}\n' for i in ids)

# "Naive" list: a random subset holding NAIVE_PERCENT % of the genomes.
# A dedicated, locally seeded generator makes this cell idempotent: re-running
# it yields the same subset even if the global seed cell is not re-run.
# With the same seed value as the notebook's random.seed(1) call, a fresh
# Run All draws the same subset as the global generator would, assuming
# nothing else consumed random numbers beforehand.
NAIVE_PERCENT = 1
NAIVE_SEED = 1
n = round(len(ids) * NAIVE_PERCENT / 100)
naive_ids = random.Random(NAIVE_SEED).sample(ids, n)

KITSUNE_NAIVE = path.join(SAMPLES_DIR, '_kitsune_naive.txt')
with open(KITSUNE_NAIVE, 'w') as f:
    f.writelines(f'{i}\n' for i in naive_ids)

In [6]:
# Sanity check: total number of genome files, size of the naive subset,
# and a preview of the first ten file names.
print(f'Size {len(ids)}', f'Size naive {n}', sep='\n')
ids[:10]

Size 669
Size naive 7


['83332.12.fna',
 '1438833.3.fna',
 '1438835.3.fna',
 '1438837.3.fna',
 '1438838.3.fna',
 '1438842.3.fna',
 '1438844.3.fna',
 '1438845.3.fna',
 '1438846.3.fna',
 '1438848.3.fna']

## fsm-lite (_k_-mer counting) preparation

In [7]:
# Per-antibiotic input lists; the second whitespace-separated field of each
# line is taken as the genome file name below.
PANAROO_ETHAMBUTOL = path.join(SAMPLES_DIR, '_panaroo_ethambutol_ready.txt')
PANAROO_RIFAMPIN = path.join(SAMPLES_DIR, '_panaroo_rifampin_ready.txt')
PANAROO_ISONIAZID = path.join(SAMPLES_DIR, '_panaroo_isoniazid_ready.txt')

panaroo = {
    'ethambutol': PANAROO_ETHAMBUTOL,
    'rifampin': PANAROO_RIFAMPIN,
    'isoniazid': PANAROO_ISONIAZID,
}

# For every antibiotic, write the genome file names (one per line) to
# _fsm_<antibiotic>.txt in SAMPLES_DIR.
for antibiotic, panaroo_list in panaroo.items():
    with open(panaroo_list, 'r') as f:
        inputs = f.readlines()

    # Skip blank lines (e.g. a trailing empty line), which previously raised
    # IndexError. A non-blank line with fewer than two fields still fails
    # loudly, since that indicates a malformed input file.
    genomes = [fields[1] for fields in map(str.split, inputs) if fields]

    # Optional: uncomment to drop the reference genome from the list.
    # (The original note called it "H3Rv" — presumably H37Rv; confirm.)
    # genomes.remove('83332.12.fna')

    FSM_LIST = path.join(SAMPLES_DIR, f'_fsm_{antibiotic}.txt')
    with open(FSM_LIST, 'w') as f:
        f.writelines(f'{genome}\n' for genome in genomes)