# Setup

In [None]:
import os
import gzip
import pickle

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

In [None]:
# Root folder with all CD-HIT outputs, and the sub-folder of the run used in this
# notebook ("sim80" -- presumably the 80% similarity threshold; confirm).
# The sub-folder is derived from the root so the two can never drift apart.
CD_HIT_RESULTS = '../../data/processed/cd-hit-results/'
CURR_CD_HIT_SIM = os.path.join(CD_HIT_RESULTS, 'sim80/')

# CD-HIT cluster file from which the representative headers are extracted.
CLSTR_FILE = os.path.join(CURR_CD_HIT_SIM, 'Ebacter_nr.faa.cdhit.clstr')

# Fail early, naming the missing path, rather than deep inside a later cell.
assert os.path.isfile(CLSTR_FILE), f'CD-HIT cluster file not found: {CLSTR_FILE}'

In [None]:
# Mapping used later to look up the allele name for each representative header.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this project's own pipeline.
header_map_path = '../../data/processed/cd-hit-results/header_to_allele_80.pickle.gz'

with gzip.open(header_map_path, 'rb') as pickled_map:
    header_to_allele = pickle.load(pickled_map)

In [None]:
# Pickled table for the sim80 run (a strain-by-gene matrix, judging by the file name).
# Its index is used further down as the list of pangenome genes.
strain_by_gene_path = '../../data/processed/cd-hit-results/sim80/Ebacter_strain_by_gene.pickle.gz'
df_genes = pd.read_pickle(strain_by_gene_path)

# Quick look: dimensions first, then the first rows.
display(df_genes.shape, df_genes.head())

In [None]:
# Intermediate text files kept in the CD-HIT results folder:
# the representative headers and the allele names they map to.
REP_HEADERS, REP_ALLELES = (
    os.path.join(CD_HIT_RESULTS, file_name)
    for file_name in ('rep_headers.txt', 'rep_alleles.txt')
)

display(REP_HEADERS, REP_ALLELES)

In [None]:
# Find all representative gene headers (as determined by CD-HIT) and write them
# to REP_HEADERS, one per line. Re-running simply overwrites the file.
#
# A line is kept when it contains "*" and a ">"; the text from the first ">" to
# the end of the line is written out. This is the same output as
#   grep "*" CLSTR_FILE | grep -oe ">.*" > REP_HEADERS
# but done in Python, so an unreadable input raises an exception instead of
# being hidden behind an ignored os.system() exit status, and no path is
# interpolated into a shell command.
n_rep_lines = 0

with open(CLSTR_FILE) as clstr, open(REP_HEADERS, 'w') as out:
    for line in clstr:
        start = line.find('>')
        if '*' in line and start != -1:
            out.write(line[start:].rstrip('\n') + '\n')
            n_rep_lines += 1

# An empty result means the cluster file is not in the expected format.
assert n_rep_lines > 0, f'no representative ("*") lines found in {CLSTR_FILE}'

n_rep_lines

In [None]:
# Each line of REP_HEADERS is expected to look like ">HEADER... *":
# a leading ">" and the trailing "... *" marker (the 5 characters plus newline
# that the former fixed slice [1:-6] removed -- confirm against the .clstr file).
# Both are stripped explicitly so that a final line without a newline, or a
# stray blank line, cannot silently produce a truncated header.
REP_LINE_SUFFIX = '... *'

rep_headers = list()

with open(REP_HEADERS) as f:
    for line in f:
        line = line.rstrip('\r\n')
        if not line:
            continue  # ignore blank lines
        assert line.startswith('>') and line.endswith(REP_LINE_SUFFIX), (
            f'unexpected line in {REP_HEADERS}: {line!r}'
        )
        rep_headers.append(line[1:-len(REP_LINE_SUFFIX)])

In [None]:
# Preview the first 10 representative headers to confirm they were parsed cleanly
# (no leading ">" and no trailing marker left over)
rep_headers[:10]

# Generate representative gene cluster FASTA collection (for eggNOG annotation)

In [None]:
# Look up the allele name of every representative header (a header missing from
# header_to_allele raises KeyError), then save the names to REP_ALLELES,
# one per line with no trailing newline.
rep_alleles = []
for header in rep_headers:
    rep_alleles.append(header_to_allele[header])

with open(REP_ALLELES, 'w') as out:
    out.write('\n'.join(rep_alleles))

In [None]:
# Total number of representative alleles (duplicates, if any, are counted)
len(rep_alleles)

In [None]:
# Number of distinct representative allele names; should equal the total count above
len(set(rep_alleles))

Note: if the two counts above do not match, several representative headers map to the same allele name. This usually points to a problem with the bakta annotation: multiple genomes carry identical gene tags for different sequences, so distinct alleles and genes end up with the same name. If the affected genomes can be identified, re-annotate them; if they cannot, make sure that every sample was annotated correctly and with the same bakta version.

__The following awk script is adapted from this__ [StackOverflow answer](https://stackoverflow.com/a/49737831/8582436)

In [None]:
# Use awk to generate the representative gene cluster FASTA collection.
# NOTE: originally marked as "already run", but the command is active and
# regenerates rep_alleles_nr.faa every time this cell is executed.
#
# How the awk program works (field separator is ">"):
#   * first file, rep_alleles.txt (NR==FNR): every line is stored as a key in `ids`;
#   * second file, Ebacter_nr.faa: on lines with more than one field (i.e. lines
#     containing ">", the FASTA headers) the flag `f` is set to whether the second
#     field (the text after the first ">") is one of the stored ids;
#   * a line is printed whenever `f` is true, so a selected header is printed
#     together with the lines that follow it, up to the next header.
# The text after ">" must therefore match a line of rep_alleles.txt exactly.
!awk -F'>' 'NR==FNR{ids[$0]; next} NF>1{f=($2 in ids)} f' ../../data/processed/cd-hit-results/rep_alleles.txt ../../data/processed/cd-hit-results/sim80/Ebacter_nr.faa > ../../data/processed/cd-hit-results/rep_alleles_nr.faa


# Execute new eggNOG annotation

__The following are run in a linux terminal session:__

`tmux new -s 'eggNOG-annot'`

`conda activate emapper` (see the eggnog-mapper [installation instructions](https://github.com/eggnogdb/eggnog-mapper))

`python emapper.py -o Ebacter --tax_scope Gammaproteobacteria --tax_scope_mode Bacteria -i ../../data/processed/cd-hit-results/rep_alleles_nr.faa --output_dir ../../data/processed/eggNOG --cpu 20`

# Postprocess resultant file

In [None]:
# Load the eggNOG-mapper annotation table.
# The first 4 lines and the last 3 lines of the file are not part of the table
# (not needed), so they are dropped via skiprows / iloc. "#query" is renamed to
# "allele". rename() returns a new frame, which avoids the SettingWithCopyWarning
# that assigning a column to the sliced frame would trigger.
df_eggnog = (
    pd.read_csv(
        '../../data/processed/eggNOG/Ebacter.emapper.annotations',
        sep='\t',
        skiprows=4
    )
    .iloc[:-3]
    .rename(columns={'#query': 'allele'})
)

# Guard the hard-coded header/footer sizes: a leftover comment line would show
# up as an "allele" starting with "#".
assert not df_eggnog['allele'].astype(str).str.startswith('#').any(), (
    'comment lines left in the eggNOG table; check skiprows / footer size'
)

# Derive the gene ID from the allele ID and use it as the index.
# Assumes allele IDs have the form "<gene>A<allele number>" (TODO confirm): the
# split is made on the LAST "A", so a gene ID that itself contains a capital "A"
# is not truncated.
df_eggnog = (
    df_eggnog
    .assign(gene=lambda d: d['allele'].str.rsplit('A', n=1).str[0])
    .set_index('gene')
)
print(f'initial shape: {df_eggnog.shape}')

df_eggnog.head()

In [None]:
# Number of pangenome genes missing from the eggNOG table (dropped by eggNOG: no hits)
len(set(df_genes.index).difference(df_eggnog.index))

In [None]:
# Every gene annotated by eggNOG must exist in the pangenome (the difference should be empty)
assert not set(df_eggnog.index).difference(df_genes.index)

In [None]:
# Re-insert the genes eggNOG left out (no orthologous group found):
# one placeholder row per missing gene, with every column set to '-'.
missing_genes = sorted(set(df_genes.index) - set(df_eggnog.index))

df_drop = (
    pd.DataFrame(index=missing_genes, columns=df_eggnog.columns)
    .fillna('-')
    .rename_axis('gene')
)

df_eggnog = pd.concat([df_eggnog, df_drop])
print(f'final shape: {df_eggnog.shape}')

In [None]:
# Inspect the completed table: eggNOG hits plus the '-' placeholder rows for genes without hits
df_eggnog

# Save eggNOG annotations

In [None]:
# Write the annotation table to CSV; the "gene" index is saved as the first column
df_eggnog.to_csv('../../data/processed/df_eggnog.csv')