In [3]:
import numpy as np
import pandas as pd

import scripts
import os
import subprocess
import importlib

In [4]:
# Re-import the local scripts module so edits made to scripts.py since the
# kernel first imported it are picked up without restarting the kernel.
importlib.reload(scripts)

<module 'scripts' from '/net/holy-nfsisilon/ifs/rc_labs/eddy_lab/users/lmerk/phage_groupII/scripts.py'>

In [5]:
# Central path configuration. All locations are relative to the notebook's
# working directory. Directory variables that other cells splice directly into
# f-strings (e.g. script_output) deliberately keep their trailing slash.
data_dir = 'data/'

# Infernal subdir: tblout hit tables for group I (g1) and group II (g2) introns
infernal_dir = os.path.join(data_dir, 'infernal/')
g1_intron_hits = os.path.join(infernal_dir, 'g1_intron_millard.tblout')
g2_intron_hits = os.path.join(infernal_dir, 'g2_intron_millard.tblout')

# Genomes subdir
genomes_dir = os.path.join(data_dir, 'genomes/')
metadata_dir = os.path.join(genomes_dir, 'inphared_metadata/')
metadata_path = os.path.join(metadata_dir, '14Dec2023_data.tsv')
# 'Accession' is renamed to 'target_name' so the metadata can be merged onto the
# hit tables, which are joined on that key.
metadata = pd.read_csv(metadata_path, sep='\t').rename(columns={'Accession': 'target_name'})
genbank_dir = os.path.join(genomes_dir, 'groupII_millard/')
# Headerless single-column file; keep the distinct values of its first column.
# Built with os.path.join (like every other path here) so it does not depend on
# genomes_dir ending in a slash.
actual_genomes = pd.read_csv(os.path.join(genomes_dir, 'actual_genomes.txt'), header=None)[0].unique()

# Window and defense
window_dir = os.path.join(genomes_dir, 'gII_intron_5kb_windows/')
bakta_dir = os.path.join(data_dir, 'bakta_output')
pharokka_dir = os.path.join(data_dir, "pharokka_output")
defense_dir = os.path.join(data_dir, 'defensefinder/bakta_defense')

# Script output subdir
script_output = os.path.join(data_dir, 'script_output/')
genbank_out_directory = os.path.join(script_output, "updated_genomes")

In [6]:
# Master switch for the cells below: when True they recompute the intermediate
# tables (and write them under script_output); when False they load the CSVs
# saved by an earlier run and skip the scripts.* processing calls.
preprocess = True

______

In [7]:
def _with_intron_id(hits, group_tag):
    """Return `hits` sorted by position with an `intronID` column placed first.

    Parameters
    ----------
    hits : pandas.DataFrame
        Hit table containing at least `target_name` and `seq_from` columns.
    group_tag : str
        Label inserted into the ID ('I' or 'II' in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by (`target_name`, `seq_from`) whose first column,
        `intronID`, is `<target_name>_<group_tag>_<seq_from>`. The input frame
        is not modified.
    """
    # sort_values returns a new frame, so the column assignment below does not
    # touch the caller's object.
    hits = hits.sort_values(by=['target_name', 'seq_from'])
    hits['intronID'] = hits['target_name'] + f'_{group_tag}_' + hits['seq_from'].astype(str)
    column_order = ['intronID'] + [col for col in hits.columns if col != 'intronID']
    return hits[column_order]


if preprocess:
    # Make sure the output directory exists before any work is done, so a
    # missing directory cannot fail the CSV writes after dereplication has run.
    os.makedirs(script_output, exist_ok=True)

    g1_hits = scripts.extract_hit_df(g1_intron_hits)
    g2_hits = scripts.extract_hit_df(g2_intron_hits)

    # Left merge: every hit is kept even when its genome has no metadata row
    g1_df = pd.merge(g1_hits, metadata, on='target_name', how='left')
    g2_df = pd.merge(g2_hits, metadata, on='target_name', how='left')

    print('Dereplicating group I...')
    g1_hits_pass, g1_hits_fail = scripts.dereplicate_hits(g1_df)

    print('Saving group I...')
    # Save all the hits
    # NOTE(review): g1_df is written sorted but stays in its original order in
    # memory (unlike g2_df below, which is reassigned) - confirm this is intended.
    g1_df.sort_values('target_name').to_csv(f'{script_output}g1_df.csv', index=False)
    # Save the ones that didn't pass the filter
    g1_hits_fail.to_csv(f'{script_output}g1_hits_fail.csv', index=False)
    # Clean the passes, add the intronID, then save it
    g1_hits_pass = _with_intron_id(g1_hits_pass, 'I')
    g1_hits_pass.to_csv(f'{script_output}g1_hits_pass.csv', index=False)

    print('Collapsing group II...')
    g2_hits_pass = scripts.collapse_hits(g2_df)

    print('Saving group II...')
    g2_hits_pass = _with_intron_id(g2_hits_pass, 'II')
    g2_hits_pass.to_csv(f'{script_output}g2_hits_pass.csv', index=False)

    # Save all the hits (g2_df itself is replaced by the sorted, ID-tagged frame)
    g2_df = _with_intron_id(g2_df, 'II')
    g2_df.to_csv(f'{script_output}g2_df.csv', index=False)
    print('Done!')

else:
    # Reload the tables written by a previous preprocessing run
    g1_df = pd.read_csv(f'{script_output}g1_df.csv')
    g2_df = pd.read_csv(f'{script_output}g2_df.csv')

    g1_hits_pass = pd.read_csv(f'{script_output}g1_hits_pass.csv')
    g2_hits_pass = pd.read_csv(f'{script_output}g2_hits_pass.csv')

Dereplicating group I...
Saving group I...
Collapsing group II...
Saving group II...
Done!


In [8]:
# This submits jobs to Slurm to hmmscan the surrounding window.
# The genomes should be indexed (i.e. contain a .idx file in the dir);
# if they are not, you can use esl-index.sh to create the index.

# Only runs when preprocessing is enabled.
if preprocess:
    scripts.hmmscan_window(g2_df, genbank_dir, window_dir)

In [9]:
# Prerequisites before running this cell:
#   * every job submitted by the hmmscan cell above has finished
#   * pharokka.sh has been run in the pharokka env to generate annotations
#   * bakta.sh has been run in the bakta env to generate annotations
#   * defensefinder.sh has been run in the defensefinder env to get defense

if not preprocess:
    # Reuse the table written by an earlier preprocessing run
    all_hit_df_derep = pd.read_csv(f'{script_output}g2_genes_top3_5kb_with_locs.csv')
else:
    all_defense_hits = scripts.combine_defense(g2_df, bakta_dir, defense_dir)
    all_hit_df = scripts.hit_to_genome_with_domtbl(g2_df, window_dir, genbank_dir)

    # Keep only the rows whose genome is listed in actual_genomes, then save them
    all_hit_df_derep = all_hit_df[all_hit_df['genome'].isin(actual_genomes)]
    all_hit_df_derep.to_csv(f'{script_output}g2_genes_top3_5kb_with_locs.csv', index=False)

    # Both calls take genbank_out_directory; combine_genbanks runs before update_genbank
    scripts.combine_genbanks(
        all_hit_df_derep, pharokka_dir, bakta_dir, genbank_out_directory, actual_genomes
    )
    scripts.update_genbank(
        genbank_out_directory, all_hit_df_derep, g1_hits_pass, g2_df, all_defense_hits
    )