In [1]:
import pandas as pd
import pickle
import re

# Function to read HMMER .tblout file into a pandas DataFrame
def read_tblout_to_dataframe(tblout_file, max_e_value=0.001):
    """Parse a HMMER ``.tblout`` file and keep the best hit per genome.

    Each non-comment, non-blank line is split on whitespace into the 19
    per-target columns; the last column (``description``) keeps any
    embedded spaces. A ``genome`` column is derived from ``target_name`` as
    the text before the first ``_CDS``. Only ``e_value`` is converted to a
    number; every other column is left as the string read from the file.

    Parameters
    ----------
    tblout_file : str or path-like
        Path to the HMMER tabular output file.
    max_e_value : float, default 0.001
        Hits with a full-sequence ``e_value`` above this cutoff are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per genome (the hit with the lowest ``e_value``), restricted
        to ``e_value <= max_e_value``. Genomes that had several hits come
        first, ordered by genome name, followed by single-hit genomes in
        file order. The index is not reset after the final filter.
    """
    # Column names of the HMMER per-target table, in file order
    column_names = [
        'target_name', 'accession', 'query_name', 'accession_query',
        'e_value', 'score', 'bias', 'e_value_best_dom',
        'score_best_dom', 'bias_best_dom', 'exp', 'reg',
        'clu', 'ov', 'env', 'dom', 'rep', 'inc', 'description'
    ]

    # The description is the last column and may contain spaces, so split on
    # whitespace at most (number of columns - 1) times.
    max_splits = len(column_names) - 1

    records = []
    with open(tblout_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Skip comment lines, and blank lines that would otherwise turn
            # into all-missing rows
            if not stripped or line.startswith("#"):
                continue
            records.append(stripped.split(maxsplit=max_splits))

    df = pd.DataFrame(records, columns=column_names)

    df['genome'] = df['target_name'].str.split('_CDS').str[0]

    # Convert the 'e_value' column to float for numerical filtering
    df['e_value'] = pd.to_numeric(df['e_value'], errors='coerce')

    # Rows whose e-value could not be parsed can never pass the cutoff, and an
    # all-NaN genome group would make idxmin() return NaN and break .loc below.
    df = df[df['e_value'].notna()]

    # Genomes represented by more than one hit
    has_multiple_hits = df['genome'].duplicated(keep=False)
    duplicated_genomes = df[has_multiple_hits]

    # For those genomes keep only the row with the lowest e-value
    best_of_duplicates = duplicated_genomes.loc[
        duplicated_genomes.groupby('genome')['e_value'].idxmin()
    ]

    # Genomes with a single hit are kept as they are
    unique_genomes = df[~has_multiple_hits]

    final_df = pd.concat([best_of_duplicates, unique_genomes], ignore_index=True)

    return final_df.loc[final_df['e_value'] <= max_e_value]

# Path to the HMMER .tblout file to parse
tblout_file = 'PF03237_millard_imgvr.tblout'
# Best hit per genome, restricted to the e-value cutoff applied inside read_tblout_to_dataframe
df = read_tblout_to_dataframe(tblout_file)

In [2]:
# Number of distinct genomes with a retained hit
len(df['genome'].unique())

670

In [3]:
# Write the genome identifier of every retained hit, one per line, without header or index
df.genome.to_csv('headers.txt', index=False, header=False)

# Independent working copy of the hit table. Filtering df on
# df.genome.isin(df.genome.unique()) is always true for every row, so the
# copy is made explicitly instead.
tblout_final = df.copy()

In [4]:
# Step 1: Identify genomes that have more than one hit.
# Duplicates must be detected on 'genome', not on 'e_value': unrelated
# proteins from different genomes can share the same e-value.
duplicated_genomes = tblout_final[tblout_final['genome'].duplicated(keep=False)]

# Step 2: Keep the single lowest-'e_value' hit for every genome.
# This runs over the whole table so genomes with only one hit are kept too;
# restricting it to duplicated_genomes would drop them from the pull list.
lowest_evalue_duplicates = tblout_final.loc[tblout_final.groupby('genome')['e_value'].idxmin()]

In [5]:
# Save the protein (target) names to fetch, one per line, with no header and no index column
lowest_evalue_duplicates['target_name'].to_csv(
    'proteins_to_pull.txt',
    header=False,
    index=False,
)

In [6]:
# Fetch the sequences named in proteins_to_pull.txt from the phanotate protein FASTA
# with Easel's esl-sfetch (-f: second argument is a file of sequence names) and
# redirect them into Terminase_6N_imgvr.faa.
# NOTE(review): esl-sfetch normally needs an SSI index of the FASTA built beforehand
# with `esl-sfetch --index` — confirm the index exists next to phanotate.faa.
!esl-sfetch -f "/n/eddy_lab/users/lmerk/phage_groupII/IMGVR/all_imgvr_hit_genome_annotations/phanotate.faa" "proteins_to_pull.txt" > "Terminase_6N_imgvr.faa"


## Run clustering on these proteins, then choose a representative from each cluster

Clustering was done with esl-msacluster, using a binary search over the percent-identity (PID) threshold to find the value that yields 30 clusters. For this dataset that threshold was 17.5%.

In [7]:
# Load the saved cluster assignments; the `with` block closes the file handle
# as soon as the object has been read.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created or trust.
with open('terL_30_clusters.pkl', 'rb') as pkl_file:
    cluster_dict = pickle.load(pkl_file)

In [8]:
# Representative of each cluster = the first member stored for that cluster
one_per_cluster = dict(
    (cluster_id, members[0]) for cluster_id, members in cluster_dict.items()
)
# Flat list of the chosen representatives, in cluster order
imgvr_reps = [representative for representative in one_per_cluster.values()]

In [9]:
# Rows of the hit table whose genome was picked as a cluster representative.
# NOTE(review): assumes the cluster members stored in the pickle are genome IDs
# (comparable to df.genome) rather than full protein names — TODO confirm.
is_cluster_rep = df['genome'].isin(imgvr_reps)
imgvr_rep_df = df[is_cluster_rep]

In [10]:
# Save the representative protein (target) names, one per line, with no header and no index column
imgvr_rep_df['target_name'].to_csv(
    'proteins_to_pull_cluster_rep.txt',
    header=False,
    index=False,
)

In [11]:
# Fetch the cluster-representative sequences named in proteins_to_pull_cluster_rep.txt
# from the phanotate protein FASTA (-f: second argument is a file of sequence names)
# and redirect them into Terminase_6N_imgvr_cluster_rep.faa.
# NOTE(review): esl-sfetch normally needs an SSI index of the FASTA built beforehand
# with `esl-sfetch --index` — confirm the index exists next to phanotate.faa.
!esl-sfetch -f "/n/eddy_lab/users/lmerk/phage_groupII/IMGVR/all_imgvr_hit_genome_annotations/phanotate.faa" "proteins_to_pull_cluster_rep.txt" > "Terminase_6N_imgvr_cluster_rep.faa"


In [12]:
# Input and output file paths
# input_faa: FASTA of cluster representatives written by the esl-sfetch step
input_faa = "Terminase_6N_imgvr_cluster_rep.faa"
# output_faa: same sequences, with each header reduced to the text before "_CDS"
output_faa = "Terminase_6N_imgvr_cluster_rep_single_name.faa"

# Function to rename headers
def rename_headers(fasta_file, output_file):
    """Copy a FASTA file, shortening every header to its genome name.

    For each header line the sequence ID (the first whitespace-delimited
    token after ``>``) is cut at the first ``_CDS``; any description text is
    dropped. An ID that does not contain ``_CDS`` is written unchanged
    rather than raising. Sequence lines are copied verbatim.

    Parameters
    ----------
    fasta_file : str or path-like
        FASTA file to read.
    output_file : str or path-like
        FASTA file to write; overwritten if it already exists.
    """
    with open(fasta_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            if line.startswith(">"):
                # Take the sequence ID only, so "_CDS" appearing in the
                # description can never be mistaken for the marker
                id_tokens = line[1:].split(None, 1)
                seq_id = id_tokens[0] if id_tokens else ""
                # Extract only the part before the first "_CDS"
                new_header = seq_id.split("_CDS", 1)[0]
                outfile.write(f">{new_header}\n")
            else:
                outfile.write(line)

# Run the renaming function: reads input_faa and writes the shortened-header copy to output_faa
rename_headers(input_faa, output_faa)

# Report where the renamed FASTA was written
print(f"Headers renamed and saved to {output_faa}")

Headers renamed and saved to Terminase_6N_imgvr_cluster_rep_single_name.faa
