In [94]:
import pandas as pd
from collections import defaultdict
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO

def find_toxin_antitoxin_pairs(file_path):
    """
    Enumerate every toxin/antitoxin combination that shares an operon.

    The TSV at ``file_path`` is loaded and reduced to rows whose ``TAS_info``
    is 'T' or 'A'. Those rows are grouped by ``operon_id`` and, inside each
    group, every 'T' row is combined with every 'A' row.

    Args:
        file_path (str): Path to the TSV file

    Returns:
        list: One dictionary per pair. Each holds the operon id, and for both
            partners the gene id, description, protein length, start, end and
            protein sequence, plus 'distance' (absolute difference between the
            two 'start' values).
    """
    table = pd.read_csv(file_path, sep='\t')

    # Only rows annotated as toxin ('T') or antitoxin ('A') take part
    annotated = table.loc[table['TAS_info'].isin(['T', 'A'])].copy()

    found = []
    for op_id, members in annotated.groupby('operon_id'):
        # Materialise the rows once so the nested comprehension can reuse them
        tox_rows = [row for _, row in members[members['TAS_info'] == 'T'].iterrows()]
        anti_rows = [row for _, row in members[members['TAS_info'] == 'A'].iterrows()]

        # Cartesian product: toxins vary slowest, antitoxins fastest
        found.extend(
            {
                'operon_id': op_id,
                'toxin_gene_id': tox['ensembl_gene_id'],
                'toxin_description': tox['gene_description'],
                'toxin_length': tox['protein_length'],
                'toxin_start': tox['start'],
                'toxin_end': tox['end'],
                'antitoxin_gene_id': anti['ensembl_gene_id'],
                'antitoxin_description': anti['gene_description'],
                'antitoxin_length': anti['protein_length'],
                'antitoxin_start': anti['start'],
                'antitoxin_end': anti['end'],
                'distance': abs(tox['start'] - anti['start']),
                'toxin_sequence': tox['protein_sequence'],
                'antitoxin_sequence': anti['protein_sequence']
            }
            for tox in tox_rows
            for anti in anti_rows
        )

    return found

def prepare_dataset_pairs(file_path, random_seed=42):
    """
    Create a dataframe with positive (same operon) and negative (different operon) TA pairs.

    Positive pairs (label 1) are every toxin/antitoxin combination found inside
    one operon. For the negative pairs (label 0) each toxin is matched with one
    antitoxin drawn at random from the antitoxins of all other operons; a toxin
    gets no negative pair when no such antitoxin exists.

    Args:
        file_path (str): Path to the TSV file. The columns 'TAS_info',
            'operon_id' and 'ensembl_gene_id' are read.
        random_seed (int): Seed of the private random generator used for the
            negative sampling. The same seed and the same input give the same
            pairs. The global state of the `random` module is left untouched.

    Returns:
        pd.DataFrame: DataFrame with columns ['protein1', 'protein2', 'label'],
            positives first, then negatives. The columns are present even when
            no pair was found.
    """
    import random

    # DataFrame.sample() does not read the state of the stdlib `random` module,
    # so seeding that module never made the draw reproducible. Every draw now
    # comes from this one generator, which advances from toxin to toxin.
    rng = random.Random(random_seed)

    # Read the TSV file
    df = pd.read_csv(file_path, sep='\t')

    # Filter for rows that have TAS_info as 'T' or 'A'
    tas_df = df[df['TAS_info'].isin(['T', 'A'])]

    # Separate toxins and antitoxins
    toxins = tas_df[tas_df['TAS_info'] == 'T']
    antitoxins = tas_df[tas_df['TAS_info'] == 'A']

    labeled_pairs = []

    # 1. Create positive pairs (label = 1) - same operon
    for _, group in tas_df.groupby('operon_id'):
        operon_toxin_ids = group.loc[group['TAS_info'] == 'T', 'ensembl_gene_id']
        operon_antitoxin_ids = group.loc[group['TAS_info'] == 'A', 'ensembl_gene_id']

        # Only the gene ids are needed, so iterate the id columns directly
        for toxin_id in operon_toxin_ids:
            for antitoxin_id in operon_antitoxin_ids:
                labeled_pairs.append({
                    'protein1': toxin_id,
                    'protein2': antitoxin_id,
                    'label': 1
                })

    # 2. Create negative pairs (label = 0) - different operons
    # For each toxin, randomly assign an antitoxin from a different operon
    for toxin_id, toxin_operon in zip(toxins['ensembl_gene_id'], toxins['operon_id']):
        candidate_ids = antitoxins.loc[
            antitoxins['operon_id'] != toxin_operon, 'ensembl_gene_id'
        ]

        # No antitoxin outside this toxin's operon: no negative pair for it
        if candidate_ids.empty:
            continue

        chosen_id = candidate_ids.iloc[rng.randrange(len(candidate_ids))]
        labeled_pairs.append({
            'protein1': toxin_id,
            'protein2': chosen_id,
            'label': 0
        })

    # Fix the columns so an empty result still has the documented schema
    pairs_df = pd.DataFrame(labeled_pairs, columns=['protein1', 'protein2', 'label'])

    return pairs_df

def save_sequences_to_fasta(file_path, output_fasta_path, pairs_df):
    """
    Write the protein sequence of every gene used in ``pairs_df`` to a FASTA file.

    Gene ids are collected from the 'protein1' and 'protein2' columns, looked
    up in the 'ensembl_gene_id' column of the original table, and written with
    the value of the 'protein_sequence' column. When a gene id occurs on
    several rows of the table, only its first row is used.

    Args:
        file_path (str): Path to the TSV file with original data.
        output_fasta_path (str): Path to save the FASTA file.
        pairs_df (pd.DataFrame): DataFrame with protein1 and protein2 columns.
    """
    source_table = pd.read_csv(file_path, sep='\t')

    # Every id that appears on either side of a pair, without repeats
    wanted_ids = pd.unique(pairs_df[['protein1', 'protein2']].values.ravel())

    is_wanted = source_table['ensembl_gene_id'].isin(wanted_ids)
    per_gene = source_table[is_wanted].drop_duplicates(subset='ensembl_gene_id')

    # One record per gene; the FASTA header carries the gene id only
    fasta_records = [
        SeqRecord(Seq(protein), id=gene, description="")
        for gene, protein in zip(per_gene['ensembl_gene_id'], per_gene['protein_sequence'])
    ]

    SeqIO.write(fasta_records, output_fasta_path, "fasta")

def print_pairs_summary(ta_pairs):
    """
    Print a human-readable report of TA pairs to standard output.

    The report starts with the number of pairs and the number of distinct
    operons, followed by one numbered entry per pair with the operon, both
    gene ids with description and length, and the stored distance.

    Args:
        ta_pairs (list): Pair dictionaries with the keys 'operon_id',
            'toxin_gene_id', 'toxin_description', 'toxin_length',
            'antitoxin_gene_id', 'antitoxin_description', 'antitoxin_length'
            and 'distance'.
    """
    operon_ids = {entry['operon_id'] for entry in ta_pairs}

    print(f"Found {len(ta_pairs)} toxin-antitoxin pairs")
    print(f"Across {len(operon_ids)} operons")
    print("\nPair details:")
    print("-" * 100)

    for number, entry in enumerate(ta_pairs, start=1):
        print(f"Pair {number}:")
        print(f"  Operon: {entry['operon_id']}")
        print(f"  Toxin: {entry['toxin_gene_id']} ({entry['toxin_description']}) - {entry['toxin_length']} aa")
        print(f"  Antitoxin: {entry['antitoxin_gene_id']} ({entry['antitoxin_description']}) - {entry['antitoxin_length']} aa")
        print(f"  Distance: {entry['distance']} bp")
        # Blank line between entries
        print()

def save_data_to_csv(df, output_file):
    """
    Write ``df`` to ``output_file`` as tab-separated text and print the path.

    Despite the function name, the delimiter is a tab, not a comma. The
    DataFrame index is not written.

    Args:
        df (pd.DataFrame): Table to write.
        output_file (str): Destination path.
    """
    export_options = {"sep": "\t", "index": False}
    df.to_csv(output_file, **export_options)
    print(f"TA pairs saved to {output_file}")


In [81]:
# Directory that holds the input summary table and receives the generated files.
# The value ends with "/".
data_dir = "/scicore/home/schwede/pudziu0000/projects/gLM/data/TASmania/mycobacterium_tuberculosis/"
# Summary table (TSV) inside data_dir
summary_table_path = data_dir + "mycobacterium_tuberculosis_h37rv_summary_table.tsv"

In [4]:
# Every same-operon toxin/antitoxin combination found in the summary table
ta_pairs = find_toxin_antitoxin_pairs(file_path=summary_table_path)

In [95]:
# Build the labelled pair table, then write the sequences and the pairs into data_dir.
# data_dir may end with "/", so strip it before joining; otherwise the output
# paths contain a doubled "//".
output_dir = data_dir.rstrip("/")
dataset = prepare_dataset_pairs(summary_table_path, random_seed=42)
save_sequences_to_fasta(summary_table_path, f"{output_dir}/sequences.fasta", dataset)
save_data_to_csv(dataset, f"{output_dir}/test.txt")

TA pairs saved to /scicore/home/schwede/pudziu0000/projects/gLM/data/TASmania/mycobacterium_tuberculosis//test.txt
