In [1]:
import pandas as pd
import os

In [8]:

def _render_fasta(dataframe):
    """Render every row as a light-chain record followed by its heavy-chain record.

    Returns an empty string for an empty frame, so an empty split produces an
    empty file rather than a file holding a single blank line.
    """
    lines = []
    columns = zip(
        dataframe['fasta_id'],
        dataframe['NT_Trimmed'],
        dataframe['input_heavy_sequence'],
    )
    for fasta_id, light_sequence, heavy_sequence in columns:
        # Light chain record
        lines.append(f">protein|{fasta_id}")
        lines.append(str(light_sequence))
        # Heavy chain record, tagged with the same id
        lines.append(f">protein|heavy_{fasta_id}")
        lines.append(str(heavy_sequence))
    return ''.join(f"{line}\n" for line in lines)


def _print_fasta_preview(path, fasta_text, sequence_count, label):
    """Print the first two light/heavy pairs (8 lines) of one generated FASTA file."""
    print(f"\n=== PREVIEW OF {path} ===")
    if sequence_count > 0:
        print('\n'.join(fasta_text.splitlines()[:8]))
        if sequence_count > 2:
            print("...")
    else:
        print(f"No sequences with {label} pairing scores.")


def create_fasta_files(csv_filename, output_dir=None, threshold=0.5):
    """
    Create two FASTA files from a CSV, split by pairing score.

    Rows are filtered and de-duplicated on the NT_Trimmed column (first
    occurrence wins), then written as two records per row: the light chain
    (NT_Trimmed) under ``>protein|<fasta_id>`` and the heavy chain
    (input_heavy_sequence) under ``>protein|heavy_<fasta_id>``.

    Rows whose pairing score is missing or non-numeric are discarded *before*
    de-duplication, so an unusable first occurrence can no longer shadow a
    valid duplicate of the same sequence.

    Args:
        csv_filename (str): Path to the input CSV file. It must contain the
            columns fasta_id, NT_Trimmed, input_heavy_sequence and
            pairing_scores.
        output_dir (str, optional): Directory that receives the two FASTA
            files; created if it does not exist. Defaults to None, which
            writes into the current working directory.
        threshold (float, optional): Scores >= threshold go to the "high"
            file, scores below it to the "low" file. Defaults to 0.5.

    Returns:
        None. Progress, a summary and a short preview are printed. Errors
        (missing file, missing columns, any other failure) are reported with
        a printed message instead of being raised.
    """
    high_filename = 'high_pairing_scores_immunomatch_bert2gpt_trimmed.fasta'
    low_filename = 'low_pairing_scores_immunomatch_bert2gpt_trimmed.fasta'
    required_columns = ['fasta_id', 'NT_Trimmed', 'input_heavy_sequence', 'pairing_scores']

    try:
        # Read the CSV file
        print(f"Reading CSV file: {csv_filename}")
        df = pd.read_csv(csv_filename)

        print(f"Initial number of rows: {len(df)}")
        print(f"Columns: {list(df.columns)}")

        # Check for required columns
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Error: Missing required columns: {missing_columns}")
            return

        # Coerce scores first: unparseable values become NaN and are removed
        # together with the other missing data, ahead of de-duplication.
        initial_count = len(df)
        df = df.assign(pairing_scores=pd.to_numeric(df['pairing_scores'], errors='coerce'))
        df = df.dropna(subset=required_columns)
        print(f"Rows after removing missing data: {len(df)} (removed {initial_count - len(df)})")

        # Remove duplicates based on NT_Trimmed, keeping the first occurrence
        pre_dedup_count = len(df)
        df = df.drop_duplicates(subset=['NT_Trimmed'], keep='first')
        duplicates_removed = pre_dedup_count - len(df)
        print(f"Rows after removing duplicates: {len(df)} (removed {duplicates_removed} duplicates)")
        print(f"Final rows for processing: {len(df)}")

        # Split data based on pairing scores
        high_scores = df[df['pairing_scores'] >= threshold]
        low_scores = df[df['pairing_scores'] < threshold]

        print(f"High pairing scores (>={threshold}): {len(high_scores)} sequences")
        print(f"Low pairing scores (<{threshold}): {len(low_scores)} sequences")

        high_scores_fasta = _render_fasta(high_scores)
        low_scores_fasta = _render_fasta(low_scores)

        # Resolve output locations; with no output_dir the bare filenames are
        # used, i.e. the files land in the current working directory.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            high_path = os.path.join(output_dir, high_filename)
            low_path = os.path.join(output_dir, low_filename)
        else:
            high_path = high_filename
            low_path = low_filename

        for path, fasta_text in ((high_path, high_scores_fasta), (low_path, low_scores_fasta)):
            with open(path, 'w', encoding='utf-8') as f:
                f.write(fasta_text)
            print(f"Created: {path}")

        # Display some statistics
        print("\n=== SUMMARY ===")
        print(f"Total unique sequences processed: {len(df)}")
        print(f"High pairing scores (>={threshold}): {len(high_scores)} sequences ({len(high_scores) * 2} FASTA entries)")
        print(f"Low pairing scores (<{threshold}): {len(low_scores)} sequences ({len(low_scores) * 2} FASTA entries)")
        print(f"Duplicates removed: {duplicates_removed}")

        # Show the first few entries from each file as a preview
        _print_fasta_preview(high_path, high_scores_fasta, len(high_scores), "high")
        _print_fasta_preview(low_path, low_scores_fasta, len(low_scores), "low")

    except FileNotFoundError:
        print(f"Error: File '{csv_filename}' not found.")
    except Exception as e:
        # Best-effort notebook helper: report the problem instead of raising.
        print(f"Error processing file: {str(e)}")


In [7]:
# Input CSV for the FASTA export (directory and file name split for readability;
# the two literals are concatenated into a single path string).
csv_file = (
    "/ibmm_data2/oas_database/paired_lea_tmp/paired_model/immuno_match/immunomatch_results/"
    "pairing_result_bert2gpt_full_complete_ids_mapping_unique_nt_trimmed_gene_hit_locus.csv"
)

In [9]:
# Run the export on the CSV path held in `csv_file`; progress, a summary and a
# preview are printed, and the two .fasta files are written to the current
# working directory.
create_fasta_files(csv_file)

Reading CSV file: /ibmm_data2/oas_database/paired_lea_tmp/paired_model/immuno_match/immunomatch_results/pairing_result_bert2gpt_full_complete_ids_mapping_unique_nt_trimmed_gene_hit_locus.csv
Initial number of rows: 8388
Columns: ['Unnamed: 0', 'fasta_id', 'csv_row_index', 'csv_row_number', 'sequence_alignment_aa_light', 'generated_sequence_light', 'input_heavy_sequence', 'BLOSUM_score', 'similarity', 'perplexity', 'calculated_blosum', 'calculated_similarity', 'NT_Trimmed', 'Best_Gene', 'Best_Bit_Score', 'Best_E_Value', 'Locus', 'pairing_scores']
Rows after removing missing data: 8388 (removed 0)
Rows after removing duplicates: 8388 (removed 0 duplicates)
Final rows for processing: 8388
High pairing scores (>=0.5): 3712 sequences
Low pairing scores (<0.5): 4676 sequences
Created: high_pairing_scores_immunomatch_bert2gpt_trimmed.fasta
Created: low_pairing_scores_immunomatch_bert2gpt_trimmed.fasta

=== SUMMARY ===
Total unique sequences processed: 8388
High pairing scores (>=0.5): 3712 se