In [1]:
import csv
import re

In [2]:
def _parse_fasta_header(header):
    """Split one FASTA header line into the seven metadata fields of a CSV row.

    The leading ``>`` is dropped and the remainder is split on ``_``. The
    first six fields map, in order, to BType, light_locus, v_gene_heavy,
    v_gene_light, j_gene_heavy and j_gene_light. Everything after the sixth
    underscore is kept together (inner underscores included) as
    heavy_sep_light; it is an empty string when the header has exactly six
    fields.

    Args:
        header: Header line including its leading ``>``.

    Returns:
        A list of seven strings in the column order described above.

    Raises:
        IndexError: If the header has fewer than six ``_``-separated fields.
    """
    # maxsplit=6 leaves any underscores inside the trailing field untouched,
    # which is equivalent to re-joining parts[6:] with '_'.
    parts = header[1:].split('_', 6)
    if len(parts) < 6:
        raise IndexError(
            f"FASTA header {header!r} has {len(parts)} '_'-separated "
            "field(s); at least 6 are required"
        )
    if len(parts) == 6:
        # Nothing follows the sixth field: heavy_sep_light is empty.
        parts.append('')
    return parts


def _iter_fasta_records(lines):
    """Yield ``(header, sequence)`` pairs from an iterable of FASTA lines.

    Each line is stripped of surrounding whitespace. Lines following a header
    are concatenated into that entry's sequence. Entries whose sequence is
    empty, and any lines that appear before the first header, are skipped.
    """
    header = ""
    chunks = []
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith('>'):
            sequence = "".join(chunks)
            if header and sequence:
                yield header, sequence
            header = line
            chunks = []
        else:
            chunks.append(line)

    # The last entry has no following header to trigger its emission.
    sequence = "".join(chunks)
    if header and sequence:
        yield header, sequence


def fasta_to_csv(input_fasta, output_full_csv, output_heavy_sep_light_csv):
    """Convert a FASTA file with ``_``-delimited headers into two CSV files.

    The input is parsed completely before either output file is opened, so a
    malformed header aborts the conversion without touching the outputs.

    Args:
        input_fasta: Path of the FASTA file to read.
        output_full_csv: Path of the CSV that receives one row per entry with
            the columns BType, light_locus, v_gene_heavy, v_gene_light,
            j_gene_heavy, j_gene_light, heavy_sep_light and light_seq (the
            entry's concatenated sequence lines).
        output_heavy_sep_light_csv: Path of the CSV that receives only the
            heavy_sep_light column, in the same row order.

    Raises:
        IndexError: If an entry's header has fewer than six ``_``-separated
            fields.

    Entries without any sequence lines are skipped. A two-line summary of the
    number of converted rows is printed on completion.
    """
    headers_full = ["BType", "light_locus", "v_gene_heavy", "v_gene_light",
                    "j_gene_heavy", "j_gene_light", "heavy_sep_light", "light_seq"]
    headers_sep = ["heavy_sep_light"]

    # Read the FASTA file: seven header fields plus the sequence per entry
    with open(input_fasta, 'r') as fasta_file:
        csv_rows_full = [
            _parse_fasta_header(header) + [sequence]
            for header, sequence in _iter_fasta_records(fasta_file)
        ]

    # heavy_sep_light is the seventh column of every full row
    sep_index = headers_full.index("heavy_sep_light")
    csv_rows_sep = [[row[sep_index]] for row in csv_rows_full]

    # Write to full CSV
    with open(output_full_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers_full)
        writer.writerows(csv_rows_full)

    # Write to heavy_sep_light only CSV
    with open(output_heavy_sep_light_csv, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers_sep)
        writer.writerows(csv_rows_sep)

    print(f"Converted {len(csv_rows_full)} sequences from FASTA to CSV.")
    print(f"Created heavy_sep_light CSV with {len(csv_rows_sep)} rows.")


In [3]:
# Value substituted into the dataset directory and the split file names
# (presumably the clustering percent-identity threshold — TODO confirm).
pident = 90

# Directory holding the split files. It interpolates `pident` so that the
# directory and the `{pident}`-based file names cannot drift apart.
input_dir = f"/ibmm_data2/oas_database/paired_lea_tmp/paired_model/check_pairings_gen_sequences/clustered_light_seqs_datasets/full_paired_oas_no_dupl_light_seqs_{pident}_clu_rep_70_alloc/"



In [4]:
# File names of the three dataset splits; each one is read as FASTA input.
split_files = (
    f"full_paired_oas_no_dupl_light_seqs_{pident}_clu_rep_70_alloc_train.txt",
    f"full_paired_oas_no_dupl_light_seqs_{pident}_clu_rep_70_alloc_test.txt",
    f"full_paired_oas_no_dupl_light_seqs_{pident}_clu_rep_70_alloc_val.txt",
)

for file in split_files:
    # Both CSV outputs are written next to the input, with '.txt' swapped
    # for a suffix that names the kind of CSV.
    input_fasta = input_dir + file
    output_full_csv = input_dir + file.replace('.txt', '_full.csv')
    output_heavy_sep_light_csv = input_dir + file.replace('.txt', '_heavy_sep_light.csv')

    fasta_to_csv(
        input_fasta,
        output_full_csv,
        output_heavy_sep_light_csv,
    )

Converted 396984 sequences from FASTA to CSV.
Created heavy_sep_light CSV with 396984 rows.
Converted 49623 sequences from FASTA to CSV.
Created heavy_sep_light CSV with 49623 rows.
Converted 49623 sequences from FASTA to CSV.
Created heavy_sep_light CSV with 49623 rows.
