In [8]:
from Bio import SeqIO
import csv

In [9]:
# FASTA file that is parsed record by record.
input_file = 'transcriptome.pep'
# CSV file that receives one row per parsed record.
output_file = 'proteome.csv'

In [10]:
# Convert every FASTA record in input_file into one row of output_file.
# Each row carries fields sliced out of the record's description line plus
# the sequence itself and its length.
with open(output_file, 'w', newline='') as csvfile:
    fieldnames = [
        'id', 'orf_type', 'orientation', 'orf_score', 'sequence', 'length',
        'blastp', 'blastp_e-value', 'pfam', 'pfam_e-value',
    ]

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for record in SeqIO.parse(input_file, 'fasta'):
        # The description is tokenised on single spaces; all positional
        # lookups below index into this token list.
        header_parts = record.description.split(' ')

        # Keep only the last two underscore-separated pieces of the record
        # id (e.g. g4311_i1.p1). This must be a plain string: wrapping it in
        # a list makes csv write the list repr "['g4311_i1.p1']" instead.
        sequence_id = '_'.join(record.id.split('_')[-2:])

        # Token 4 has the form "<key>:<orf_type>"; token 5 has the form
        # "<orientation>,<key>=<orf_score>[,...]".
        orf_type = header_parts[4].split(':')[1]
        strand_and_score = header_parts[5].split(',')
        orientation = strand_and_score[0]
        orf_score = strand_and_score[1].split('=')[1]

        sequence = str(record.seq)
        length = len(sequence)

        # Annotation columns stay empty unless the marker token is present.
        blastp = ""
        blastp_evalue = ""
        pfam = ""
        pfam_evalue = ""

        # NOTE(review): these are exact-token membership tests, so they only
        # match when 'sp|' / 'pfam' appear as standalone space-separated
        # tokens in the description -- confirm against the real header format.
        if 'sp|' in header_parts:
            sp_pos = header_parts.index('sp|')
            blastp = header_parts[sp_pos + 1]
            blastp_evalue = header_parts[sp_pos + 3][:-1]  # Remove comma

        if 'pfam' in header_parts:
            pfam_pos = header_parts.index('pfam')
            pfam = header_parts[pfam_pos + 1][:-1]  # Remove comma
            pfam_evalue = header_parts[pfam_pos + 3]

        writer.writerow({
            'id': sequence_id,
            'orf_type': orf_type,
            'orientation': orientation,
            'orf_score': orf_score,
            'sequence': sequence,
            'length': length,
            'blastp': blastp,
            'blastp_e-value': blastp_evalue,
            'pfam': pfam,
            'pfam_e-value': pfam_evalue,
        })