In [None]:
import os
import random
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import pandas as pd

class BenchmarkDatasetExtractor:
    """Build benchmark FASTA subsets from a single FASTA file.

    All records are parsed eagerly in the constructor and kept in
    ``self.sequences``; every other method works on that in-memory list.
    """

    def __init__(self, fasta_file):
        """Parse *fasta_file* and keep its records in memory.

        Args:
            fasta_file: Source handed to ``SeqIO.parse`` with the "fasta" format.
        """
        self.fasta_file = fasta_file
        self.sequences = list(SeqIO.parse(fasta_file, "fasta"))

    def analyze_dataset(self):
        """Analyze the dataset characteristics.

        Returns:
            dict with the keys ``total_sequences``, ``sequence_lengths``
            (one length per record, in file order), ``organisms`` and
            ``protein_families`` (sets of labels detected by keyword in the
            record description), ``avg_length``, ``min_length`` and
            ``max_length``. For an empty dataset the three length summaries
            are reported as 0 instead of raising.
        """
        lengths = [len(record.seq) for record in self.sequences]
        organisms = set()
        protein_families = set()

        for record in self.sequences:
            # Classification is a case-insensitive keyword match on the description
            desc = record.description.lower()
            # Organism and protein family are independent labels, so both
            # checks must run for every record (no elif between them).
            if 'pseudomonas' in desc:
                organisms.add('Pseudomonas')
            if 'keratin' in desc:
                protein_families.add('Keratin')
            # Add more classification logic as needed

        stats = {
            'total_sequences': len(lengths),
            'sequence_lengths': lengths,
            'organisms': organisms,
            'protein_families': protein_families,
        }

        if lengths:
            stats['avg_length'] = sum(lengths) / len(lengths)
            stats['min_length'] = min(lengths)
            stats['max_length'] = max(lengths)
        else:
            # Empty input: avoid ZeroDivisionError / ValueError from min()/max()
            stats['avg_length'] = 0.0
            stats['min_length'] = 0
            stats['max_length'] = 0

        return stats

    def create_benchmark_subsets(self, seed=None):
        """Create different benchmark subsets.

        Args:
            seed: Optional seed for the random subsets. When ``None`` (the
                default) the module-level ``random`` generator is used, so
                results vary between calls unless the caller seeded it.

        Returns:
            dict mapping subset name to a list of records. Subsets may be
            empty when no record matches their filter.
        """
        rng = random if seed is None else random.Random(seed)
        records = self.sequences

        # 1. Najna18-like subset (18 sequences)
        najna18 = rng.sample(records, min(18, len(records)))

        # 2. Enzymes30-like subset (30 sequences)
        enzymes30 = rng.sample(records, min(30, len(records)))

        # 3. Keratin100-like subset (all keratin sequences, capped at 100)
        keratin100 = [rec for rec in records if 'keratin' in rec.description.lower()]
        if len(keratin100) > 100:
            keratin100 = rng.sample(keratin100, 100)

        # 4. BAliBASE-style subsets: first 20 records (file order) per length band
        easy_subset = [rec for rec in records if 100 <= len(rec.seq) <= 200][:20]
        medium_subset = [rec for rec in records if 200 < len(rec.seq) <= 400][:20]
        hard_subset = [rec for rec in records if len(rec.seq) > 400][:20]

        return {
            'Najna18': najna18,
            'Enzymes30': enzymes30,
            'Keratin100': keratin100,
            'BAliBASE_Easy': easy_subset,
            'BAliBASE_Medium': medium_subset,
            'BAliBASE_Hard': hard_subset,
        }

    def save_benchmark_datasets(self, output_dir="benchmark_datasets", seed=None):
        """Save all benchmark datasets to files.

        Writes ``dataset_statistics.csv``, one ``<subset>.fasta`` per
        non-empty subset and ``Full_Dataset.fasta`` into *output_dir*
        (created if missing), then prints a short summary.

        Args:
            output_dir: Directory that receives the output files.
            seed: Optional seed forwarded to ``create_benchmark_subsets``.

        Returns:
            The dict of subsets produced by ``create_benchmark_subsets``.
        """
        os.makedirs(output_dir, exist_ok=True)

        stats = self.analyze_dataset()
        subsets = self.create_benchmark_subsets(seed=seed)

        # Save dataset statistics (single-row CSV, one column per stats key)
        stats_df = pd.DataFrame([stats])
        stats_df.to_csv(os.path.join(output_dir, "dataset_statistics.csv"), index=False)

        # Save individual benchmark files
        for subset_name, records in subsets.items():
            if not records:  # Only save non-empty subsets
                continue
            filename = os.path.join(output_dir, f"{subset_name}.fasta")
            with open(filename, "w") as f:
                SeqIO.write(records, f, "fasta")
            print(f"Saved {subset_name}: {len(records)} sequences")

        # Save the full dataset
        full_filename = os.path.join(output_dir, "Full_Dataset.fasta")
        with open(full_filename, "w") as f:
            SeqIO.write(self.sequences, f, "fasta")

        print("\nDataset Analysis:")
        print(f"Total sequences: {stats['total_sequences']}")
        print(f"Average length: {stats['avg_length']:.2f}")
        print(f"Length range: {stats['min_length']} - {stats['max_length']}")
        # Sorted so the summary is deterministic (set order is arbitrary)
        print(f"Organisms: {sorted(stats['organisms'])}")
        print(f"Protein families: {sorted(stats['protein_families'])}")

        return subsets

# Usage: run the full extraction on the keratin FASTA file when executed directly
if __name__ == "__main__":
    input_fasta = "keratin_sequences.fasta"
    # Parses the FASTA file up front; records are held in memory by the extractor
    extractor = BenchmarkDatasetExtractor(input_fasta)
    # Writes the statistics CSV and FASTA subsets to the default output directory
    datasets = extractor.save_benchmark_datasets()