## Load original data

In [None]:
# Input files read by the loading loop; both are parsed into the same `data` list.
dataset_paths = [
    'data/original/protein-secondary-structure.train',
    'data/original/protein-secondary-structure.test',
]


def process_line(data, protein_dict, line, min_length=20):
    """Consume one line of a protein secondary-structure file.

    The parser is a small state machine driven one line at a time: the
    caller passes in the protein currently being built and receives the
    protein to use for the next line.

    Args:
        data: List that finished proteins are appended to (mutated in place).
        protein_dict: The protein being built, as a dict with 'sequence' and
            'labels' strings, or None when no protein is open.
        line: Raw line from the file (surrounding whitespace is ignored).
        min_length: Minimum number of amino acids a protein needs in order
            to be kept. Shorter proteins are discarded.

    Returns:
        The protein dict to pass in with the next line: the same dict for
        blank/comment/residue lines, a fresh empty dict after a '<>' marker,
        or None after an 'end' marker.

    Raises:
        ValueError: If a residue line does not split into exactly two
            whitespace-separated tokens.
        TypeError: If a residue line is seen while protein_dict is None.
    """
    line = line.strip()

    # Blank lines and '#' comments carry no data.
    if not line or line.startswith('#'):
        return protein_dict

    # 'end' closes the current protein; '<>' opens a new one and implicitly
    # closes any protein that is still open. 'end' takes precedence.
    is_end = 'end' in line
    is_start = line.startswith('<>')
    if is_end or is_start:
        # The None guard matters for both markers: an 'end' that is not
        # preceded by '<>' (or a repeated 'end') has nothing to flush.
        if protein_dict is not None and len(protein_dict['sequence']) >= min_length:
            data.append(protein_dict)
        return None if is_end else {'sequence': '', 'labels': ''}

    # Residue line: "<amino acid> <label>".
    amino_acid, label = line.split()
    protein_dict['sequence'] += amino_acid
    protein_dict['labels'] += label
    return protein_dict


# All proteins kept from every dataset file, as {'sequence', 'labels'} dicts.
data = []

for dataset_path in dataset_paths:
    with open(dataset_path, 'r') as file:
        # Parser state is reset per file so a protein never spans two files.
        protein_dict = None
        for line in file:
            protein_dict = process_line(data, protein_dict, line)

    # A file may stop without a closing 'end' marker; feed a synthetic one so
    # the protein still open at EOF is not silently dropped.
    if protein_dict is not None:
        protein_dict = process_line(data, protein_dict, 'end')

## Save sequences and labels in JSON format

In [24]:
import json
# Serialize every parsed protein to one pretty-printed UTF-8 JSON file.
with open('data/data.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4))

## Save sequences in FASTA format

In [25]:
import re

# Write one FASTA record per protein, with the sequence wrapped at 80 columns.
with open('data/data.fasta', 'w') as f:
    for i, item in enumerate(data):
        residues = item['sequence']
        # Slice into 80-character rows and join them with newlines. Unlike a
        # regex that appends '\n' after every full row, this leaves no trailing
        # newline when the length is an exact multiple of 80, so the record
        # never contains a blank line.
        sequence = '\n'.join(
            residues[start:start + 80] for start in range(0, len(residues), 80)
        )
        # Record names are 1-based: SEQUENCE_1, SEQUENCE_2, ...
        f.write(f'>SEQUENCE_{i + 1}\n{sequence}\n')

## Parse output from JPred4

In [37]:
from pathlib import Path

# Walk every per-job output directory under data/jpred4.
jpred4_dir = Path('data/jpred4')
for output_dir in jpred4_dir.iterdir():
    # Skip stray files (e.g. OS metadata); only directories can be listed.
    if not output_dir.is_dir():
        continue

    # Stem of the first '*.name' file in the directory. Raises StopIteration
    # if the directory contains no such file.
    seq_name = next(f.stem for f in output_dir.iterdir() if f.suffix == '.name')

    # Files inside the directory are prefixed with the directory's own name.
    dir_name = output_dir.name
    preds_path = output_dir.joinpath(f'{dir_name}.concise')

    # Delete the full MSA file (presumably to save disk space — confirm).
    # The existence check keeps the cell re-runnable after the file is gone.
    big_file = output_dir.joinpath(f'{dir_name}.full_MSA.fasta')
    if big_file.exists():
        big_file.unlink()