In [None]:
import pandas as pd
import os
from collections import Counter
import numpy as np
import pickle

## Load the data

In [None]:
# Read the tab-separated term table that sits next to the training sequences.
# Later cells rely on its `EntryID` and `term` columns.
train_terms_path = os.path.join('..', 'data', 'Train', 'train_terms.tsv')
train_terms_df = pd.read_csv(
    train_terms_path,
    sep='\t',
)
display(train_terms_df.head(5))

In [None]:
def parse_fasta(file_path):
    """Read a FASTA file into a dict mapping header text to sequence.

    A line starting with '>' opens a new record. The record's key is the
    whole header line with the leading '>' (and surrounding whitespace)
    removed -- no attempt is made to split an identifier from a description.
    Every following non-header line is stripped and appended to that record's
    sequence, so multi-line sequences are joined into one string.

    Lines that appear before the first header are ignored. If the same header
    occurs more than once, the later record replaces the earlier one.

    Parameters
    ----------
    file_path : str or path-like
        Location of the FASTA file; it is opened in text mode.

    Returns
    -------
    dict
        ``{header: sequence}`` in order of first appearance of each header.
    """
    chunks_by_header = {}
    # None means "no header seen yet". An explicit None check (rather than a
    # truthiness check) keeps records whose header is an empty string.
    current_chunks = None
    with open(file_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines contribute nothing to a sequence.
                continue
            if line.startswith('>'):
                current_chunks = []
                chunks_by_header[line[1:]] = current_chunks
            elif current_chunks is not None:
                current_chunks.append(line)
    # Join once per record instead of growing a string line by line.
    return {header: ''.join(chunks) for header, chunks in chunks_by_header.items()}

# Parse the training FASTA into a {header: sequence} dict, then tabulate it
# with one row per record.
train_sequences_path = os.path.join('..', 'data', 'Train', 'train_sequences.fasta')
train_sequences = parse_fasta(train_sequences_path)
# NOTE(review): parse_fasta uses the entire header line (minus '>') as the key.
# If the headers carry anything beyond the bare identifier, these 'EntryID'
# values will not match the ones in the terms table -- confirm against the data.
train_sequences_df = pd.DataFrame(
    list(train_sequences.items()),
    columns=['EntryID', 'Sequence'],
)
display(train_sequences_df.head(5))

## Merge dataframes

In [None]:
# Inner join on the shared key: only EntryIDs present in both tables survive,
# and each term row is paired with the sequence of its EntryID.
merged_df = train_sequences_df.merge(
    train_terms_df,
    how='inner',
    on='EntryID',
)
display(merged_df.head(5))

## Feature Engineering: k-mers

In [None]:
def get_kmers(sequence, k=3):
    """Return every overlapping length-``k`` window of ``sequence``.

    Windows are taken at consecutive start positions (stride 1), in order.

    Parameters
    ----------
    sequence : str
        The sequence to slice (any object supporting ``len`` and slicing works).
    k : int, default 3
        Window length; must be at least 1.

    Returns
    -------
    list
        ``len(sequence) - k + 1`` windows, or an empty list when the sequence
        is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Without this check ``k=0`` would silently
        yield a list of empty strings and a negative ``k`` arbitrary slices.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    window_count = len(sequence) - k + 1
    # range() over a non-positive count is empty, which covers short inputs.
    return [sequence[start:start + k] for start in range(window_count)]

# merged_df can hold many rows for the same protein (one per joined term row),
# so slice each distinct sequence only once and map the result back onto the
# rows instead of recomputing identical k-mer lists row by row.
# Rows with the same sequence therefore share one list object; treat the
# 'kmers' lists as read-only.
kmers_by_sequence = {
    sequence: get_kmers(sequence)
    for sequence in merged_df['Sequence'].unique()
}
merged_df['kmers'] = merged_df['Sequence'].map(kmers_by_sequence)
display(merged_df.head())

## Create labels

In [None]:
# Collapse the per-row terms into one list of terms per EntryID; reset_index
# turns the group key back into an ordinary 'EntryID' column.
labels_df = (
    merged_df
    .groupby('EntryID')['term']
    .apply(list)
    .reset_index()
)
display(labels_df.head(5))

## Combine features and labels and save

In [None]:
# Reduce the feature table to one row per EntryID *before* attaching the label
# lists. Joining first and de-duplicating afterwards gives the same frame but
# materialises a label list on every per-term row along the way.
# NOTE(review): any other per-term columns inherited from the terms table keep
# only the value from the first row of each EntryID -- confirm that is intended.
features_df = (
    merged_df
    .drop(columns=['term'])
    .drop_duplicates(subset=['EntryID'])
)
processed_df = pd.merge(features_df, labels_df, on='EntryID').reset_index(drop=True)
display(processed_df.head())

# Persist the combined features and labels as a pickle for later notebooks.
output_path = os.path.join('..', 'data', 'processed_data.pkl')
with open(output_path, 'wb') as f:
    pickle.dump(processed_df, f)
print(f"Processed data saved to {output_path}")