In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q ankh

In [None]:
import os
# The environment variables below are assigned before `import torch`, so they are
# already in place when the CUDA runtime gets initialised.
# cuBLAS workspace setting; PyTorch's reproducibility notes list this value as one of
# the options required for deterministic cuBLAS behaviour.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Make CUDA kernel launches synchronous (clearer error locations, slower execution).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict
import random

# Single seed shared by every random number generator seeded below.
seed = 7

# Seed PyTorch, NumPy's legacy global RNG and Python's `random` module.
# NOTE(review): torch.use_deterministic_algorithms(...) is not called in the visible
# cells — confirm whether fully deterministic GPU kernels are intended.
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

import ankh

from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

from transformers import Trainer, TrainingArguments, EvalPrediction
from datasets import load_dataset

# from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy import stats
from functools import partial
import pandas as pd
from tqdm.auto import tqdm


# Load Model and Data

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Available device:', device)
def check_len(some_list):
    """Print ``len(item)`` for every item of ``some_list``, one value per line."""
    for item_length in map(len, some_list):
        print(item_length)


def get_num_params(model):
    """Return the total element count over everything yielded by ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


In [None]:
# Load the Ankh "large" model; the loader returns the model together with its tokenizer.
model, tokenizer = ankh.load_large_model()
# The model is only used to produce embeddings here, so put it in evaluation mode.
model.eval()
# Move the weights to the device selected earlier in the notebook.
model.to(device=device)
print(f"Number of parameters:", get_num_params(model))


In [None]:
def extract_sequence_id(index_name):
    """
    Pull the sequence ID out of a window identifier.

    Parameters:
    -----------
    index_name : str
        Identifier such as 'sequence_0_window_1'.

    Returns:
    --------
    str or None
        The captured digits when the text contains 'sequence_<digits>_window_<digits>'.
        Otherwise, if the text splits on '_' into at least four parts and the first
        part is 'sequence', the second part is returned. None in every other case.
    """
    found = re.search(r'sequence_(\d+)_window_\d+', index_name)
    if found is not None:
        return found.group(1)

    # Looser fallback for identifiers whose ID or window part is not purely numeric.
    pieces = index_name.split('_')
    looks_like_window_id = len(pieces) >= 4 and pieces[0] == 'sequence'
    return pieces[1] if looks_like_window_id else None

def sequence_based_split(df, val_size=0.2, random_state=42):
    """
    Split a windowed dataframe into train and validation parts by sequence ID.

    Every window of a given sequence lands in the same split, so no sequence is
    shared between train and validation.

    Parameters:
    -----------
    df : pandas.DataFrame
        Must have an 'id' column of strings that `extract_sequence_id` can parse
        (the value is lower-cased before parsing).
    val_size : float
        Passed to `train_test_split` as `test_size`: the share of unique sequences
        (not windows) assigned to the validation split.
    random_state : int
        Random seed forwarded to `train_test_split`.

    Returns:
    --------
    tuple
        (train_df, val_df, split_info). The frames keep the columns and index of
        `df`. `split_info` is a dict with the sorted sequence IDs of each split and
        the window counts.

    Raises:
    -------
    ValueError
        If `df` has no rows, or if an 'id' value cannot be parsed into a sequence ID.
    """
    # Extract one sequence ID per row from the 'id' column
    sequence_ids = []
    for idx in df['id']:
        seq_id = extract_sequence_id(str.lower(idx))
        if seq_id is None:
            raise ValueError(f"Could not extract sequence ID from index: {idx}")
        sequence_ids.append(seq_id)

    if not sequence_ids:
        raise ValueError("Cannot split an empty dataframe: no sequence IDs found")

    # Work on a copy so the caller's frame never gains the helper column
    df_temp = df.copy()
    df_temp['sequence_id'] = sequence_ids

    # Sort the unique IDs: iterating a set of strings follows hash order, which
    # changes between interpreter runs, so without sorting the same random_state
    # would produce a different split each time the kernel restarts.
    unique_sequences = sorted(set(sequence_ids))
    print(f"Total unique sequences: {len(unique_sequences)}")

    # Count windows per sequence
    sequence_counts = defaultdict(int)
    for seq_id in sequence_ids:
        sequence_counts[seq_id] += 1

    print(f"Windows per sequence: min={min(sequence_counts.values())}, "
          f"max={max(sequence_counts.values())}, "
          f"mean={np.mean(list(sequence_counts.values())):.1f}")

    # Split the sequence IDs (not the rows) into train and validation groups
    train_sequences, val_sequences = train_test_split(
        unique_sequences,
        test_size=val_size,
        random_state=random_state
    )

    # Select rows by sequence membership and drop the helper column again
    train_df = df_temp[df_temp['sequence_id'].isin(train_sequences)].drop('sequence_id', axis=1)
    val_df = df_temp[df_temp['sequence_id'].isin(val_sequences)].drop('sequence_id', axis=1)

    # Summary of the split
    split_info = {
        'train_sequences': sorted(train_sequences),
        'val_sequences': sorted(val_sequences),
        'train_windows': len(train_df),
        'val_windows': len(val_df),
        'total_windows': len(df)
    }

    print(f"Split results:")
    print(f"  Train: {len(train_sequences)} sequences, {len(train_df)} windows")
    print(f"  Val:   {len(val_sequences)} sequences, {len(val_df)} windows")

    return train_df, val_df, split_info

def verify_no_leakage(train_df, val_df):
    """
    Check that no sequence ID occurs in both the train and the validation split.

    Sequence IDs are parsed with `extract_sequence_id` from the 'id' column, the
    same column `sequence_based_split` reads. The dataframe index is used only
    when a frame has no 'id' column. Values that cannot be parsed are ignored
    for the comparison, and a warning is printed when that happens.

    Parameters:
    -----------
    train_df, val_df : pandas.DataFrame
        The split dataframes.

    Returns:
    --------
    bool
        True if no sequence ID is shared by the two splits, False otherwise.
    """
    def _collect_sequence_ids(frame):
        # The index only holds row labels after a split, so prefer the 'id' column.
        source = frame['id'] if 'id' in frame.columns else frame.index
        parsed = {extract_sequence_id(str(value).lower()) for value in source}
        had_unparsed = None in parsed
        parsed.discard(None)  # None means "could not parse", not a real sequence ID
        return parsed, had_unparsed

    train_seqs, train_unparsed = _collect_sequence_ids(train_df)
    val_seqs, val_unparsed = _collect_sequence_ids(val_df)

    if train_unparsed or val_unparsed:
        print("⚠️ Some identifiers could not be parsed into a sequence ID and were ignored.")

    # An empty overlap is the success case, so it must not be indexed
    train_val_overlap = sorted(train_seqs & val_seqs)

    if train_val_overlap:
        print("❌ DATA LEAKAGE DETECTED!")
        print(f"  Train-Val overlap: {train_val_overlap}")
        return False

    print("✅ No data leakage detected. All sequences are properly separated.")
    return True

# Example usage and demonstration
def split_dataset_process(df):
    """Run the sequence-based split on ``df`` (25% validation, seed 42) and run the leakage check.

    Returns the (train_df, val_df, split_info) triple produced by
    `sequence_based_split`. The result of the leakage check is printed, not returned.
    """
    print("=== Regular Sequence-Based Split Process ===")
    split_result = sequence_based_split(df, val_size=0.25, random_state=42)
    train_part, val_part, info = split_result

    # Blank lines frame the verification message in the cell output
    print()
    verify_no_leakage(train_part, val_part)
    print()

    return train_part, val_part, info



In [None]:
# df_pad = pd.read_csv('/kaggle/input/preprocessed-v4/PS4_108cut_Ankh.csv')
# # df_pad = pd.read_csv('/kaggle/input/preprocessed-v4/PS4_108cut.csv')
# print("=== PS4 Dataset ===")
# print(df_pad.info())
# df_pad['len_seq_aa'] = df_pad['aa_sequence'].apply(lambda x : len(str(x)))
# df_pad['len_ssp8'] = df_pad['ssp_sequence'].apply(lambda x : len(str(x)))
# print(df_pad.describe())

# train_df, val_df, split_info = split_dataset_process(df_pad)
# print("=== Training PS4 Dataset ===")
# print(train_df.info())
# print("=== Validation PS4 Dataset ===")
# print(val_df.info())

# df_cb433 = pd.read_csv('/kaggle/input/preprocessed-v4/CB433_108cut_Ankh.csv')
# # df_cb433 = pd.read_csv('/kaggle/input/preprocessed-v4/CB433_108cut.csv')
# print("=== Test CB433 Dataset ===")
# print(df_cb433.info())
# df_cb433['len_seq_aa'] = df_cb433['aa_sequence'].apply(lambda x : len(str(x)))
# df_cb433['len_ssp8'] = df_cb433['ssp_sequence'].apply(lambda x : len(str(x)))
# print(df_cb433.describe())

In [None]:
def wrong_eos(df):
    """Repair truncated end markers in 'aa_sequence' and drop rows labelled only with 'X'.

    A sequence whose last character is 's', '/' or '<' loses its final 3, 2 or 1
    characters respectively and gets '<eos>' appended instead. Rows whose
    'ssp_sequence' consists solely of 'X' characters are then removed and the
    index is renumbered. The input frame is left untouched; a new frame is returned.
    """
    fixed = df.copy()  # never write into the caller's frame

    # Last character of every sequence, read once before anything is rewritten
    last_char = fixed['aa_sequence'].str[-1]

    # (trailing character, number of characters to strip before appending '<eos>')
    for tail, strip_count in (('s', 3), ('/', 2), ('<', 1)):
        rows = last_char == tail
        fixed.loc[rows, 'aa_sequence'] = fixed.loc[rows, 'aa_sequence'].str[:-strip_count] + '<eos>'

    # Remove rows whose label string is nothing but 'X'
    only_x = fixed['ssp_sequence'].str.fullmatch(r'^X+$')
    return fixed[~only_x].reset_index(drop=True)

In [None]:
# Load the training split and add two helper columns holding the character length of
# the stringified amino-acid sequence and of the stringified label sequence.
train_df =  pd.read_csv('/kaggle/input/preprocess-f/Train_PS4_Ankh.csv')
train_df['len_seq_aa'] = train_df['aa_sequence'].apply(lambda x : len(str(x)))
train_df['len_ssp8'] = train_df['ssp_sequence'].apply(lambda x : len(str(x)))
print("=== Training PS4 Dataset ===")
print(train_df.info())
# Same loading and length columns for the validation split.
val_df =  pd.read_csv('/kaggle/input/preprocess-f/Val_PS4_Ankh.csv')
val_df['len_seq_aa'] = val_df['aa_sequence'].apply(lambda x : len(str(x)))
val_df['len_ssp8'] = val_df['ssp_sequence'].apply(lambda x : len(str(x)))
print("=== Validation PS4 Dataset ===")
print(val_df.info())

# CB433 test set: info() is printed before the length columns are added, describe() after.
df_cb433 = pd.read_csv('/kaggle/input/preprocess-f/CB433_Ankh.csv')
# df_cb433 = pd.read_csv('/kaggle/input/preprocessed-v4/CB433_108cut.csv')
print("=== Test CB433 Dataset ===")
print(df_cb433.info())
df_cb433['len_seq_aa'] = df_cb433['aa_sequence'].apply(lambda x : len(str(x)))
df_cb433['len_ssp8'] = df_cb433['ssp_sequence'].apply(lambda x : len(str(x)))
print(df_cb433.describe())

In [None]:
# Replace every 'J' in the amino-acid sequences of all three frames with 'X',
# i.e. treat residue code J as unknown.
train_df['aa_sequence'] = train_df['aa_sequence'].str.replace('J', 'X')
val_df['aa_sequence'] = val_df['aa_sequence'].str.replace('J', 'X')
df_cb433['aa_sequence']= df_cb433['aa_sequence'].str.replace('J', 'X')

In [None]:
# val_df = pd.read_csv('/kaggle/input/preprocessed-v4/Val_PS4.csv')
# Frames to embed, in processing order. train_df is currently commented out, so only
# the validation frame and the CB433 test frame are in the list.
# NOTE(review): the embedding loop pairs this list positionally with `label_dfs`;
# the two lists must stay in the same order and have the same length.
dfs = [
# train_df,
       val_df,
df_cb433]
# dfs = [val_df, df_cb433]

# train_df.to_csv('Train_PS4.csv')
# val_df.to_csv('Val_PS4.csv')

# Processing Dataset

In [None]:
def tokenize_sequence(sequence: str):
    """
    Split a sequence string into tokens, keeping special tokens intact.

    The literal markers "</s>" and "<pad>" each become a single token; every
    other character becomes its own token.

    A marker is recognised only when it appears in full at the current position.
    A truncated marker at the end of the string (e.g. "</" or "<pa") is emitted
    character by character, never as a partial multi-character token.

    Args:
        sequence: Text to tokenize.

    Returns:
        List of string tokens in input order.
    """
    special_tokens = ("</s>", "<pad>")
    result = []
    i = 0
    n = len(sequence)

    while i < n:
        for token in special_tokens:
            # startswith(token, i) is an exact comparison at position i. A substring
            # test such as `piece in token` would also accept fragments of the marker.
            if sequence.startswith(token, i):
                result.append(token)
                i += len(token)
                break
        else:
            # No special token starts here: emit the single character
            result.append(sequence[i])
            i += 1

    return result
    
def preprocessed(df_col):
    """Tokenize each sequence of ``df_col`` with `tokenize_sequence`; return the token lists in order."""
    return [tokenize_sequence(raw_sequence) for raw_sequence in df_col]
    
def preprocess_dataset(sequences, labels, 
                       # disorder, 
                       max_length=None, st = True):
    """
    Turn raw sequence and label strings into parallel lists of per-position items.

    Args:
        sequences: Iterable of input sequences.
        labels: Iterable of label strings, one per sequence.
        max_length: Maximum number of items kept per sequence and per label list.
            When None, the length of the longest processed sequence is used
            (0 when there are no sequences).
        st: When True, whitespace is stripped from each sequence and it is split
            into single characters. When False, sequences are tokenized with
            `preprocessed`, which keeps special tokens as single items.

    Returns:
        (seqs, labels): two lists of lists, truncated to `max_length`.

    Raises:
        AssertionError: if the number of sequences and labels differ.
    """
    if st:
        sequences = ["".join(str(seq).split()) for seq in sequences]
    else:
        sequences = preprocessed(sequences)

    if max_length is None:
        # default=0 keeps an empty chunk from raising inside max()
        max_length = max((len(seq) for seq in sequences), default=0)

    seqs = [list(seq)[:max_length] for seq in sequences]

    # Labels: strip whitespace, split into characters, truncate like the inputs
    labels = [list("".join(str(label).split()))[:max_length] for label in labels]

    assert len(seqs) == len(labels)
    return seqs, labels


def embed_dataset(model, sequences, 
                  # tokenizer, device, 
                  shift_left=0, shift_right=-1, max_length=108, st_status=True):
    """
    Embed pre-tokenized sequences one at a time with `model`.

    Each sample is encoded to exactly `max_length` token IDs (padded or truncated).
    An attention mask is built that is 1 up to and including the last non-padding
    token and 0 afterwards, and that mask is passed to the model so trailing
    padding is not attended to.

    The tokenizer and the target device are NOT parameters: the function reads
    the module-level names `tokenizer` and `device`.

    Args:
        model: Callable accepting `input_ids` and `attention_mask` keyword
            arguments; element 0 of its return value is taken as the embedding.
        sequences: Iterable of samples, each a list of tokens
            (`is_split_into_words=True` is passed to the tokenizer).
        shift_left: Currently unused; kept for interface compatibility.
        shift_right: Currently unused; kept for interface compatibility.
        max_length: Length every sample is padded/truncated to.
        st_status: Forwarded to the tokenizer as `add_special_tokens`.

    Returns:
        List with one tensor per sample, holding element 0 of the model output
        with its leading batch dimension of 1 kept. Tensors are moved to the CPU
        so that embeddings do not accumulate in GPU memory.
    """
    inputs_embedding = []

    # Padding token ID; falls back to 0 when the tokenizer does not define one
    padding_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    with torch.no_grad():
        for sample in tqdm(sequences):
            # Tokenize a single sample to a fixed-length ID tensor of shape (1, max_length)
            ids = tokenizer.batch_encode_plus(
                [sample],
                add_special_tokens=st_status,
                max_length=max_length,
                padding='max_length',
                is_split_into_words=True,
                truncation=True,
                return_tensors="pt"
            )

            input_ids = ids['input_ids'][0].cpu().numpy()

            # Mask: ones up to and including the last non-padding token, zeros after.
            # It stays all zeros when the sample consists only of padding.
            non_padding_positions = np.where(input_ids != padding_token_id)[0]
            adjusted_mask = np.zeros(max_length, dtype=np.int64)
            if len(non_padding_positions) > 0:
                adjusted_mask[:non_padding_positions[-1] + 1] = 1
            attention_mask = torch.from_numpy(adjusted_mask).unsqueeze(0)

            # The mask has to be handed to the model to have any effect
            embedding = model(
                input_ids=ids['input_ids'].to(device),
                attention_mask=attention_mask.to(device),
            )[0]

            # Keep results on the CPU; no graph is attached under torch.no_grad()
            inputs_embedding.append(embedding.cpu())

    return inputs_embedding


# Processing Labels

In [None]:
# Consider each label as a tag for each token
# training_sequences, training_labels = preprocess_dataset(df_pad['aa_sequence'], 
#                                                         df_pad['ssp_sequence'], 
#                                                          st = False
#                                                         )
# unique_tags = set(tag for doc in training_labels for tag in doc)
# unique_tags.remove('X')
# tag2id = {tag: id for id, tag in enumerate(unique_tags)}
# id2tag = {id: tag for tag, id in tag2id.items()}
# del training_sequences, training_labels
# print(tag2id)

{'T': 0, 'S': 1, 'G': 2, 'P': 3, 'L': 4, 'E': 5, 'B': 6, 'H': 7, 'I': 8}

In [None]:

def encode_tags(labels, max_length=108, padding_value = -100):
    """
    Map tag sequences to fixed-length lists of integer IDs.

    Tags are looked up in the module-level `tag2id` mapping; a tag that is not
    in the mapping is encoded as `padding_value`.

    Args:
        labels: Iterable of tag sequences (one inner sequence per document)
        max_length: Length every encoded list is cut or padded to (default: 108)
        padding_value: ID used for unknown tags and for padding (default: -100)

    Returns:
        List of lists of IDs, one per document
    """
    def _encode_one(doc):
        # Encode first, then cut to max_length; shorter lists are padded on the right
        ids = [tag2id.get(tag, padding_value) for tag in doc][:max_length]
        return ids + [padding_value] * (max_length - len(ids))

    return [_encode_one(doc) for doc in labels]

In [None]:

def tensor_to_numpy(tensor):
    """Convert a tensor, an ndarray, a list (possibly holding tensors) or any array-like to a NumPy array.

    A PyTorch tensor is detached and moved to the CPU first. An ndarray is
    returned as-is. In a list, each tensor element is converted individually
    before the list is wrapped in an array. Anything else goes through
    ``np.array``; a failure there is reported on stdout and re-raised.
    """
    if torch.is_tensor(tensor):
        return tensor.detach().cpu().numpy()

    if isinstance(tensor, np.ndarray):
        return tensor

    if isinstance(tensor, list):
        # Only the tensor elements need converting; the rest pass through unchanged
        return np.array([
            element.detach().cpu().numpy() if torch.is_tensor(element) else element
            for element in tensor
        ])

    try:
        return np.array(tensor)
    except Exception as e:
        print(f"Error converting {type(tensor)} to numpy: {e}")
        raise

def memory_efficient_processing(model, df, tag2id, label, chunk_size=1000):
    """
    Embed `df` chunk by chunk and write embeddings and one-hot labels to NPZ files.

    For every slice of `chunk_size` rows the function:
      1. tokenizes the 'aa_sequence' column and splits the 'ssp_sequence' column
         with `preprocess_dataset(..., st=False)`;
      2. embeds the token lists with `embed_dataset(..., st_status=False)` and saves
         them to './embedding_Ankh_result/{label}_sequence_chunk_{NNNN}.npz';
      3. encodes the labels with `encode_tags`, one-hot encodes them with 9 classes,
         zeroes the rows at padding positions, and saves them to
         './embedding_Ankh_result/{label}_labels_chunk_{NNNN}.npz'.

    Both files store their array under the key `embeddings`, together with
    `chunk_start` and `chunk_end` (row offsets into `df`).

    Args:
        model: Passed through to `embed_dataset`.
        df: DataFrame with 'aa_sequence' and 'ssp_sequence' columns.
        tag2id: NOTE(review): not referenced in this body; `encode_tags` is called
            without it, so the argument currently has no effect here.
        label: Prefix used in the output file names (e.g. a split name).
        chunk_size: Number of rows processed and saved per file.

    Returns:
        None. The results are the files written to disk.
    """
    # Ceiling division for the progress bar total
    total_chunks = (len(df) - 1) // chunk_size + 1
    # NOTE(review): `results` is never appended to or returned.
    results = []
    
    with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
        input_column_name = 'aa_sequence'
        labels_column_name = 'ssp_sequence'
        # Label ID marking padding/unknown positions; zeroed out after one-hot encoding below
        padding_value = -100
        # Create output directory (relative to the current working directory)
        output_dir = './embedding_Ankh_result/'
        os.makedirs(output_dir, exist_ok=True)

        for i in range(0, len(df), chunk_size):
            # Get chunk: positional row slice, copied so it is independent of df
            chunk_num = i // chunk_size
            chunk = df.iloc[i:i + chunk_size].copy()
            print(f"Processing chunk {chunk_num + 1}/{(len(df)-1)//chunk_size + 1}")

            # Preprocess this chunk's sequences and labels
            training_sequences = chunk[input_column_name]
            training_labels = chunk[labels_column_name]

            # st=False: sequences go through the special-token-aware tokenizer path
            training_sequences, training_labels = preprocess_dataset(training_sequences, 
                                                        training_labels, 
                                                         st = False
                                                        )
            # Clear memory
            del chunk
            # Compute embeddings; special tokens are not added again by the tokenizer
            training_embeddings = embed_dataset(model, 
                                                training_sequences,
                                                st_status = False)
            # Clear memory
            del training_sequences
            # Save to NPZ
            npz_path = os.path.join(output_dir, f'{label}_sequence_chunk_{chunk_num:04d}.npz')
            
            # Save embeddings plus the row range this chunk covers
            np.savez_compressed(
                npz_path,
                embeddings=tensor_to_numpy(training_embeddings),
                # indices=chunk.index.values,  # Original indices
                chunk_start=i,
                chunk_end=min(i + chunk_size, len(df))
            )
            del training_embeddings
            
            # Encode labels to IDs (encode_tags pads/truncates to its default max_length)
            train_labels_encodings = torch.tensor(encode_tags(training_labels, padding_value = padding_value))
            # clamp(min=0) maps the negative padding ID to class 0 so one_hot accepts it;
            # those positions are zeroed again right below
            train_tensor_enc = F.one_hot(train_labels_encodings.clamp(min=0), num_classes=9)
            # Replace one-hot vectors for padding values with all zeros
            # This creates a mask where True indicates padding positions
            padding_mask = (train_labels_encodings == padding_value).unsqueeze(-1)
            # Zero out the one-hot vectors at padding positions
            train_tensor_enc = train_tensor_enc.masked_fill(padding_mask, 0)
            print(f"Final shape: {train_tensor_enc.shape}") 

            del training_labels

            # Save to NPZ
            npz_path = os.path.join(output_dir, f'{label}_labels_chunk_{chunk_num:04d}.npz')
            
            # Save the one-hot labels (stored under the key `embeddings` as well) plus the row range
            np.savez_compressed(
                npz_path,
                embeddings=tensor_to_numpy(train_tensor_enc),
                # indices=chunk.index.values,  # Original indices
                chunk_start=i,
                chunk_end=min(i + chunk_size, len(df))
            )
            # Clear memory
            del train_labels_encodings, train_tensor_enc, padding_mask
            
            # Update progress
            pbar.update(1)

In [None]:
# Fixed tag -> class-id mapping (9 entries) that encode_tags reads as a module-level name.
# Hard-coding it keeps the IDs stable from run to run.
tag2id = {'S': 0, 'H': 1, 'B': 2, 'I': 3, 'L': 4, 'P': 5, 'G': 6, 'E': 7, 'T': 8}

In [None]:
# Each label becomes the file-name prefix of the chunks written for the frame at the
# same position in `dfs`, so the two lists must line up one-to-one. `dfs` currently
# holds [val_df, df_cb433] (train_df is commented out there), hence 'val' and 'test'.
# With the previous ['train', 'val', 'test'] list the validation data was saved as
# 'train_*' and the test data as 'val_*'.
label_dfs = ['val', 'test']
# label_dfs = ['train', 'val', 'test']  # use this when train_df is re-enabled in `dfs`
assert len(label_dfs) == len(dfs), (
    f"label_dfs has {len(label_dfs)} entries but dfs has {len(dfs)}; update them together"
)
for split_label, split_df in zip(label_dfs, dfs):
    memory_efficient_processing(model, split_df, tag2id, label=split_label, chunk_size=5000)

In [None]:
def merge_npz_files(input_pattern, output_file, merge_arrays=True):
    """
    Merge multiple NPZ files into one.

    Args:
        input_pattern: Glob pattern string (e.g. "train_*.npz"; matches are sorted
            by name) or an explicit iterable of file names (used in the given order)
        output_file: Output NPZ filename
        merge_arrays: If True, concatenate arrays that share a key along axis 0.
            Only the keys found in the first file are merged. Scalar (0-d) entries
            such as `chunk_start` are collected into a 1-D array with one value
            per file. If False, every array is kept separately under the key
            "{name}_file_{i}", where i is the position of the file in the list.

    Returns:
        None. When no input file is found a message is printed and nothing is written.
    """
    # Imported locally because the notebook's visible import cells do not import glob
    import glob

    # Get list of files
    if isinstance(input_pattern, str):
        npz_files = sorted(glob.glob(input_pattern))
    else:
        npz_files = list(input_pattern)

    print(f"Found {len(npz_files)} NPZ files to merge:")
    for f in npz_files:
        print(f"  - {f}")

    if not npz_files:
        print("No NPZ files found!")
        return

    # The first file defines which keys are merged
    with np.load(npz_files[0]) as first_file:
        array_names = list(first_file.keys())
        print(f"Array keys found: {array_names}")

    merged_data = {}

    if merge_arrays:
        # Method 1: concatenate arrays with the same key
        for array_name in array_names:
            arrays_to_merge = []

            for npz_file in npz_files:
                with np.load(npz_file) as data:
                    if array_name in data:
                        loaded = data[array_name]
                        # np.concatenate rejects 0-d arrays, so promote scalars to shape (1,)
                        arrays_to_merge.append(np.atleast_1d(loaded))
                        print(f"Loading {array_name} from {npz_file}: shape {loaded.shape}")

            if arrays_to_merge:
                # Concatenate along the first axis (the batch dimension for chunked data)
                merged_array = np.concatenate(arrays_to_merge, axis=0)
                merged_data[array_name] = merged_array
                print(f"Merged {array_name}: final shape {merged_array.shape}")

        np.savez_compressed(output_file, **merged_data)
        print(f"Saved merged data to {output_file}")

    else:
        # Method 2: keep arrays separate, tagging each key with its file position
        for i, npz_file in enumerate(npz_files):
            with np.load(npz_file) as data:
                for array_name in data.keys():
                    key = f"{array_name}_file_{i}"
                    merged_data[key] = data[array_name]
                    print(f"Added {key}: shape {data[array_name].shape}")

        np.savez_compressed(output_file, **merged_data)
        print(f"Saved indexed data to {output_file}")




In [None]:
# output_dir = '/kaggle/working/embedding_Ankh_result/'
# """Merge specific files by name"""
# files_to_merge = [
#         # output_dir + "train_labels_chunk_0000.npz",
#         # output_dir + "train_labels_chunk_0001.npz", 
#         output_dir + "train_sequence_chunk_0000.npz",
#         output_dir + "train_sequence_chunk_0001.npz"
#     ]

# merge_npz_files(files_to_merge, "merged_specific.npz", merge_arrays=False)

In [None]:
import numpy as np
