## Step 1 : Imports

In [1]:
import pandas as pd

## Step 2: Load the oof files

In [2]:
exp_1_oof = pd.read_csv("/kaggle/input/train-ribonanzanet-inference-03/ribonanzanet1_oof.csv")
exp_2_oof = pd.read_csv("/kaggle/input/train-rhofold-inference/RhoFold/rhofold_01_oof.csv")

In [3]:
exp_1_oof.shape, exp_2_oof.shape

((59508, 23), (47857, 22))

In [4]:
exp_1_oof.columns, exp_2_oof.columns

(Index(['Unnamed: 0', 'ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1', 'x_2',
        'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5',
        'z_5', 'flag', 'x_1_true', 'y_1_true', 'z_1_true'],
       dtype='object'),
 Index(['Unnamed: 0', 'ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1', 'x_2',
        'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5',
        'z_5', 'x_1_true', 'y_1_true', 'z_1_true'],
       dtype='object'))

## Step -1: Hill climbing code

In [5]:
import gc
import os
import re
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

In [6]:
def setup_usalign():
    """
    Sets up USalign by copying it to the working directory and making it executable.
    """
    # Copy USalign to working directory if it doesn't exist
    if not os.path.exists('/kaggle/working/USalign'):
        os.system('cp //kaggle/input/usalign/USalign /kaggle/working/')
        os.system('sudo chmod u+x /kaggle/working/USalign')
        print("USalign binary copied and made executable")
    return '/kaggle/working/USalign'

In [7]:
def parse_tmscore_output(output):
    # Extract TM-score based on length of reference structure (second)
    tm_score_match = re.findall(r'TM-score=\s+([\d.]+)', output)[1]
    if not tm_score_match:
        raise ValueError('No TM score found')
    return float(tm_score_match)


def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num, x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """
    Writes a single line of PDB format based on provided atom information.
    """
    return f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s} {residue_num:>3d}    {x_coord:>8.3f}{y_coord:>8.3f}{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'

In [8]:
def write2pdb(df: pd.DataFrame, xyz_id: str, target_path: str) -> int:
    resolved_cnt = 0
    with open(target_path, 'w') as target_file:
        for _, row in df.iterrows():
            x_coord = row[f'x_{xyz_id}']
            y_coord = row[f'y_{xyz_id}']
            z_coord = row[f'z_{xyz_id}']

            if x_coord > -1e17 and y_coord > -1e17 and z_coord > -1e17:
                resolved_cnt += 1
                target_line = write_target_line(
                    atom_name="C1'",
                    atom_serial=int(row['resid']),
                    residue_name=row['resname'],
                    chain_id='0',
                    residue_num=int(row['resid']),
                    x_coord=x_coord,
                    y_coord=y_coord,
                    z_coord=z_coord,
                    atom_type='C',
                )
                target_file.write(target_line)
    return resolved_cnt

In [9]:
def write_ground_truth_pdb(df: pd.DataFrame, target_path: str) -> int:
    """
    Writes ground truth coordinates to a PDB file.
    """
    resolved_cnt = 0
    with open(target_path, 'w') as target_file:
        for _, row in df.iterrows():
            x_coord = row['x_1_true']
            y_coord = row['y_1_true']
            z_coord = row['z_1_true']

            if x_coord > -1e17 and y_coord > -1e17 and z_coord > -1e17:
                resolved_cnt += 1
                target_line = write_target_line(
                    atom_name="C1'",
                    atom_serial=int(row['resid']),
                    residue_name=row['resname'],
                    chain_id='0',
                    residue_num=int(row['resid']),
                    x_coord=x_coord,
                    y_coord=y_coord,
                    z_coord=z_coord,
                    atom_type='C',
                )
                target_file.write(target_line)
    return resolved_cnt

In [10]:
def evaluate_single_weight(args):
    """
    Evaluates a single weight combination for ensembling two RNA structure predictions.
    
    Args:
        args (tuple): Contains weight, first_pred_df, second_pred_df, temporary_dir, and usalign_path
        
    Returns:
        tuple: weight and resulting score
    """
    weight, first_pred_df, second_pred_df, temp_dir, usalign_path = args
    
    # Create a copy of the first prediction dataframe
    ensemble_df = first_pred_df.copy()
    
    # For each prediction (1-5), blend the coordinates using the weight
    for pred_idx in range(1, 6):
        # Blend x coordinates
        ensemble_df[f'x_{pred_idx}'] = (1 - weight) * first_pred_df[f'x_{pred_idx}'] + weight * second_pred_df[f'x_{pred_idx}']
        # Blend y coordinates
        ensemble_df[f'y_{pred_idx}'] = (1 - weight) * first_pred_df[f'y_{pred_idx}'] + weight * second_pred_df[f'y_{pred_idx}']
        # Blend z coordinates
        ensemble_df[f'z_{pred_idx}'] = (1 - weight) * first_pred_df[f'z_{pred_idx}'] + weight * second_pred_df[f'z_{pred_idx}']
    
    # Create unique filenames for this process to avoid conflicts in parallel processing
    process_id = os.getpid()
    native_pdb = f'{temp_dir}/native_{process_id}.pdb'
    predicted_pdb = f'{temp_dir}/predicted_{process_id}.pdb'
    
    # Evaluate using TM-score
    score = score_ensemble(ensemble_df, native_pdb, predicted_pdb, usalign_path)
    
    return weight, score

In [11]:
from tqdm import tqdm

def score_ensemble(ensemble_df, native_pdb='native.pdb', predicted_pdb='predicted.pdb', usalign_path=None):
    """
    Scores the ensemble predictions against the ground truth using TM-score.
    
    Args:
        ensemble_df (pd.DataFrame): DataFrame with predicted coordinates and ground truth
        native_pdb (str): Path to write the native PDB file
        predicted_pdb (str): Path to write the predicted PDB file
        usalign_path (str): Path to the USalign executable
        
    Returns:
        float: Average TM-score across all targets
    """
    if usalign_path is None:
        usalign_path = '/kaggle/working/USalign'
    
    from contextlib import contextmanager
    
    @contextmanager
    def no_stdout():
        old_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')
        try:
            yield
        finally:
            sys.stdout = old_stdout
    
    # Extract target_id from ID (target_resid)
    ensemble_df['target_id'] = ensemble_df['ID'].apply(lambda x: x.split('_')[0])
    
    results = []
    # Iterate through each target_id and compute TM-score
    unique_targets = ensemble_df['target_id'].unique()
    for target_id in tqdm(unique_targets):
        group_data = ensemble_df[ensemble_df['target_id'] == target_id]
        
        # Write ground truth PDB once per target
        with no_stdout():
            resolved_cnt_truth = write_ground_truth_pdb(group_data, native_pdb)
        
        if resolved_cnt_truth == 0:
            continue  # Skip if no ground truth coordinates
        
        target_id_scores = []
        for pred_cnt in range(1, 6):
            # Write predicted PDB
            with no_stdout():
                resolved_cnt_pred = write2pdb(group_data, pred_cnt, predicted_pdb)
            
            if resolved_cnt_pred > 0:
                try:
                    command = f'{usalign_path} {predicted_pdb} {native_pdb} -atom " C1\'"'
                    usalign_output = os.popen(command).read()
                    tm_score = parse_tmscore_output(usalign_output)
                    target_id_scores.append(tm_score)
                except Exception as e:
                    # Silent error handling during optimization
                    continue
        
        if target_id_scores:
            results.append(max(target_id_scores))
    
    # Return average TM-score across all targets
    if not results:
        return 0.0
    return float(sum(results) / len(results))

In [12]:
def hill_climbing_optimizer(experiments_list, weight_step=0.05, max_iterations=50, num_cores=None):
    """
    Performs hill climbing optimization to find the best ensemble weights for RNA structure predictions.
    
    Args:
        experiments_list (list): List of dataframes with predictions
        weight_step (float): Step size for weight exploration
        max_iterations (int): Maximum number of iterations
        num_cores (int): Number of CPU cores to use for parallelization
        
    Returns:
        tuple: Dictionary of weights, best score, and final ensemble dataframe
    """
    # Set up USalign
    usalign_path = setup_usalign()
    
    if num_cores is None:
        num_cores = max(1, multiprocessing.cpu_count() - 1)
    
    # Create temporary directory for parallel processing
    temp_dir = '/tmp/rna_ensemble'
    os.makedirs(temp_dir, exist_ok=True)
    
    # Start with the first experiment as baseline
    best_ensemble_df = experiments_list[0].copy()
    remaining_experiments = experiments_list[1:].copy()
    
    # Dictionary to store weights for each experiment
    model_weights = {}
    
    # Define the weight range to explore
    weights_range = np.arange(0.0, 1.0 + weight_step, weight_step)
    print(f"Number of weights testing : {len(weights_range)}")
    
    # Get initial score
    initial_score = score_ensemble(best_ensemble_df, usalign_path=usalign_path)
    best_score = initial_score
    print(f"Initial score: {best_score:.6f}")
    
    iteration = 0
    while remaining_experiments and iteration < max_iterations:
        iteration += 1
        print(f"\nIteration {iteration}/{max_iterations}")
        
        best_iteration = {
            'index': -1,
            'weight': 0,
            'score': best_score,
            'df': None
        }
        
        # Try each remaining model
        for i, model_df in enumerate(remaining_experiments):
            try:
                print(f"Testing model {i+1}/{len(remaining_experiments)}")
                
                # Prepare arguments for parallel processing
                args_list = [(weight, best_ensemble_df, model_df, temp_dir, usalign_path) 
                             for weight in weights_range]
                
                # Process weights in parallel
                with ProcessPoolExecutor(max_workers=num_cores) as executor:
                    futures = [executor.submit(evaluate_single_weight, args) for args in args_list]
                    
                    # Process results as they complete
                    for future in tqdm(as_completed(futures), total=len(futures), 
                                      desc=f"Testing weights for model {i+1}"):
                        weight, new_score = future.result()
                        
                        # Update best if improved
                        if new_score > best_iteration['score']:
                            # Create the ensemble for this weight
                            tmp_ensemble_df = best_ensemble_df.copy()
                            for pred_idx in range(1, 6):
                                tmp_ensemble_df[f'x_{pred_idx}'] = (1 - weight) * best_ensemble_df[f'x_{pred_idx}'] + weight * model_df[f'x_{pred_idx}']
                                tmp_ensemble_df[f'y_{pred_idx}'] = (1 - weight) * best_ensemble_df[f'y_{pred_idx}'] + weight * model_df[f'y_{pred_idx}']
                                tmp_ensemble_df[f'z_{pred_idx}'] = (1 - weight) * best_ensemble_df[f'z_{pred_idx}'] + weight * model_df[f'z_{pred_idx}']
                            
                            best_iteration.update({
                                'index': i,
                                'weight': weight,
                                'score': new_score,
                                'df': tmp_ensemble_df.copy()
                            })
                
                # Clean up
                gc.collect()
                
            except Exception as e:
                print(f"Error processing model {i+1}: {str(e)}")
                continue
        
        # Check if we found an improvement
        if best_iteration['index'] == -1:
            print("No improvement found, stopping")
            break
            
        # Update ensemble with best model found
        best_score = best_iteration['score']
        best_ensemble_df = best_iteration['df'].copy()
        
        # Add weight to the selected model and remove it from remaining
        selected_model = remaining_experiments.pop(best_iteration['index'])
        model_weights[f"model_{iteration}"] = best_iteration['weight']
        
        print(f"Added model with weight {best_iteration['weight']:.4f}, New score: {best_iteration['score']:.6f}")
        
    print("\nOptimization complete!")
    print(f"Initial score: {initial_score:.6f}")
    print(f"Final score: {best_score:.6f}")
    print(f"Improvement: {best_score - initial_score:.6f} ({(best_score - initial_score) / initial_score * 100:.2f}%)")
    print(f"Model weights: {model_weights}")
    
    # Clean up temporary directory
    for filename in os.listdir(temp_dir):
        os.remove(os.path.join(temp_dir, filename))
    os.rmdir(temp_dir)
    
    return model_weights, best_score, best_ensemble_df

In [13]:
# Load your experiment dataframes
experiment1_df = exp_1_oof
experiment2_df = exp_2_oof

# Create list of experiment dataframes
experiments_list = [experiment1_df, experiment2_df]

# Run the hill climbing optimizer
model_weights, best_score, final_ensemble = hill_climbing_optimizer(
    experiments_list=experiments_list,
    weight_step=0.1,
    max_iterations=10,
    num_cores=4
)

USalign binary copied and made executable
Number of weights testing : 11


 50%|█████     | 361/716 [04:20<56:07,  9.49s/it]  Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
 67%|██████▋   | 478/716 [04:50<02:02,  1.94it/s]Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
Sequence is too short <3!: predicted.pdb
100%|██████████| 716/716 [07:13<00:00,  1.65it/s]

Initial score: 0.251968

Iteration 1/10
Testing model 1/1



 51%|█████     | 363/716 [07:10<59:53, 10.18s/it]  Sequence is too short <3!: /tmp/rna_ensemble/predicted_7204.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7204.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7204.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7204.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7204.pdb
Segmentation fault (core dumped)
 51%|█████▏    | 367/716 [07:11<19:51,  3.41s/it]Sequence is too short <3!: /tmp/rna_ensemble/predicted_7205.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7205.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7205.pdb
Segmentation fault (core dumped)
Sequence is too short <3!: /tmp/rna_ensemble/predicted_7205.pdb
Segmentation fault (core dumped)
Sequence is too short <3!:

Added model with weight 0.0000, New score: 0.258163

Optimization complete!
Initial score: 0.251968
Final score: 0.258163
Improvement: 0.006195 (2.46%)
Model weights: {'model_1': 0.0}





In [14]:
model_weights, best_score, final_ensemble

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


({'model_1': 0.0},
 0.2581630434782607,
        Unnamed: 0         ID resname  resid       x_1        y_1        z_1  \
 0               0   1SCL_A_1       G      1 -89.44650  440.98752 -191.07695   
 1               1   1SCL_A_2       G      2 -86.84754  444.70746 -187.42662   
 2               2   1SCL_A_3       G      3 -84.42002  450.20407 -186.49513   
 3               3   1SCL_A_4       U      4 -82.65829  454.56104 -187.05147   
 4               4   1SCL_A_5       G      5 -82.31447  459.51068 -189.50598   
 ...           ...        ...     ...    ...       ...        ...        ...   
 59503       64413  8Z1F_T_62       U     62       NaN        NaN        NaN   
 59504       64414  8Z1F_T_63       A     63       NaN        NaN        NaN   
 59505       64415  8Z1F_T_64       C     64       NaN        NaN        NaN   
 59506       64416  8Z1F_T_65       C     65       NaN        NaN        NaN   
 59507       64417  8Z1F_T_66       A     66       NaN        NaN        NaN   


In [15]:
# Save the final ensemble
final_ensemble.to_csv('final_ensemble.csv', index=False)