# Stanford RNA 3D Folding Part 2 - A-Form Helix Baseline

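A simple baseline that places each residue's C1' atom on an idealized A-form RNA helix and produces five variants per target by randomly perturbing the rise, twist, and radius and (for all but the first variant) applying a random rotation.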
Ready for immediate submission - no additional datasets required.

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.transform import Rotation

# Idealized A-form RNA helix geometry used as the unperturbed baseline.
A_FORM_PARAMS = {
    'rise': 2.8,      # Rise per residue along the helix axis (Angstroms)
    'twist': 32.7,    # Twist per residue about the helix axis (degrees)
    'radius': 9.0     # Distance of each point from the helix axis (Angstroms)
}


def generate_helix_coords(sequence, variation_idx=0):
    """Generate A-form helix C1' coordinates for an RNA sequence.

    One point per residue is placed on a single helical strand whose rise,
    twist and radius are the ``A_FORM_PARAMS`` values, each scaled by an
    independent random factor in [0.9, 1.1). For ``variation_idx > 0`` the
    helix is additionally rotated by random Euler angles in [-30, 30)
    degrees. The result is translated so its centroid is at the origin.

    Args:
        sequence: RNA sequence; only its length is used.
        variation_idx: Non-negative integer selecting the variant. It seeds
            the random draws (seed ``variation_idx * 42``), so the same
            index always yields the same coordinates.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sequence), 3)`` holding x, y, z
        per residue. An empty sequence yields an array of shape ``(0, 3)``.
    """
    length = len(sequence)
    if length == 0:
        # Nothing to place; return a well-shaped empty array instead of
        # failing in the rotation or producing NaNs when centering.
        return np.zeros((0, 3))

    # A private RandomState reproduces the stream of the legacy
    # np.random.seed(seed) call without clobbering the caller's global
    # NumPy random state.
    rng = np.random.RandomState(variation_idx * 42)

    # The draw order (radius, rise, twist) is kept fixed so the generated
    # variants stay reproducible.
    radius = A_FORM_PARAMS['radius'] * (0.9 + 0.2 * rng.random_sample())
    rise = A_FORM_PARAMS['rise'] * (0.9 + 0.2 * rng.random_sample())
    twist = np.deg2rad(
        A_FORM_PARAMS['twist'] * (0.9 + 0.2 * rng.random_sample())
    )

    # Residue k sits at angle k * twist and height k * rise.
    steps = np.arange(length)
    angles = steps * twist
    coords = np.column_stack((
        radius * np.cos(angles),
        radius * np.sin(angles),
        steps * rise,
    ))

    # Apply random rotation for variation (variant 0 keeps the helix axis
    # along z).
    if variation_idx > 0:
        rotation = Rotation.from_euler(
            'xyz',
            [rng.uniform(-30, 30) for _ in range(3)],
            degrees=True,
        ).as_matrix()
        coords = coords @ rotation

    # Center the structure
    return coords - coords.mean(axis=0)


print("Functions defined.")

In [None]:
# Read the submission template and the test sequences from the competition
# input directory.
sample = pd.read_csv(
    '/kaggle/input/stanford-rna-3d-folding-2/sample_submission.csv'
)
test_df = pd.read_csv(
    '/kaggle/input/stanford-rna-3d-folding-2/test_sequences.csv'
)

# Map every target_id to its sequence; if a target_id repeats, the last
# occurrence wins.
seq_lookup = {
    target_id: target_sequence
    for target_id, target_sequence in zip(
        test_df['target_id'], test_df['sequence']
    )
}

# Report how much was loaded: one row of test_df per sequence, one row of the
# template per residue.
n_sequences = len(test_df)
n_residues = len(sample)
print(f"Loaded {n_sequences} sequences")
print(f"Total residues: {n_residues}")

In [None]:
# Generate predictions: one output row per row of the submission template,
# carrying x/y/z coordinates for each of the five helix variants.
all_data = []
current_target = None
current_models = None

# Only the ID, resname and resid columns of the template are read; itertuples
# avoids building a Series per row as iterrows would.
template_rows = sample[['ID', 'resname', 'resid']].itertuples(
    index=False, name=None
)
for row_id, resname, template_resid in template_rows:
    # IDs have the form "<target_id>_<residue number>"; the target id itself
    # may contain underscores, so split on the last one only.
    target_id, resid_text = row_id.rsplit('_', 1)
    resid = int(resid_text)

    # Generate models for new target
    if target_id != current_target:
        try:
            sequence = seq_lookup[target_id]
        except KeyError:
            raise KeyError(
                f"Target {target_id!r} (from ID {row_id!r}) has no entry "
                "in test_sequences.csv"
            ) from None
        current_target = target_id
        current_models = [generate_helix_coords(sequence, i) for i in range(5)]
        print(f"Processing {target_id} (len={len(sequence)})")

    # Residue numbers are 1-based. Reject anything outside the generated
    # coordinates explicitly: a residue number below 1 would otherwise wrap
    # around through negative indexing and silently pick another residue.
    n_residues = len(current_models[0])
    if not 1 <= resid <= n_residues:
        raise IndexError(
            f"Residue number {resid} in ID {row_id!r} is outside "
            f"1..{n_residues} for target {target_id!r}"
        )

    # Get coordinates for this residue
    i = resid - 1
    row_data = {
        'ID': row_id,
        'resname': resname,
        'resid': template_resid
    }

    for model_idx, model_coords in enumerate(current_models, 1):
        x, y, z = model_coords[i]
        row_data[f'x_{model_idx}'] = float(x)
        row_data[f'y_{model_idx}'] = float(y)
        row_data[f'z_{model_idx}'] = float(z)

    all_data.append(row_data)

print(f"\nProcessed {len(all_data)} residues")

In [None]:
# Column layout of the submission: the three identifying columns followed by
# an x/y/z triple for each of the five models.
column_order = ['ID', 'resname', 'resid'] + [
    name
    for model_idx in range(1, 6)
    for name in (f'x_{model_idx}', f'y_{model_idx}', f'z_{model_idx}')
]

# Build the submission table from the per-residue records, in that layout.
submission_df = pd.DataFrame(all_data)[column_order]

# Write the file, with coordinates rounded to three decimals.
submission_df.to_csv('submission.csv', float_format='%.3f', index=False)

# Sanity-check the result against the template: the identifying columns must
# agree row by row.
print(f"Submission shape: {submission_df.shape}")
for column in ('ID', 'resname', 'resid'):
    print(f"{column} match: {(submission_df[column] == sample[column]).all()}")

# Notebook display of the first rows.
submission_df.head(10)