In [18]:
# %%
import numpy as np
import pandas as pd
import os
import torch
import logging

In [None]:

# Example DataFrame with columns X, Y, Z, Atom_Name, Residue_Name, Residue_ID, Atom_Type, and Timeframe
df = pd.read_csv("DATA/FILT_HB/HB1000frames.csv")
df.rename(columns={"Timeframe": "time"}, inplace=True)
df['time'] = df['time'] - df['time'].min()
df['elem'] = df.apply(lambda row: f"{row['Atom_Name']}_{row['Residue_ID']}", axis=1)
df



Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,elem
0,12.759892,2.253709,33.260902,C1,CSP,1,cb,0,C1_1
1,12.862613,3.581455,34.023949,C2,CSP,1,cb,0,C2_1
2,11.457548,4.321817,34.003315,C3,CSP,1,cb,0,C3_1
3,10.981806,4.421790,32.537914,C4,CSP,1,cb,0,C4_1
4,11.038748,3.091581,31.915064,O5,CSP,1,ob,0,O5_1
...,...,...,...,...,...,...,...,...,...
6121,22.220211,66.814766,50.109936,H8,SFL,14,ha,0,H8_14
6122,24.264881,65.406349,50.476776,H9,SFL,14,ha,0,H9_14
6123,20.089758,57.912495,55.133987,H10,SFL,14,ha,0,H10_14
6124,22.042284,60.422878,57.926788,H11,SFL,14,ha,0,H11_14


In [20]:
#Select rows where Residue_ID is 5 and Atom_Type is either 'o' or 'os'
# df1 = df[(df['Residue_ID'] == 5) & df['Atom_Type'].isin(['o', 'os'])].reset_index(drop=True)
# Select all O Atoms
# Select relevant atom types and residue names for calculations
df1 = df[df['Atom_Type'].isin(['o', 'os'])].reset_index(drop=True)
# df1 = df[df['Atom_Name'].str.startswith('O')].reset_index(drop=True)
df2 = df[(df['Residue_Name'] == 'CSP') & (df['Atom_Type'] == 'n')].reset_index(drop=True)
df3 = df[(df['Residue_Name'] == 'CSP') & (df['Atom_Type'] == 'hn')].reset_index(drop=True)

In [21]:
def calculate_distance(row1, row2):
    """
    Calculate Euclidean distance between two rows based on X, Y, Z coordinates.
    
    Parameters:
        row1, row2: objects (like a pandas Series or namedtuple) with attributes X, Y, Z.
    
    Returns:
        Euclidean distance (float).
    """
    coord1 = np.array([row1.X, row1.Y, row1.Z])
    coord2 = np.array([row2.X, row2.Y, row2.Z])
    return np.linalg.norm(coord1 - coord2)

def calculate_angle(vec1, vec2):
    """
    Calculate the angle between two vectors (in degrees).
    
    Parameters:
        vec1, vec2: numpy arrays representing the vectors.
    
    Returns:
        Angle between vec1 and vec2 in degrees (float).
    """
    # Compute cosine of the angle using the dot product
    cos_theta = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    # Clip the value to avoid any numerical issues outside the valid range for arccos
    cos_theta = np.clip(cos_theta, -1.0, 1.0)
    angle_rad = np.arccos(cos_theta)
    return np.degrees(angle_rad)


In [22]:
# Calculate angles and distances
distance_results = []
angle_results = []

for t in df1['time'].unique():
    # Filter rows for the current time:
    # For df1, we reset the index (we don't need its original index);
    # For df2 and df3, we keep the original index by not dropping it.
    df1_time = df1[df1['time'] == t].reset_index(drop=True)
    df2_time = df2[df2['time'] == t].reset_index()   # keeps original index in 'index'
    df3_time = df3[df3['time'] == t].reset_index()   # keeps original index in 'index'
    
    # For each row in df1, pair each row in df2 and df3 based on index
    for row1 in df1_time.itertuples(index=False):
        for row2, row3 in zip(df2_time.itertuples(), df3_time.itertuples()):
            # Calculate Euclidean distance between row1 (from df1) and row2 (from df2)
            distance = calculate_distance(row1, row2)
            
            # Calculate vectors from row3 (from df3) to row1 and row2
            vec3_to_df1 = np.array([row1.X - row3.X, row1.Y - row3.Y, row1.Z - row3.Z])
            vec3_to_df2 = np.array([row2.X - row3.X, row2.Y - row3.Y, row2.Z - row3.Z])
            angle = calculate_angle(vec3_to_df1, vec3_to_df2)
            
            # Append distance results if condition is met (distance <= 3.5)
            if distance <= 3.5:
                distance_results.append({
                    'source': row1.elem,
                    'src_x': row1.X,
                    'src_y': row1.Y,
                    'src_z': row1.Z,
                    'src_mol': row1.Residue_ID,
                    'dst': row2.elem,
                    'dst_x': row2.X,
                    'dst_y': row2.Y,
                    'dst_z': row2.Z,
                    'dst_mol': row2.Residue_ID,
                    'dst_idx': row2.Index,  
                    'time': t,
                    'distance': distance
                })
            # Append angle results if condition is met (135 <= angle < 180)
            if 135 <= angle < 180:
                angle_results.append({
                    'source': row1.elem,
                    'src_x': row1.X,
                    'src_y': row1.Y,
                    'src_z': row1.Z,
                    'src_mol': row1.Residue_ID,
                    'dst': row3.elem,
                    'dst_x': row3.X,
                    'dst_y': row3.Y,
                    'dst_z': row3.Z,
                    'dst_mol': row3.Residue_ID,
                    'dst_idx': row3.Index,  
                    'time': t,
                    'angle': angle
                })

# Convert your results lists into DataFrames
df_distance = pd.DataFrame(distance_results)
df_angle = pd.DataFrame(angle_results)

In [23]:
df_distance

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance
0,O8_1,12.433211,1.435311,29.606905,1,N11_1,12.264816,2.733424,27.667477,1,0,0,2.339837
1,O10_1,10.361735,2.108788,28.901592,1,N11_1,12.264816,2.733424,27.667477,1,0,0,2.352643
2,O10_1,10.361735,2.108788,28.901592,1,N79_1,8.222926,3.125758,30.759445,1,5,0,3.010041
3,O20_1,11.626869,5.605087,34.612240,1,N23_1,10.870185,7.574643,35.393948,1,1,0,2.250064
4,O22_1,9.349302,5.952352,34.952969,1,N23_1,10.870185,7.574643,35.393948,1,1,0,2.267019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,O770_4,66.772491,-1.704829,31.064491,4,N783_4,69.011452,0.405388,31.569620,4,215,0,3.117871
517,O780_4,67.096054,1.456644,31.661438,4,N783_4,69.011452,0.405388,31.569620,4,215,0,2.186851
518,O782_4,68.125221,0.886503,33.611347,4,N783_4,69.011452,0.405388,31.569620,4,215,0,2.277175
519,O2_11,7.913268,36.730728,33.888664,11,N407_1,5.027152,37.718132,33.145878,1,27,0,3.139484


In [24]:
df_angle

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle
0,O8_1,12.433211,1.435311,29.606905,1,H46_1,15.508764,3.906857,27.851604,1,3,0,150.071960
1,O8_1,12.433211,1.435311,29.606905,1,H66_1,8.854777,2.591426,30.175131,1,5,0,146.449027
2,O8_1,12.433211,1.435311,29.606905,1,H83_1,8.434090,7.601198,32.716743,1,6,0,158.491808
3,O8_1,12.433211,1.435311,29.606905,1,H177_1,6.069104,16.899189,34.597523,1,14,0,148.473702
4,O8_1,12.433211,1.435311,29.606905,1,H194_1,7.083150,21.469265,33.176544,1,15,0,161.601357
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13611,O2_14,18.677591,65.035980,52.859055,14,H342_4,61.305489,32.160271,26.468689,4,189,0,153.166805
13612,O2_14,18.677591,65.035980,52.859055,14,H352_4,62.975449,31.571283,35.977997,4,190,0,172.072480
13613,O2_14,18.677591,65.035980,52.859055,14,H436_4,57.428974,28.395250,28.561724,4,197,0,142.077027
13614,O2_14,18.677591,65.035980,52.859055,14,H453_4,59.616276,24.368275,25.962568,4,198,0,161.943814


In [25]:
# Define the key columns used for matching
key_cols = ['source', 'time', 'dst_idx']
# ['source', 'src_x', 'src_y', 'src_z', 'src_mol', 'time', 'dst_idx']

# Find the common key rows in both dataframes.
# We first select only the key columns and drop duplicates.
common_keys = pd.merge(
    df_distance[key_cols].drop_duplicates(),
    df_angle[key_cols].drop_duplicates(),
    on=key_cols,
    how='inner'
)

# Now, filter each dataframe to keep only rows with these common keys.
df_distance_common = df_distance.merge(common_keys, on=key_cols, how='inner')
df_angle_common = df_angle.merge(common_keys, on=key_cols, how='inner')

In [26]:
df_distance_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance
0,O10_1,10.361735,2.108788,28.901592,1,N79_1,8.222926,3.125758,30.759445,1,5,0,3.010041
1,O184_1,7.992947,17.905865,26.629707,1,N255_1,5.056916,18.569294,25.397465,1,17,0,3.252512
2,O210_1,6.694972,19.880878,33.721676,1,N231_1,6.883698,22.451603,33.030262,1,15,0,2.668763
3,O284_1,8.515759,29.113972,28.234339,1,N363_1,8.105013,32.489857,28.502953,1,24,0,3.411373
4,O406_1,3.561949,37.875099,31.388666,1,N451_1,1.155581,38.586143,29.658045,1,30,0,3.048154
5,O450_1,0.905972,40.174129,28.04892,1,N375_1,-0.188102,38.126186,25.88077,1,25,0,3.176782
6,O492_1,6.153981,46.267063,27.492346,1,N563_1,4.202358,48.370407,26.313599,1,38,0,3.101988
7,O518_1,5.912088,46.759968,34.716255,1,N539_1,7.181493,49.66291,34.801197,1,36,0,3.169492
8,O624_1,4.566501,56.898605,32.142868,1,N583_1,3.352883,54.048855,33.233223,1,39,0,3.283721
9,O714_1,8.888943,65.198433,30.869387,1,N639_1,8.924366,63.455074,33.302853,1,43,0,2.993712


In [27]:
df_angle_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle
0,O10_1,10.361735,2.108788,28.901592,1,H66_1,8.854777,2.591426,30.175131,1,5,0,161.759281
1,O184_1,7.992947,17.905865,26.629707,1,H214_1,6.065125,18.591055,25.301497,1,17,0,136.799665
2,O210_1,6.694972,19.880878,33.721676,1,H194_1,7.08315,21.469265,33.176544,1,15,0,153.50631
3,O284_1,8.515759,29.113972,28.234339,1,H305_1,7.875639,31.549356,28.204592,1,24,0,146.807099
4,O406_1,3.561949,37.875099,31.388666,1,H379_1,1.872544,38.431503,30.356771,1,30,0,165.675379
5,O450_1,0.905972,40.174129,28.04892,1,H315_1,0.163979,38.568935,26.721113,1,25,0,158.744752
6,O492_1,6.153981,46.267063,27.492346,1,H473_1,4.762031,47.541893,26.476385,1,38,0,157.123106
7,O518_1,5.912088,46.759968,34.716255,1,H453_1,6.832856,48.719318,34.681793,1,36,0,170.898509
8,O624_1,4.566501,56.898605,32.142868,1,H490_1,3.384632,55.019894,32.946434,1,39,0,151.043155
9,O714_1,8.888943,65.198433,30.869387,1,H537_1,8.867854,64.047417,32.483036,1,43,0,176.176465


In [28]:
# Combine src and dst values for consistent mapping
combined_values = pd.concat([
    df_distance_common['source'], df_distance_common['dst'], 
    df_angle_common['source'], df_angle_common['dst']
])

# Use factorize to assign numeric indices starting from 0
numeric_indices, _ = pd.factorize(combined_values)

# Map src and dst directly to numeric indices using factorized output
mapping = pd.Series(numeric_indices, index=combined_values).to_dict()


for df in [df_distance_common, df_angle_common]:
    df['src'] = df['source'].map(mapping)
    df['dst'] = df['dst'].map(mapping)





In [29]:
df_distance_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance,src
0,O10_1,10.361735,2.108788,28.901592,1,38,8.222926,3.125758,30.759445,1,5,0,3.010041,0
1,O184_1,7.992947,17.905865,26.629707,1,39,5.056916,18.569294,25.397465,1,17,0,3.252512,1
2,O210_1,6.694972,19.880878,33.721676,1,40,6.883698,22.451603,33.030262,1,15,0,2.668763,2
3,O284_1,8.515759,29.113972,28.234339,1,41,8.105013,32.489857,28.502953,1,24,0,3.411373,3
4,O406_1,3.561949,37.875099,31.388666,1,42,1.155581,38.586143,29.658045,1,30,0,3.048154,4
5,O450_1,0.905972,40.174129,28.04892,1,43,-0.188102,38.126186,25.88077,1,25,0,3.176782,5
6,O492_1,6.153981,46.267063,27.492346,1,44,4.202358,48.370407,26.313599,1,38,0,3.101988,6
7,O518_1,5.912088,46.759968,34.716255,1,45,7.181493,49.66291,34.801197,1,36,0,3.169492,7
8,O624_1,4.566501,56.898605,32.142868,1,46,3.352883,54.048855,33.233223,1,39,0,3.283721,8
9,O714_1,8.888943,65.198433,30.869387,1,47,8.924366,63.455074,33.302853,1,43,0,2.993712,9


In [30]:
df_angle_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle,src
0,O10_1,10.361735,2.108788,28.901592,1,77,8.854777,2.591426,30.175131,1,5,0,161.759281,0
1,O184_1,7.992947,17.905865,26.629707,1,78,6.065125,18.591055,25.301497,1,17,0,136.799665,1
2,O210_1,6.694972,19.880878,33.721676,1,79,7.08315,21.469265,33.176544,1,15,0,153.50631,2
3,O284_1,8.515759,29.113972,28.234339,1,80,7.875639,31.549356,28.204592,1,24,0,146.807099,3
4,O406_1,3.561949,37.875099,31.388666,1,81,1.872544,38.431503,30.356771,1,30,0,165.675379,4
5,O450_1,0.905972,40.174129,28.04892,1,82,0.163979,38.568935,26.721113,1,25,0,158.744752,5
6,O492_1,6.153981,46.267063,27.492346,1,83,4.762031,47.541893,26.476385,1,38,0,157.123106,6
7,O518_1,5.912088,46.759968,34.716255,1,84,6.832856,48.719318,34.681793,1,36,0,170.898509,7
8,O624_1,4.566501,56.898605,32.142868,1,85,3.384632,55.019894,32.946434,1,39,0,151.043155,8
9,O714_1,8.888943,65.198433,30.869387,1,86,8.867854,64.047417,32.483036,1,43,0,176.176465,9


In [31]:
def generate_node_features(df1, df2, filename):
    """
    Generate node features by combining:
      - Source nodes from df1: use 'src_mol'
      - Destination nodes from df1: use 'dst_mol'
      - Destination nodes from df2: use 'dst_mol'
    The destination node IDs in df2 are assumed to start after the max dst in df1.
    The combined node features are then extracted as a single column (named 'feat'),
    converted to a PyTorch tensor of shape [1, dim], and saved as a .pt file.
    """
    # Process source nodes from df1
    src_features = df1[['src', 'src_mol']].drop_duplicates().rename(
        columns={'src': 'node', 'src_mol': 'feat'}
    ).copy()
    # Convert to integer values (adjust if needed)
    src_features['feat'] = src_features['feat'].astype(int)
    
    # Process destination nodes from df1
    dst_features_df1 = df1[['dst', 'dst_mol']].drop_duplicates().rename(
        columns={'dst': 'node', 'dst_mol': 'feat'}
    ).copy()
    dst_features_df1['feat'] = dst_features_df1['feat'].astype(int)
    
    # Process destination nodes from df2
    dst_features_df2 = df2[['dst', 'dst_mol']].drop_duplicates().rename(
        columns={'dst': 'node', 'dst_mol': 'feat'}
    ).copy()
    dst_features_df2['feat'] = dst_features_df2['feat'].astype(int)
    
    # Determine an offset such that df2 destination node IDs come after those of df1.
    if not dst_features_df1.empty:
        offset = dst_features_df1['node'].max() + 1
    else:
        offset = 0
    dst_features_df2['node'] = dst_features_df2['node'] + offset
    
    # Combine all nodes: source nodes, then destination nodes from df1, then from df2.
    all_nodes = pd.concat([src_features, dst_features_df1, dst_features_df2], ignore_index=True)
    
    # Drop duplicate node entries based on the 'node' column
    all_nodes = all_nodes.drop_duplicates(subset=['node']).reset_index(drop=True)
    
    # Optionally, sort by node if desired
    all_nodes = all_nodes.sort_values(by='node').reset_index(drop=True)
    
    # Extract only the 'feat' column as a NumPy array.
    feat_np = all_nodes['feat'].values
    # Convert to a PyTorch tensor and add a batch dimension to get shape [1, dim].
    feat_tensor = torch.tensor(feat_np, dtype=torch.long).unsqueeze(0)
    
    # Save the tensor as a .pt file.
    torch.save(feat_tensor, filename)
    logging.info(f"Saved node features tensor to {filename}")
    
    return feat_tensor

In [32]:

def process_and_save_dataframe(df, filename):
    """
    Process the input DataFrame to add 'ext_roll' and 'idx' columns,
    then save it to a CSV file.
    """
    # Work on a copy of the DataFrame to avoid the SettingWithCopyWarning.
    df = df[['src', 'dst', 'time', 'dst_idx']].copy()
    num_rows = len(df)
    
    # Insert an 'idx' column at the beginning
    df.insert(0, 'idx', range(num_rows))
    
    # Initialize 'ext_roll' column with zeros using .loc for assignment clarity
    df.loc[:, 'ext_roll'] = 0

    # Assign 1 to the middle 15% rows and 2 to the last 15% rows
    df.loc[int(num_rows * 0.7):int(num_rows * 0.85) - 1, 'ext_roll'] = 1
    df.loc[int(num_rows * 0.85):, 'ext_roll'] = 2

    # Save the updated DataFrame
    df.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")

In [33]:
generate_node_features(df_distance_common, df_angle_common, 'DATA/FILT_HB/node_features.pt')

    
process_and_save_dataframe(df_distance_common, 'DATA/FILT_HB/edges1.csv')
process_and_save_dataframe(df_angle_common, 'DATA/FILT_HB/edges2.csv')

Saved updated DataFrame to DATA/FILT_HB/edges1.csv
Saved updated DataFrame to DATA/FILT_HB/edges2.csv
