In [4]:
# %%
import numpy as np
import pandas as pd

In [6]:

# Load the per-atom trajectory table. This cell relies on the columns
# Timeframe, Atom_Name and Residue_ID; later cells additionally read
# X, Y, Z, Residue_Name and Atom_Type.

# Rename via a chained call instead of inplace=True so the cell is a single
# read -> transform pipeline that can be re-run safely.
df = pd.read_csv("DATA/FILT_HB/HB100frames.csv").rename(columns={"Timeframe": "time"})

# Shift the time axis so the earliest frame is 0.
df['time'] = df['time'] - df['time'].min()

# Node label "<Atom_Name>_<Residue_ID>". Built column-wise: a row-wise
# df.apply(..., axis=1) creates one Series per row, which is very slow on a
# table of this size, while string concatenation of whole columns yields the
# same labels.
df['node'] = df['Atom_Name'].astype(str) + '_' + df['Residue_ID'].astype(str)
df



Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,12.759892,2.253709,33.260902,C1,CSP,1,cb,0,C1_1
1,12.862613,3.581455,34.023949,C2,CSP,1,cb,0,C2_1
2,11.457548,4.321817,34.003315,C3,CSP,1,cb,0,C3_1
3,10.981806,4.421790,32.537914,C4,CSP,1,cb,0,C4_1
4,11.038748,3.091581,31.915064,O5,CSP,1,ob,0,O5_1
...,...,...,...,...,...,...,...,...,...
618721,22.142454,68.691849,52.059006,H8,SFL,14,ha,100,H8_14
618722,23.826303,68.375175,53.817253,H9,SFL,14,ha,100,H9_14
618723,19.775064,62.112827,59.435371,H10,SFL,14,ha,100,H10_14
618724,21.753296,59.341610,56.864548,H11,SFL,14,ha,100,H11_14


In [7]:
# Split the atom table into the three subsets used by the pairing loop.
# Every subset gets a fresh 0..n-1 index.
atom_type = df['Atom_Type']
is_csp = df['Residue_Name'].eq('CSP')

# df1: atoms of type 'o' or 'os', from any residue.
# (Earlier experiments restricted this to Residue_ID == 5, or selected by
#  Atom_Name starting with 'O'; neither restriction is applied here.)
df1 = df.loc[atom_type.isin(['o', 'os'])].reset_index(drop=True)

# df2: atoms of type 'n' that belong to CSP residues.
df2 = df.loc[is_csp & atom_type.eq('n')].reset_index(drop=True)

# df3: atoms of type 'hn' that belong to CSP residues.
df3 = df.loc[is_csp & atom_type.eq('hn')].reset_index(drop=True)

In [8]:
def calculate_distance(row1, row2):
    """
    Return the straight-line (Euclidean) distance between two points.

    Parameters:
        row1, row2: any objects exposing numeric attributes X, Y and Z
            (e.g. a pandas Series or an itertuples() namedtuple).

    Returns:
        The length of the vector row1 - row2 (float).
    """
    # Per-axis differences; their 2-norm is the distance.
    offsets = [getattr(row1, axis) - getattr(row2, axis) for axis in ("X", "Y", "Z")]
    return np.linalg.norm(offsets)

def calculate_angle(vec1, vec2):
    """
    Calculate the angle between two vectors (in degrees).

    Parameters:
        vec1, vec2: numpy arrays (or array-likes) representing the vectors.

    Returns:
        Angle between vec1 and vec2 in degrees (float, in [0, 180]).
        NaN is returned when either vector has zero length, because the
        angle is undefined in that case.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero-length vector would make the division below 0/0, which emits a
    # RuntimeWarning before yielding NaN; return NaN explicitly instead.
    if norm_product == 0:
        return np.float64(np.nan)

    # Cosine of the angle from the dot product.
    cos_theta = np.dot(vec1, vec2) / norm_product
    # Rounding can push the ratio slightly outside [-1, 1], where arccos is
    # undefined, so clamp it first.
    cos_theta = np.clip(cos_theta, -1.0, 1.0)
    return np.degrees(np.arccos(cos_theta))


In [14]:
# Collect, per time frame, every (df1 atom, df2/df3 pair) combination that
# passes the distance test and, independently, every combination that passes
# the angle test.
#
# Geometry used below:
#   distance = |row1 - row2|                      (df1 atom to df2 atom)
#   angle    = angle at row3 between the vectors  row3->row1 and row3->row2
# df2 and df3 rows are paired by position within the frame, i.e. the k-th df2
# row is assumed to belong with the k-th df3 row — TODO confirm this ordering
# holds for every input file.
#
# NOTE: this is a pure-Python triple loop (frames x df1 rows x df2 rows) and
# is the slow step of the notebook.

DISTANCE_CUTOFF = 3.5   # keep pairs with distance <= cutoff (coordinate units of the input)
ANGLE_MIN_DEG = 135     # keep pairs with ANGLE_MIN_DEG <= angle ...
ANGLE_MAX_DEG = 180     # ... < ANGLE_MAX_DEG (upper bound is exclusive)

# Columns shared by both result tables; the measured value is appended last.
PAIR_COLUMNS = [
    'source', 'src_x', 'src_y', 'src_z', 'src_mol',
    'dst', 'dst_x', 'dst_y', 'dst_z', 'dst_mol',
    'dst_idx', 'time',
]


def _pair_record(src_row, dst_row, t, value_name, value):
    """Build one result row describing src_row -> dst_row at time t."""
    return {
        'source': src_row.node,
        'src_x': src_row.X,
        'src_y': src_row.Y,
        'src_z': src_row.Z,
        'src_mol': src_row.Residue_ID,
        'dst': dst_row.node,
        'dst_x': dst_row.X,
        'dst_y': dst_row.Y,
        'dst_z': dst_row.Z,
        'dst_mol': dst_row.Residue_ID,
        # Position of the df2/df3 pair inside this time frame (0..n-1), not
        # the row's index in df2/df3. It is what later links a distance hit
        # to the angle hit of the same pair.
        'dst_idx': dst_row.Index,
        'time': t,
        value_name: value,
    }


distance_results = []
angle_results = []

for t in df1['time'].unique():
    # Rows of the current frame, re-indexed 0..n-1 so that itertuples().Index
    # is the within-frame position.
    df1_time = df1[df1['time'] == t].reset_index(drop=True)
    df2_time = df2[df2['time'] == t].reset_index(drop=True)
    df3_time = df3[df3['time'] == t].reset_index(drop=True)

    # zip() would silently drop the surplus rows and mis-pair the rest if the
    # two frames differed in length, so fail loudly instead.
    if len(df2_time) != len(df3_time):
        raise ValueError(
            f"time {t}: df2 has {len(df2_time)} rows but df3 has {len(df3_time)}; "
            "positional pairing requires equal counts"
        )

    # The pairs are the same for every df1 row, so build them once per frame.
    paired_rows = list(zip(df2_time.itertuples(), df3_time.itertuples()))

    for row1 in df1_time.itertuples(index=False):
        for row2, row3 in paired_rows:
            # Distance between the df1 atom and the df2 atom.
            distance = calculate_distance(row1, row2)

            # Vectors from the df3 atom to the df1 atom and to the df2 atom.
            vec3_to_df1 = np.array([row1.X - row3.X, row1.Y - row3.Y, row1.Z - row3.Z])
            vec3_to_df2 = np.array([row2.X - row3.X, row2.Y - row3.Y, row2.Z - row3.Z])
            angle = calculate_angle(vec3_to_df1, vec3_to_df2)

            # The two criteria are recorded independently; they are
            # intersected in a later cell.
            if distance <= DISTANCE_CUTOFF:
                distance_results.append(_pair_record(row1, row2, t, 'distance', distance))
            if ANGLE_MIN_DEG <= angle < ANGLE_MAX_DEG:
                angle_results.append(_pair_record(row1, row3, t, 'angle', angle))

# Explicit columns keep the schema intact even when a list is empty.
df_distance = pd.DataFrame(distance_results, columns=PAIR_COLUMNS + ['distance'])
df_angle = pd.DataFrame(angle_results, columns=PAIR_COLUMNS + ['angle'])

In [19]:
# Display the pairs that passed the distance test (distance <= 3.5) in the pairing loop.
df_distance

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance
0,O8_1,12.433211,1.435311,29.606905,1,N11_1,12.264816,2.733424,27.667477,1,0,0,2.339837
1,O10_1,10.361735,2.108788,28.901592,1,N11_1,12.264816,2.733424,27.667477,1,0,0,2.352643
2,O10_1,10.361735,2.108788,28.901592,1,N79_1,8.222926,3.125758,30.759445,1,5,0,3.010041
3,O20_1,11.626869,5.605087,34.612240,1,N23_1,10.870185,7.574643,35.393948,1,1,0,2.250064
4,O22_1,9.349302,5.952352,34.952969,1,N23_1,10.870185,7.574643,35.393948,1,1,0,2.267019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53320,O780_4,67.037613,1.314091,31.964518,4,N783_4,68.806839,0.019830,31.714603,4,215,100,2.206293
53321,O782_4,68.014343,0.396020,33.875980,4,N771_4,66.593819,-2.611652,33.574306,4,214,100,3.339908
53322,O782_4,68.014343,0.396020,33.875980,4,N783_4,68.806839,0.019830,31.714603,4,215,100,2.332621
53323,O2_11,7.247553,37.513149,34.676311,11,N407_1,4.825146,37.457039,32.857952,1,27,100,3.029462


In [20]:
# Display the pairs that passed the angle test (135 <= angle < 180) in the pairing loop.
df_angle

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle
0,O8_1,12.433211,1.435311,29.606905,1,H46_1,15.508764,3.906857,27.851604,1,3,0,150.071960
1,O8_1,12.433211,1.435311,29.606905,1,H66_1,8.854777,2.591426,30.175131,1,5,0,146.449027
2,O8_1,12.433211,1.435311,29.606905,1,H83_1,8.434090,7.601198,32.716743,1,6,0,158.491808
3,O8_1,12.433211,1.435311,29.606905,1,H177_1,6.069104,16.899189,34.597523,1,14,0,148.473702
4,O8_1,12.433211,1.435311,29.606905,1,H194_1,7.083150,21.469265,33.176544,1,15,0,161.601357
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397266,O2_14,18.735296,65.236671,52.672852,14,H342_4,61.639530,32.120834,26.427254,4,189,100,147.678528
1397267,O2_14,18.735296,65.236671,52.672852,14,H352_4,63.748493,31.840502,35.975681,4,190,100,154.155573
1397268,O2_14,18.735296,65.236671,52.672852,14,H399_4,66.616615,31.519463,28.386499,4,194,100,137.351225
1397269,O2_14,18.735296,65.236671,52.672852,14,H453_4,59.574516,24.653601,26.177637,4,198,100,161.729362


In [55]:
# A distance hit and an angle hit describe the same pair when they agree on
# these three columns (the source coordinates / src_mol are not part of the key).
key_cols = ['source', 'time', 'dst_idx']

# Distinct keys present in each table ...
distance_keys = df_distance[key_cols].drop_duplicates()
angle_keys = df_angle[key_cols].drop_duplicates()

# ... and the keys that occur in both.
common_keys = distance_keys.merge(angle_keys, how='inner', on=key_cols)

# Restrict each table to the rows whose key is shared by the other one.
df_distance_common = pd.merge(df_distance, common_keys, how='inner', on=key_cols)
df_angle_common = pd.merge(df_angle, common_keys, how='inner', on=key_cols)

In [27]:
# Display the distance rows whose (source, time, dst_idx) key also appears in df_angle.
df_distance_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance
0,O10_1,10.361735,2.108788,28.901592,1,N79_1,8.222926,3.125758,30.759445,1,5,0,3.010041
1,O184_1,7.992947,17.905865,26.629707,1,N255_1,5.056916,18.569294,25.397465,1,17,0,3.252512
2,O210_1,6.694972,19.880878,33.721676,1,N231_1,6.883698,22.451603,33.030262,1,15,0,2.668763
3,O284_1,8.515759,29.113972,28.234339,1,N363_1,8.105013,32.489857,28.502953,1,24,0,3.411373
4,O406_1,3.561949,37.875099,31.388666,1,N451_1,1.155581,38.586143,29.658045,1,30,0,3.048154
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4169,O430_4,61.309315,34.199886,35.109455,4,N451_4,60.816998,30.871206,34.644547,4,192,100,3.396855
4170,O518_4,58.692467,26.303473,26.753759,4,N539_4,59.959152,23.914938,25.600922,4,198,100,2.939154
4171,O626_4,60.130718,14.119806,30.194197,4,N695_4,59.130833,11.390244,30.036358,4,209,100,2.911219
4172,O636_4,63.297779,11.514465,25.500071,4,N651_4,66.067627,12.940304,24.594414,4,206,100,3.244270


In [28]:
# Display the angle rows whose (source, time, dst_idx) key also appears in df_distance.
df_angle_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle
0,O10_1,10.361735,2.108788,28.901592,1,H66_1,8.854777,2.591426,30.175131,1,5,0,161.759281
1,O184_1,7.992947,17.905865,26.629707,1,H214_1,6.065125,18.591055,25.301497,1,17,0,136.799665
2,O210_1,6.694972,19.880878,33.721676,1,H194_1,7.083150,21.469265,33.176544,1,15,0,153.506310
3,O284_1,8.515759,29.113972,28.234339,1,H305_1,7.875639,31.549356,28.204592,1,24,0,146.807099
4,O406_1,3.561949,37.875099,31.388666,1,H379_1,1.872544,38.431503,30.356771,1,30,0,165.675379
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4169,O430_4,61.309315,34.199886,35.109455,4,H379_4,60.890587,31.881531,34.644096,4,192,100,167.278816
4170,O518_4,58.692467,26.303473,26.753759,4,H453_4,59.574516,24.653601,26.177637,4,198,100,162.404781
4171,O626_4,60.130718,14.119806,30.194197,4,H584_4,59.622841,12.265118,30.173088,4,209,100,164.297523
4172,O636_4,63.297779,11.514465,25.500071,4,H547_4,65.534172,12.380187,25.248531,4,206,100,138.963857


In [56]:
# Give every node label one integer id shared by both edge tables.
# All source and dst labels are pooled first so that the same label gets the
# same id regardless of which table or column it appears in.
combined_values = pd.concat([
    df_distance_common['source'], df_distance_common['dst'],
    df_angle_common['source'], df_angle_common['dst']
])

# factorize numbers the distinct labels 0, 1, 2, ... in order of first
# appearance; unique_labels[i] is the label that received id i.
numeric_indices, unique_labels = pd.factorize(combined_values)

# label -> id lookup, built from the distinct labels rather than from a
# Series indexed by the full (duplicate-laden) label list.
mapping = {label: node_id for node_id, label in enumerate(unique_labels)}

# The loop variable is deliberately NOT called `df`: that name holds the raw
# atom table loaded at the top of the notebook, and rebinding it here would
# break any re-run of the earlier cells.
for edge_frame in (df_distance_common, df_angle_common):
    edge_frame['src'] = edge_frame['source'].map(mapping)
    # 'dst' is overwritten in place: from here on it holds the integer id,
    # not the atom label.
    edge_frame['dst'] = edge_frame['dst'].map(mapping)





In [51]:
# Display the distance edges again; after the mapping cell, 'dst' holds integer ids and 'src' has been added.
df_distance_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance,src
0,O10_1,10.361735,2.108788,28.901592,1,97,8.222926,3.125758,30.759445,1,5,0,3.010041,0
1,O184_1,7.992947,17.905865,26.629707,1,98,5.056916,18.569294,25.397465,1,17,0,3.252512,1
2,O210_1,6.694972,19.880878,33.721676,1,99,6.883698,22.451603,33.030262,1,15,0,2.668763,2
3,O284_1,8.515759,29.113972,28.234339,1,100,8.105013,32.489857,28.502953,1,24,0,3.411373,3
4,O406_1,3.561949,37.875099,31.388666,1,101,1.155581,38.586143,29.658045,1,30,0,3.048154,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4169,O430_4,61.309315,34.199886,35.109455,4,152,60.816998,30.871206,34.644547,4,192,100,3.396855,52
4170,O518_4,58.692467,26.303473,26.753759,4,131,59.959152,23.914938,25.600922,4,198,100,2.939154,34
4171,O626_4,60.130718,14.119806,30.194197,4,133,59.130833,11.390244,30.036358,4,209,100,2.911219,35
4172,O636_4,63.297779,11.514465,25.500071,4,144,66.067627,12.940304,24.594414,4,206,100,3.244270,46


In [52]:
# Display the angle edges again; after the mapping cell, 'dst' holds integer ids and 'src' has been added.
df_angle_common

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle,src
0,O10_1,10.361735,2.108788,28.901592,1,184,8.854777,2.591426,30.175131,1,5,0,161.759281,0
1,O184_1,7.992947,17.905865,26.629707,1,185,6.065125,18.591055,25.301497,1,17,0,136.799665,1
2,O210_1,6.694972,19.880878,33.721676,1,186,7.083150,21.469265,33.176544,1,15,0,153.506310,2
3,O284_1,8.515759,29.113972,28.234339,1,187,7.875639,31.549356,28.204592,1,24,0,146.807099,3
4,O406_1,3.561949,37.875099,31.388666,1,188,1.872544,38.431503,30.356771,1,30,0,165.675379,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4169,O430_4,61.309315,34.199886,35.109455,4,239,60.890587,31.881531,34.644096,4,192,100,167.278816,52
4170,O518_4,58.692467,26.303473,26.753759,4,218,59.574516,24.653601,26.177637,4,198,100,162.404781,34
4171,O626_4,60.130718,14.119806,30.194197,4,220,59.622841,12.265118,30.173088,4,209,100,164.297523,35
4172,O636_4,63.297779,11.514465,25.500071,4,231,65.534172,12.380187,25.248531,4,206,100,138.963857,46


In [None]:
def generate_node_features(df, filename):
    """
    Build a node -> feature table from an edge table and write it to CSV.

    Each endpoint column ('src', 'dst') is paired with its molecule column
    ('src_mol', 'dst_mol'); the molecule id, cast to int, becomes the node's
    'feat'. Source endpoints are listed before destination endpoints, and if
    a node occurs more than once only its first occurrence is kept.

    Parameters:
        df: DataFrame with columns 'src', 'src_mol', 'dst', 'dst_mol'.
        filename: path of the CSV file to write (no index column).

    Returns:
        DataFrame with columns 'node' and 'feat', one row per node.
    """
    endpoint_tables = []
    for node_col, mol_col in (('src', 'src_mol'), ('dst', 'dst_mol')):
        endpoint = (
            df[[node_col, mol_col]]
            .drop_duplicates()
            .rename(columns={node_col: 'node', mol_col: 'feat'})
        )
        endpoint = endpoint.assign(feat=endpoint['feat'].astype(int))
        endpoint_tables.append(endpoint)

    stacked = pd.concat(endpoint_tables, ignore_index=True)
    # A node may appear as both source and destination; keep it once.
    node_features = stacked.drop_duplicates(subset=['node']).reset_index(drop=True)

    node_features.to_csv(filename, index=False)
    return node_features


def process_and_save_dataframe(df, filename, train_end_frac=0.7, val_end_frac=0.85):
    """
    Reduce an edge table to its graph columns, tag each row with a split
    label, and save the result to a CSV file.

    The output has the columns idx, src, dst, time, dst_idx, ext_roll where
    'idx' is the 0-based row number and 'ext_roll' is assigned by row
    position (presumably a train/validation/test split — confirm against the
    consumer of the file):
        0 for rows [0, int(n * train_end_frac))
        1 for rows [int(n * train_end_frac), int(n * val_end_frac))
        2 for rows [int(n * val_end_frac), n)

    Parameters:
        df: DataFrame with at least the columns 'src', 'dst', 'time',
            'dst_idx'. It is not modified. Its index may be anything.
        filename: path of the CSV file to write (no index column).
        train_end_frac: cumulative fraction of rows where label 0 ends.
        val_end_frac: cumulative fraction of rows where label 1 ends.

    Returns:
        The processed DataFrame that was written to `filename`.

    Raises:
        ValueError: if not 0 <= train_end_frac <= val_end_frac <= 1.
    """
    if not 0 <= train_end_frac <= val_end_frac <= 1:
        raise ValueError(
            "expected 0 <= train_end_frac <= val_end_frac <= 1, got "
            f"{train_end_frac} and {val_end_frac}"
        )

    # Work on a copy so the caller's frame is left untouched.
    edges = df[['src', 'dst', 'time', 'dst_idx']].copy()
    num_rows = len(edges)

    # Row number as the first column.
    edges.insert(0, 'idx', range(num_rows))

    train_end = int(num_rows * train_end_frac)
    val_end = int(num_rows * val_end_frac)

    edges['ext_roll'] = 0
    # Assign by POSITION (.iloc). Label-based .loc slicing with these numbers
    # is only correct when the index happens to be 0..n-1; on any other index
    # it labels the wrong rows.
    roll_col = edges.columns.get_loc('ext_roll')
    edges.iloc[train_end:val_end, roll_col] = 1
    edges.iloc[val_end:, roll_col] = 2

    # Save the updated DataFrame
    edges.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")
    return edges

In [58]:
# Output locations for the node-feature and edge tables.
dist_nodes_csv = 'DATA/FILT_HB/dist_node_features.csv'
angle_nodes_csv = 'DATA/FILT_HB/angle_node_features.csv'
dist_edges_csv = 'DATA/FILT_HB/edges1.csv'
angle_edges_csv = 'DATA/FILT_HB/edges2.csv'

# Node features first (distance table, then angle table) ...
generate_node_features(df_distance_common, dist_nodes_csv)
generate_node_features(df_angle_common, angle_nodes_csv)

# ... then the edge lists, in the same order.
process_and_save_dataframe(df_distance_common, dist_edges_csv)
process_and_save_dataframe(df_angle_common, angle_edges_csv)

Saved updated DataFrame to DATA/FILT_HB/edges1.csv
Saved updated DataFrame to DATA/FILT_HB/edges2.csv


Unnamed: 0,idx,src,dst,time,dst_idx,ext_roll
0,0,0,184,0,5,0
1,1,1,185,0,17,0
2,2,2,186,0,15,0
3,3,3,187,0,24,0
4,4,4,188,0,30,0
...,...,...,...,...,...,...
4169,4169,52,239,100,192,2
4170,4170,34,218,100,198,2
4171,4171,35,220,100,209,2
4172,4172,46,231,100,206,2
