In [19]:
# %%
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
from tqdm import tqdm

In [24]:

# Load the per-atom table for all timesteps.
# Expected columns: X, Y, Z, Atom_Name, Residue_Name, Residue_ID, Atom_Type, and Timeframe.
# NOTE(review): the displayed frame also carries an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index; read_csv(index_col=0) may be wanted — confirm.
filepath = "DATA/HB100frames.csv"  # Replace with the actual file path
df = pd.read_csv(filepath)


In [26]:

# Last timestep to keep (inclusive upper bound on the 'time' column).
MAX_TIME = 255002

# Normalise the time column name and filter first, so the string work below
# only touches the frames that are actually kept. No in-place mutation is used,
# which keeps this cell safe to re-run.
df = df.rename(columns={"Timeframe": "time"})
df = df[df['time'] <= MAX_TIME].reset_index(drop=True)

# Add a combined 'node' column of the form "<Atom_Name>_<Residue_ID>" (e.g. "C1_1").
# Vectorised string concatenation replaces the row-wise apply(axis=1).
df['node'] = df['Atom_Name'].astype(str) + "_" + df['Residue_ID'].astype(str)

# Display the updated dataframe
df


Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,12.759892,2.253709,33.260902,C1,CSP,1,cb,255000,C1_1
1,12.862613,3.581455,34.023949,C2,CSP,1,cb,255000,C2_1
2,11.457548,4.321817,34.003315,C3,CSP,1,cb,255000,C3_1
3,10.981806,4.421790,32.537914,C4,CSP,1,cb,255000,C4_1
4,11.038748,3.091581,31.915064,O5,CSP,1,ob,255000,O5_1
...,...,...,...,...,...,...,...,...,...
18373,22.572426,68.135628,48.624062,H8,SFL,14,ha,255002,H8_14
18374,24.384834,66.571251,48.630875,H9,SFL,14,ha,255002,H9_14
18375,21.865906,59.230358,53.457897,H10,SFL,14,ha,255002,H10_14
18376,22.107021,62.114513,56.612350,H11,SFL,14,ha,255002,H11_14


In [27]:
# Split the atom table into the three subsets used by the pairing loop.
# Alternative df1 selections kept from earlier experiments:
#   df1 = df[(df['Residue_ID'] == 5) & df['Atom_Type'].isin(['o', 'os'])].reset_index(drop=True)
#   df1 = df[df['Atom_Name'].str.startswith('O')].reset_index(drop=True)
oxygen_mask = df['Atom_Type'].isin(['o', 'os'])
csp_mask = df['Residue_Name'].eq('CSP')

# df1: every atom typed 'o' or 'os', regardless of residue
df1 = df.loc[oxygen_mask].reset_index(drop=True)
# df2 / df3: atoms of CSP residues typed 'n' and 'hn' respectively
df2 = df.loc[csp_mask & df['Atom_Type'].eq('n')].reset_index(drop=True)
df3 = df.loc[csp_mask & df['Atom_Type'].eq('hn')].reset_index(drop=True)


In [28]:
# Preview df1: rows whose Atom_Type is 'o' or 'os'
df1

Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,12.433211,1.435311,29.606905,O8,CSP,1,os,255000,O8_1
1,10.361735,2.108788,28.901592,O10,CSP,1,o,255000,O10_1
2,11.626869,5.605087,34.612240,O20,CSP,1,os,255000,O20_1
3,9.349302,5.952352,34.952969,O22,CSP,1,o,255000,O22_1
4,13.264276,3.367227,35.435097,O32,CSP,1,os,255000,O32_1
...,...,...,...,...,...,...,...,...,...
1351,53.091316,56.518734,35.831039,O2,SFL,12,o,255002,O2_12
1352,55.788090,25.110146,52.100243,O1,SFL,13,os,255002,O1_13
1353,52.054966,23.288534,51.849213,O2,SFL,13,o,255002,O2_13
1354,22.265154,63.964092,51.814655,O1,SFL,14,os,255002,O1_14


In [29]:
# Preview df2: CSP rows whose Atom_Type is 'n'
df2

Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,12.264816,2.733424,27.667477,N11,CSP,1,n,255000,N11_1
1,10.870185,7.574643,35.393948,N23,CSP,1,n,255000,N23_1
2,14.704678,3.006332,37.072620,N35,CSP,1,n,255000,N35_1
3,15.994886,4.793599,27.792072,N55,CSP,1,n,255000,N55_1
4,6.788294,7.013593,27.190819,N67,CSP,1,n,255000,N67_1
...,...,...,...,...,...,...,...,...,...
643,67.612061,3.893230,35.582737,N727,CSP,4,n,255002,N727_4
644,67.424667,9.088247,34.325668,N739,CSP,4,n,255002,N739_4
645,59.683479,4.133283,33.440525,N759,CSP,4,n,255002,N759_4
646,66.483582,-2.997958,33.256054,N771,CSP,4,n,255002,N771_4


In [30]:
# Preview df3: CSP rows whose Atom_Type is 'hn'
df3

Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,13.275788,2.672540,27.647526,H9,CSP,1,hn,255000,H9_1
1,11.846371,7.829625,35.303318,H19,CSP,1,hn,255000,H19_1
2,13.847552,2.699203,37.516666,H29,CSP,1,hn,255000,H29_1
3,15.508764,3.906857,27.851604,H46,CSP,1,hn,255000,H46_1
4,6.425198,6.592360,28.037516,H56,CSP,1,hn,255000,H56_1
...,...,...,...,...,...,...,...,...,...
643,66.711334,4.256098,35.871166,H611,CSP,4,hn,255002,H611_4
644,68.040222,9.225883,33.533009,H621,CSP,4,hn,255002,H621_4
645,59.074749,4.243810,32.638405,H638,CSP,4,hn,255002,H638_4
646,66.037567,-2.783470,34.139931,H648,CSP,4,hn,255002,H648_4


In [None]:
def calculate_distance(row1, row2):
    """
    Return the straight-line (Euclidean) distance between two points.

    Parameters:
        row1, row2: any objects exposing numeric ``X``, ``Y`` and ``Z``
            attributes (e.g. a pandas Series or an ``itertuples`` row).

    Returns:
        Length of the vector from ``row2`` to ``row1`` (float).
    """
    # Component-wise difference, then its 2-norm.
    delta = np.array([row1.X - row2.X, row1.Y - row2.Y, row1.Z - row2.Z])
    return np.linalg.norm(delta)

def calculate_angle(vec1, vec2):
    """
    Calculate the angle between two vectors (in degrees).

    Parameters:
        vec1, vec2: numpy arrays representing the vectors.

    Returns:
        Angle between vec1 and vec2 in degrees (float, in [0, 180]).
        NaN is returned when either vector has zero length, because the
        angle is undefined in that case.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the degenerate case explicitly instead of dividing 0 by 0, which
    # would emit a RuntimeWarning before producing the same NaN.
    if norm_product == 0:
        return np.nan

    # Cosine of the angle from the dot product; clip to absorb rounding that
    # pushes the value marginally outside arccos' domain [-1, 1].
    cos_theta = np.clip(np.dot(vec1, vec2) / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(cos_theta))


# Function to process DataFrame and save it to a CSV file
def process_and_save_dataframe(df, filename):
    """
    Build the integer edge table, write it to ``filename`` and return it.

    The result has exactly the columns ``idx, src, dst, time, label,
    ext_roll, nh_id`` (in that order), all cast to integers:

    - ``idx`` is the 0-based row position.
    - ``ext_roll`` flags a positional split of the rows: 0 for the first 70%,
      1 for the next 15% and 2 for the final 15%.
    - Values that cannot be parsed as numbers become 0 before the integer cast.

    The input frame is not modified (all work happens on a copy), so the
    function can be called repeatedly on the same frame, and the split is
    computed by row position, so it does not depend on the frame's index labels.

    Parameters:
    - df (pd.DataFrame): Input DataFrame; must contain the columns
      'src', 'dst', 'time', 'label' and 'nh_id'.
    - filename (str): Output file path

    Returns:
    - pd.DataFrame: The processed frame that was written to ``filename``.

    Raises:
    - KeyError: if one of the required columns is missing.
    """
    num_rows = len(df)
    first_cut = int(num_rows * 0.7)
    second_cut = int(num_rows * 0.85)

    # Positional split flags: rows [first_cut, second_cut) -> 1, rows from second_cut on -> 2.
    ext_roll = np.zeros(num_rows, dtype=int)
    ext_roll[first_cut:second_cut] = 1
    ext_roll[second_cut:] = 2

    # Explicit copy: keeps the caller's frame untouched and avoids
    # SettingWithCopyWarning on the column assignments below.
    out = df[['src', 'dst', 'time', 'label', 'nh_id']].copy()
    out['idx'] = range(num_rows)
    out['ext_roll'] = ext_roll
    out = out[['idx', 'src', 'dst', 'time', 'label', 'ext_roll', 'nh_id']].copy()

    # Convert all columns to integers (unparseable values -> 0)
    for col in out.columns:
        out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0).astype(int)

    # Save the updated DataFrame to the specified CSV file
    out.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")

    return out

In [None]:
# Geometric cut-offs for recording a pair.
MAX_PAIR_DISTANCE = 3.5   # df1–df2 distance must be <= this (units of the input coordinates, presumably Å — confirm)
MIN_PAIR_ANGLE = 135      # angle at the df3 atom must satisfy MIN_PAIR_ANGLE <= angle < MAX_PAIR_ANGLE (degrees)
MAX_PAIR_ANGLE = 180

# Calculate angles and distances
distance_results = []
angle_results = []

for t in df1['time'].unique():
    # Rows of each subset that belong to the current frame. The index is reset so
    # that row.Index below is the 0-based position of the row within this frame.
    df1_time = df1[df1['time'] == t].reset_index(drop=True)
    df2_time = df2[df2['time'] == t].reset_index(drop=True)
    df3_time = df3[df3['time'] == t].reset_index(drop=True)

    # df2 and df3 rows are paired purely by position. zip() would silently drop the
    # surplus rows of the longer frame, so fail loudly instead of mis-pairing atoms.
    if len(df2_time) != len(df3_time):
        raise ValueError(
            f"time {t}: df2 has {len(df2_time)} rows but df3 has {len(df3_time)}; "
            "rows are paired by position and must match one-to-one"
        )

    # Materialise the (df2 row, df3 row) pairs once per frame rather than
    # rebuilding the tuple iterators for every df1 row.
    paired_rows = list(zip(df2_time.itertuples(), df3_time.itertuples()))

    # For each row in df1, visit every positional (df2, df3) pair
    for row1 in df1_time.itertuples(index=False):
        for row2, row3 in paired_rows:
            # Euclidean distance between row1 (from df1) and row2 (from df2)
            distance = calculate_distance(row1, row2)

            # Angle at row3 (from df3) between the directions to row1 and to row2
            vec3_to_df1 = np.array([row1.X - row3.X, row1.Y - row3.Y, row1.Z - row3.Z])
            vec3_to_df2 = np.array([row2.X - row3.X, row2.Y - row3.Y, row2.Z - row3.Z])
            angle = calculate_angle(vec3_to_df1, vec3_to_df2)

            # Record the pair when the distance criterion is met
            if distance <= MAX_PAIR_DISTANCE:
                distance_results.append({
                    'src': row1.node,
                    'src_x': row1.X,
                    'src_y': row1.Y,
                    'src_z': row1.Z,
                    'src_mol': row1.Residue_ID,
                    'dst1': row2.node,
                    'dst1_x': row2.X,
                    'dst1_y': row2.Y,
                    'dst1_z': row2.Z,
                    'dst1_mol': row2.Residue_ID,
                    'nh_id': row2.Index,   # position of the pair within this frame's df2 rows
                    'time': t
                })
            # Record the pair when the angle criterion is met (upper bound exclusive)
            if MIN_PAIR_ANGLE <= angle < MAX_PAIR_ANGLE:
                angle_results.append({
                    'src': row1.node,
                    'src_x': row1.X,
                    'src_y': row1.Y,
                    'src_z': row1.Z,
                    'src_mol': row1.Residue_ID,
                    'dst2': row3.node,
                    'dst2_x': row3.X,
                    'dst2_y': row3.Y,
                    'dst2_z': row3.Z,
                    'dst2_mol': row3.Residue_ID,
                    'nh_id': row3.Index,   # position of the pair within this frame's df3 rows
                    'time': t
                })

# Convert your results lists into DataFrames
df_distance = pd.DataFrame(distance_results)
df_angle = pd.DataFrame(angle_results)

# Display DataFrames
print(df_distance)


         src      src_x      src_y      src_z  src_mol    dst1     dst1_x  \
0       O8_1  12.433211   1.435311  29.606905        1   N11_1  12.264816   
1      O10_1  10.361735   2.108788  28.901592        1   N11_1  12.264816   
2      O10_1  10.361735   2.108788  28.901592        1   N79_1   8.222926   
3      O20_1  11.626869   5.605087  34.612240        1   N23_1  10.870185   
4      O22_1   9.349302   5.952352  34.952969        1   N23_1  10.870185   
...      ...        ...        ...        ...      ...     ...        ...   
1560  O770_4  66.713104  -1.938393  31.188515        4  N783_4  69.168358   
1561  O780_4  66.975395   1.172261  31.525860        4  N783_4  69.168358   
1562  O782_4  67.982834   0.488910  33.487720        4  N783_4  69.168358   
1563   O2_11   7.569382  36.959530  33.927692       11  N407_1   4.843523   
1564   O2_11   7.569382  36.959530  33.927692       11  N475_1   8.781454   

         dst1_y     dst1_z  dst1_mol  nh_id    time  
0      2.733424  27.6

In [48]:
# Show the rows that satisfied the angle criterion
print(df_angle)

         src      src_x      src_y      src_z  src_mol    dst2     dst2_x  \
0       O8_1  12.433211   1.435311  29.606905        1   H46_1  15.508764   
1       O8_1  12.433211   1.435311  29.606905        1   H66_1   8.854777   
2       O8_1  12.433211   1.435311  29.606905        1   H83_1   8.434090   
3       O8_1  12.433211   1.435311  29.606905        1  H177_1   6.069104   
4       O8_1  12.433211   1.435311  29.606905        1  H194_1   7.083150   
...      ...        ...        ...        ...      ...     ...        ...   
42136  O2_14  19.025227  66.322433  51.679279       14  H342_4  61.478928   
42137  O2_14  19.025227  66.322433  51.679279       14  H352_4  63.208519   
42138  O2_14  19.025227  66.322433  51.679279       14  H436_4  57.534077   
42139  O2_14  19.025227  66.322433  51.679279       14  H453_4  59.728848   
42140  O2_14  19.025227  66.322433  51.679279       14  H611_4  66.711334   

          dst2_y     dst2_z  dst2_mol  nh_id    time  
0       3.906857  27

In [56]:


# Outer-join the distance hits and the angle hits on the shared df1-atom columns,
# the frame and the pair id, so every row keeps whichever of the dst1_* / dst2_*
# column groups it has a match for (adjust keys as necessary).
merge_keys = ['src', 'src_x', 'src_y', 'src_z', 'src_mol', 'time', 'nh_id']
merged_df = df_distance.merge(df_angle, how='outer', on=merge_keys)


# Classify a merged row by which of its 'dst1' / 'dst2' entries are populated
def determine_type(row):
    """
    Return the match type of a row.

    1 -> only 'dst1' is set, 2 -> only 'dst2' is set, 3 -> both are set,
    NaN -> neither is set.
    """
    has_dst1 = pd.notna(row['dst1'])
    has_dst2 = pd.notna(row['dst2'])
    if has_dst1 and has_dst2:
        return 3
    if has_dst1:
        return 1
    if has_dst2:
        return 2
    return np.nan  # If neither exists

# Row-wise classification: 1 = only dst1 present, 2 = only dst2 present, 3 = both (NaN if neither)
merged_df['type'] = merged_df.apply(determine_type, axis=1)


# Gather several column values of one row into a single list
def combine_columns(row, cols):
    """
    Return ``[row[c] for c in cols]``, or NaN when every one of those values
    is null (which includes the case of an empty ``cols``).
    """
    picked = []
    found_value = False
    for col in cols:
        value = row[col]
        picked.append(value)
        if not pd.isnull(value):
            found_value = True
    return picked if found_value else np.nan

# Pack each group of per-atom columns into one list-valued column. Groups are
# processed in the order listed, which is also the order in which the new
# columns are appended to merged_df.
column_groups = {
    'src_coords': ['src_x', 'src_y', 'src_z', 'src_mol'],        # df1 atom
    'dst1_coords': ['dst1_x', 'dst1_y', 'dst1_z', 'dst1_mol'],   # partner from the distance results
    'dst2_coords': ['dst2_x', 'dst2_y', 'dst2_z', 'dst2_mol'],   # partner from the angle results
}
for target, members in column_groups.items():
    merged_df[target] = merged_df.apply(combine_columns, axis=1, args=(members,))

# Optionally drop the individual coordinate columns
cols_to_drop = [col for members in column_groups.values() for col in members]
merged_df = merged_df.drop(columns=cols_to_drop)

print(merged_df)

         src   dst1  nh_id    time    dst2  type  \
0       O8_1  N11_1      0  255000     NaN     1   
1      O10_1  N11_1      0  255000     NaN     1   
2      O10_1  N79_1      5  255000   H66_1     3   
3      O20_1  N23_1      1  255000     NaN     1   
4      O22_1  N23_1      1  255000     NaN     1   
...      ...    ...    ...     ...     ...   ...   
43575  O2_14    NaN    189  255002  H342_4     2   
43576  O2_14    NaN    190  255002  H352_4     2   
43577  O2_14    NaN    197  255002  H436_4     2   
43578  O2_14    NaN    198  255002  H453_4     2   
43579  O2_14    NaN    211  255002  H611_4     2   

                                              src_coords  \
0      [12.43321132659912, 1.435310959815979, 29.6069...   
1      [10.361735343933104, 2.10878849029541, 28.9015...   
2      [10.361735343933104, 2.10878849029541, 28.9015...   
3      [11.626869201660156, 5.6050872802734375, 34.61...   
4      [9.349302291870115, 5.952352046966553, 34.9529...   
...            

In [None]:
# Replace the string 'src' label with an integer id, numbered from 1 in order of
# first appearance, then drop the label and list-valued columns.
# NOTE(review): the saved output of this cell shows a 'src_numeric' column while the
# code writes 'source' — the output appears to come from an earlier version; re-run to confirm.
src_codes = pd.factorize(merged_df['src'])[0]
merged_df = merged_df.assign(source=src_codes + 1).drop(
    columns=['src', 'dst1', 'dst2', 'src_coords', 'dst1_coords', 'dst2_coords']
)
print(merged_df)


       nh_id    time  type  src_numeric
0          0  255000     1            1
1          0  255000     1            2
2          5  255000     3            2
3          1  255000     1            3
4          1  255000     1            4
...      ...     ...   ...          ...
43575    189  255002     2          452
43576    190  255002     2          452
43577    197  255002     2          452
43578    198  255002     2          452
43579    211  255002     2          452

[43580 rows x 4 columns]


In [26]:
# NOTE(review): filtered_dist_df and filtered_angle_df are not assigned in any live cell
# shown in this notebook (they only appear in commented-out cells), and this cell's
# execution count (26) is lower than the merge cell's (56). On a fresh kernel this cell
# will presumably raise NameError — define both frames explicitly before it (TODO confirm
# how they were derived; the function requires 'src', 'dst', 'time', 'label', 'nh_id').

# Process and save distance DataFrame
final_dist_df = process_and_save_dataframe(filtered_dist_df, 'DATA/HB/edges1.csv')

# Process and save angle DataFrame
final_angle_df = process_and_save_dataframe(filtered_angle_df, 'DATA/HB/edges2.csv')

# Print both DataFrames (pandas truncates the printed view to the first and last rows)
print("Updated Distance DataFrame:")
print(final_dist_df)

print("\nUpdated Angle DataFrame:")
print(final_angle_df)

Saved updated DataFrame to DATA/HB/edges1.csv
Saved updated DataFrame to DATA/HB/edges2.csv
Updated Distance DataFrame:
       idx  src  dst    time  label  ext_roll  nh_id
0        0    1  101  255000      1         0      6
1        1    2  102  255000      1         0     18
2        2    3  103  255000      1         0     16
3        3    4  104  255000      1         0     25
4        4    5  105  255000      1         0     31
...    ...  ...  ...     ...    ...       ...    ...
4155  4155   54  156  255099      1         2    193
4156  4156   36  136  255099      1         2    199
4157  4157   37  138  255099      1         2    210
4158  4158   39  140  255099      2         2     28
4159  4159   39  149  255099      2         2     33

[4160 rows x 7 columns]

Updated Angle DataFrame:
       idx  src  dst    time  label  ext_roll  nh_id
0        0    1  188  255000      1         0      6
1        1    2  189  255000      1         0     18
2        2    3  190  255000      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).a

In [None]:
# from sklearn.preprocessing import OrdinalEncoder
# new_dist_df = filtered_dist_df.copy()
# new_angle_df = filtered_angle_df.copy()
# # Combine src and dst columns from both DataFrames to ensure consistent mapping
# combined_values = pd.concat([
#     new_dist_df [['src', 'dst']],
#     new_angle_df[['src', 'dst']]
# ])

# # Fit an OrdinalEncoder on the combined src and dst values
# encoder = OrdinalEncoder(dtype=int)
# encoder.fit(combined_values)

# # Apply the encoder to the src and dst columns in both DataFrames
# new_dist_df[['src', 'dst']] = encoder.transform(new_dist_df[['src', 'dst']]) + 1  # Start from 1
# new_angle_df[['src', 'dst']] = encoder.transform(new_angle_df[['src', 'dst']]) + 1  # Start from 1

# # Display the updated DataFrames
# print("Updated new_dist_df:")
# print(new_dist_df)

# print("\nUpdated new_angle_df:")
# print(new_angle_df)

In [None]:
# # Process and save distance DataFrame
# renew_dist_df = process_and_save_dataframe(new_dist_df, 'DATA/HB/edges3.csv')

# # Process and save angle DataFrame
# renew_angle_df = process_and_save_dataframe(new_angle_df, 'DATA/HB/edges4.csv')

# # Display the first few rows of both DataFrames
# print("Updated Distance DataFrame:")
# print(renew_dist_df)

# print("\nUpdated Angle DataFrame:")
# print(renew_angle_df)

In [27]:
# # Extract src and dst column features
# src_feats = filtered_df[['src', 'src_mol']].rename(columns={'src': 'node', 'src_mol': 'mol'})
# dst_feats = filtered_df[['dst', 'dst_mol']].rename(columns={'dst': 'node', 'dst_mol': 'mol'})

# # Concatenate src and dst features
# node_feats = pd.concat([src_feats, dst_feats])

# # Drop duplicates
# node_feats = node_feats.drop_duplicates().reset_index(drop=True)

# # Save to CSV
# node_feats.to_csv('DATA/HB/node_feats.csv', index=False)

# # Display the node features
# print(node_feats)

In [28]:
# # Encode nodes into unique integers
# le_node = LabelEncoder()

# # Fit the encoder on the combined 'src' and 'dst' columns from both dataframes
# all_nodes = pd.concat([filtered_dist_df['src'], filtered_dist_df['dst'], filtered_angle_df['src'], filtered_angle_df['dst']])
# le_node.fit(all_nodes)

# # Transform the 'src' and 'dst' columns in both dataframes
# filtered_dist_df['src'] = le_node.transform(filtered_dist_df['src'])
# filtered_dist_df['dst'] = le_node.transform(filtered_dist_df['dst']) + filtered_dist_df['src'].max() + 1

# filtered_angle_df['src'] = le_node.transform(filtered_angle_df['src'])
# filtered_angle_df['dst'] = le_node.transform(filtered_angle_df['dst']) + filtered_angle_df['src'].max() + 1

# # Display the first few rows of each dataframe
# print(filtered_dist_df)
# print(filtered_angle_df)

In [29]:
# # Get unique nodes from node_1, node_2, and node_3 columns
# unique_nodes_1 = filtered_df['src'].unique()
# unique_nodes_2 = filtered_df['dst'].unique()

# # Assign a color for each group (e.g., tab10 colormap)
# color_1 = plt.cm.tab10(0)  # Color for node_1 group
# color_2 = plt.cm.tab10(1)  # Color for node_2 group
# color_3 = plt.cm.tab10(2)  # Color for node_3 group

# # Create a combined color map based on the node groups
# node_colors = {}
# for node in unique_nodes_1:
#     node_colors[node] = color_1
# for node in unique_nodes_2:
#     node_colors[node] = color_2


# # Create a graph
# G = nx.Graph()

# # Add edges between node_1 and node_2
# edges_1_2 = filtered_df[['node_1', 'node_2']].values.tolist()
# G.add_edges_from(edges_1_2)

# # Add edges between node_1 and node_3
# edges_1_3 = filtered_df[['node_1', 'node_3']].values.tolist()
# G.add_edges_from(edges_1_3)

# # Add edges between node_2 and node_3 (these will be bold)
# edges_2_3 = filtered_df[['node_2', 'node_3']].values.tolist()
# G.add_edges_from(edges_2_3)

# # Define the layout for the graph
# pos = nx.spring_layout(G)

# # Separate edges into normal and bold
# normal_edges = edges_1_2 + edges_1_3
# bold_edges = edges_2_3

# # Plot the graph
# plt.figure(figsize=(8, 4))
# nx.draw(
#     G,
#     pos,
#     with_labels=True,
#     node_size=500,
#     node_color=[node_colors[node] for node in G.nodes()],
#     font_size=10,
#     font_weight='bold',
#     edge_color='gray',
#     edgelist=normal_edges,
#     width=1  # Normal edges width
# )

# # Draw the bold edges separately
# nx.draw_networkx_edges(
#     G,
#     pos,
#     edgelist=bold_edges,
#     width=2,  # Bold edges width
#     edge_color='black'
# )

# plt.title('Graph')
# plt.show()
