In [52]:
# %%
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
from tqdm import tqdm

In [53]:

# Example DataFrame with columns X, Y, Z, Atom_Name, Residue_Name, Residue_ID, Atom_Type, and Timeframe
# Load nodes for all timesteps
filepath = "/home/mhanowar/Downloads/HB1000frames.csv"  # Replace with the actual file path
data = pd.read_csv(filepath)


In [54]:
# Filter the dataframe based on Timeframe
df = data.loc[data['Timeframe'] < 255002].copy()
df.rename(columns={"Timeframe": "time"}, inplace=True)

# Add a combined 'node' column
df['node'] = df.apply(lambda row: f"{row['Atom_Name']}_{row['Residue_ID']}", axis=1)
# Display the updated dataframe
df


Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,12.759892,2.253709,33.260902,C1,CSP,1,cb,255000,C1_1
1,12.862613,3.581455,34.023949,C2,CSP,1,cb,255000,C2_1
2,11.457548,4.321817,34.003315,C3,CSP,1,cb,255000,C3_1
3,10.981806,4.421790,32.537914,C4,CSP,1,cb,255000,C4_1
4,11.038748,3.091581,31.915064,O5,CSP,1,ob,255000,O5_1
...,...,...,...,...,...,...,...,...,...
12247,22.502382,67.522812,49.062340,H8,SFL,14,ha,255001,H8_14
12248,24.567900,66.078522,49.387047,H9,SFL,14,ha,255001,H9_14
12249,20.595137,58.445484,53.666969,H10,SFL,14,ha,255001,H10_14
12250,22.484032,60.872322,56.734215,H11,SFL,14,ha,255001,H11_14


In [55]:
#Select rows where Residue_ID is 5 and Atom_Type is either 'o' or 'os'
# df1 = df[(df['Residue_ID'] == 5) & df['Atom_Type'].isin(['o', 'os'])].reset_index(drop=True)
# Select all O Atoms
# Select relevant atom types and residue names for calculations
df1 = df[df['Atom_Name'].str.startswith('O')].reset_index(drop=True)
df2 = df[(df['Residue_Name'] == 'CSP') & (df['Atom_Type'] == 'n')].reset_index(drop=True)
df3 = df[(df['Residue_Name'] == 'CSP') & (df['Atom_Type'] == 'hn')].reset_index(drop=True)

In [56]:
# Helper Functions
def calculate_angle(vec1, vec2):
    """Calculate angle between two vectors."""
    cos_theta = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    angle_rad = np.arccos(np.clip(cos_theta, -1.0, 1.0))
    return np.degrees(angle_rad)

def calculate_distance(row1, row2):
    """Calculate Euclidean distance."""
    return np.linalg.norm(np.array([row1.X, row1.Y, row1.Z]) - np.array([row2.X, row2.Y, row2.Z]))

def process_results(df):
    """Process DataFrame to add molecular IDs, unique nh_id, and labels."""
    df['src_mol'] = df['src'].apply(lambda x: int(x.split('_')[1]))
    df['dst_mol'] = df['dst'].apply(lambda x: int(x.split('_')[1]))
    df['nh_id'] = pd.factorize(df['dst'])[0] + 1
    df['label'] = np.where(
        (df['src_mol'].between(5, 14)) & (df['dst_mol'].between(1, 4)), 2, 1
    )
    return df

# Function to process DataFrame and save it to a CSV file
def process_and_save_dataframe(df, filename):
    """
    Process the input DataFrame to add 'ext_roll' and 'idx' columns.
    Save the processed DataFrame to a specified CSV file.

    Parameters:
    - df (pd.DataFrame): Input DataFrame
    - filename (str): Output file path
    """
    num_rows = len(df)

    # Initialize 'ext_roll' column with zeros
    df['ext_roll'] = 0

    # Assign 1 to the middle 15% rows and 2 to the last 15% rows
    df.loc[int(num_rows * 0.7):int(num_rows * 0.85) - 1, 'ext_roll'] = 1
    df.loc[int(num_rows * 0.85):, 'ext_roll'] = 2

    # Insert an 'idx' column at the beginning
    df.insert(0, 'idx', range(len(df)))

    # Reindex and retain only the required columns
    df = df[['idx', 'src', 'dst', 'time', 'label', 'ext_roll', 'nh_id']]

    # Save the updated DataFrame to the specified CSV file
    df.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")

    return df




In [57]:
# Calculate angles and distances
distance_results = []
angle_results = []

for t in df1['time'].unique():
    df1_time = df1[df1['time'] == t].reset_index(drop=True)
    df2_time = df2[df2['time'] == t].reset_index(drop=True)
    df3_time = df3[df3['time'] == t].reset_index(drop=True)

    for row1 in df1_time.itertuples(index=False):
        for idx, row3 in df3_time.iterrows():
            if idx < len(df2_time):
                row2 = df2_time.loc[idx]

                # Vectors and calculations
                vec3_to_df1 = np.array([row1.X - row3.X, row1.Y - row3.Y, row1.Z - row3.Z])
                vec3_to_df2 = np.array([row2['X'] - row3.X, row2['Y'] - row3.Y, row2['Z'] - row3.Z])
                angle = calculate_angle(vec3_to_df1, vec3_to_df2)
                distance = calculate_distance(row1, row2)

                # Append results
                distance_results.append({'src': row1.node, 'dst': row2['node'], 'time': t, 'distance': distance})
                angle_results.append({'src': row1.node, 'dst': row3['node'], 'time': t, 'angle': angle})

# Convert results to DataFrames
dist_df = process_results(pd.DataFrame(distance_results))
angle_df = process_results(pd.DataFrame(angle_results))

# Display DataFrames
print(dist_df)
print(angle_df)

          src     dst    time   distance  src_mol  dst_mol  nh_id  label
0        O5_1   N11_1  255000   4.435484        1        1      1      1
1        O5_1   N23_1  255000   5.677049        1        1      2      1
2        O5_1   N35_1  255000   6.328246        1        1      3      1
3        O5_1   N55_1  255000   6.667776        1        1      4      1
4        O5_1   N67_1  255000   7.467732        1        1      5      1
...       ...     ...     ...        ...      ...      ...    ...    ...
257467  O2_14  N727_4  255001  80.193537       14        4    212      2
257468  O2_14  N739_4  255001  76.475652       14        4    213      2
257469  O2_14  N759_4  255001  76.042337       14        4    214      2
257470  O2_14  N771_4  255001  85.641525       14        4    215      2
257471  O2_14  N783_4  255001  84.852530       14        4    216      2

[257472 rows x 8 columns]
          src     dst    time       angle  src_mol  dst_mol  nh_id  label
0        O5_1    H9_1  

In [58]:
# Filter and label the distance DataFrame
filtered_dist_df = dist_df[dist_df['distance'] <= 15].copy()
filtered_dist_df.reset_index(drop=True, inplace=True)

# Filter and label the angle DataFrame
filtered_angle_df = angle_df[(angle_df['angle'] >= 105) & (angle_df['angle'] < 180)].copy()
filtered_angle_df.reset_index(drop=True, inplace=True)

# Combine src and dst values for consistent mapping
combined_values = pd.concat([
    filtered_dist_df['src'], filtered_dist_df['dst'], 
    filtered_angle_df['src'], filtered_angle_df['dst']
])

# Use factorize to assign numeric indices
numeric_indices, _ = pd.factorize(combined_values)

# Map src and dst directly to numeric indices using factorized output
mapping = pd.Series(numeric_indices, index=combined_values).to_dict()

# Replace src and dst with numeric indices
for df in [filtered_dist_df, filtered_angle_df]:
    df['src'] = df['src'].map(mapping)
    df['dst'] = df['dst'].map(mapping)

# Display the filtered DataFrames
print("Filtered Distance DataFrame:")
print(filtered_dist_df)

print("\nFiltered Angle DataFrame:")
print(filtered_angle_df.head())

Filtered Distance DataFrame:
       src  dst    time   distance  src_mol  dst_mol  nh_id  label
0        0  586  255000   4.435484        1        1      1      1
1        0  587  255000   5.677049        1        1      2      1
2        0  588  255000   6.328246        1        1      3      1
3        0  589  255000   6.667776        1        1      4      1
4        0  590  255000   7.467732        1        1      5      1
...    ...  ...     ...        ...      ...      ...    ...    ...
24388  585  742  255001  13.913711       12        4    176      2
24389  585  738  255001  12.650958       12        4    177      2
24390  585  746  255001  11.569930       12        4    179      2
24391  585  741  255001   7.632603       12        4    180      2
24392  585  743  255001  12.214476       12        4    181      2

[24393 rows x 8 columns]

Filtered Angle DataFrame:
   src  dst    time       angle  src_mol  dst_mol  nh_id  label
0    0  812  255000  120.540965        1        1 

In [59]:
# Process and save distance DataFrame
final_dist_df = process_and_save_dataframe(filtered_dist_df, 'DATA/HB/edges1.csv')

# Process and save angle DataFrame
final_angle_df = process_and_save_dataframe(filtered_angle_df, 'DATA/HB/edges2.csv')

# Display the first few rows of both DataFrames
print("Updated Distance DataFrame:")
print(final_dist_df.head)

print("\nUpdated Angle DataFrame:")
print(final_angle_df.head())

Saved updated DataFrame to DATA/HB/edges1.csv
Saved updated DataFrame to DATA/HB/edges2.csv
Updated Distance DataFrame:
<bound method NDFrame.head of          idx  src  dst    time  label  ext_roll  nh_id
0          0    0  586  255000      1         0      1
1          1    0  587  255000      1         0      2
2          2    0  588  255000      1         0      3
3          3    0  589  255000      1         0      4
4          4    0  590  255000      1         0      5
...      ...  ...  ...     ...    ...       ...    ...
24388  24388  585  742  255001      2         2    176
24389  24389  585  738  255001      2         2    177
24390  24390  585  746  255001      2         2    179
24391  24391  585  741  255001      2         2    180
24392  24392  585  743  255001      2         2    181

[24393 rows x 7 columns]>

Updated Angle DataFrame:
   idx  src  dst    time  label  ext_roll  nh_id
0    0    0  812  255000      1         0      4
1    1    0  813  255000      1        

In [60]:
# # Extract src and dst column features
# src_feats = filtered_df[['src', 'src_mol']].rename(columns={'src': 'node', 'src_mol': 'mol'})
# dst_feats = filtered_df[['dst', 'dst_mol']].rename(columns={'dst': 'node', 'dst_mol': 'mol'})

# # Concatenate src and dst features
# node_feats = pd.concat([src_feats, dst_feats])

# # Drop duplicates
# node_feats = node_feats.drop_duplicates().reset_index(drop=True)

# # Save to CSV
# node_feats.to_csv('DATA/HB/node_feats.csv', index=False)

# # Display the node features
# print(node_feats)

In [61]:
# # Encode nodes into unique integers
# le_node = LabelEncoder()

# # Fit the encoder on the combined 'src' and 'dst' columns from both dataframes
# all_nodes = pd.concat([filtered_dist_df['src'], filtered_dist_df['dst'], filtered_angle_df['src'], filtered_angle_df['dst']])
# le_node.fit(all_nodes)

# # Transform the 'src' and 'dst' columns in both dataframes
# filtered_dist_df['src'] = le_node.transform(filtered_dist_df['src'])
# filtered_dist_df['dst'] = le_node.transform(filtered_dist_df['dst']) + filtered_dist_df['src'].max() + 1

# filtered_angle_df['src'] = le_node.transform(filtered_angle_df['src'])
# filtered_angle_df['dst'] = le_node.transform(filtered_angle_df['dst']) + filtered_angle_df['src'].max() + 1

# # Display the first few rows of each dataframe
# print(filtered_dist_df)
# print(filtered_angle_df)

In [62]:
# # Get unique nodes from node_1, node_2, and node_3 columns
# unique_nodes_1 = filtered_df['src'].unique()
# unique_nodes_2 = filtered_df['dst'].unique()

# # Assign a color for each group (e.g., tab10 colormap)
# color_1 = plt.cm.tab10(0)  # Color for node_1 group
# color_2 = plt.cm.tab10(1)  # Color for node_2 group
# color_3 = plt.cm.tab10(2)  # Color for node_3 group

# # Create a combined color map based on the node groups
# node_colors = {}
# for node in unique_nodes_1:
#     node_colors[node] = color_1
# for node in unique_nodes_2:
#     node_colors[node] = color_2


# # Create a graph
# G = nx.Graph()

# # Add edges between node_1 and node_2
# edges_1_2 = filtered_df[['node_1', 'node_2']].values.tolist()
# G.add_edges_from(edges_1_2)

# # Add edges between node_1 and node_3
# edges_1_3 = filtered_df[['node_1', 'node_3']].values.tolist()
# G.add_edges_from(edges_1_3)

# # Add edges between node_2 and node_3 (these will be bold)
# edges_2_3 = filtered_df[['node_2', 'node_3']].values.tolist()
# G.add_edges_from(edges_2_3)

# # Define the layout for the graph
# pos = nx.spring_layout(G)

# # Separate edges into normal and bold
# normal_edges = edges_1_2 + edges_1_3
# bold_edges = edges_2_3

# # Plot the graph
# plt.figure(figsize=(8, 4))
# nx.draw(
#     G,
#     pos,
#     with_labels=True,
#     node_size=500,
#     node_color=[node_colors[node] for node in G.nodes()],
#     font_size=10,
#     font_weight='bold',
#     edge_color='gray',
#     edgelist=normal_edges,
#     width=1  # Normal edges width
# )

# # Draw the bold edges separately
# nx.draw_networkx_edges(
#     G,
#     pos,
#     edgelist=bold_edges,
#     width=2,  # Bold edges width
#     edge_color='black'
# )

# plt.title('Graph')
# plt.show()
