In [17]:
# %%
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
from tqdm import tqdm

In [18]:

# Load the per-atom table for all timesteps into `data`.
# Later cells in this notebook read the columns Atom_Name, Residue_Name, Residue_ID,
# Atom_Type, Timeframe and the coordinates X, Y, Z from it.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
filepath = "/home/mhanowar/Downloads/HB1000frames.csv"  # Replace with the actual file path
data = pd.read_csv(filepath)


In [19]:
# Keep only rows whose Timeframe is below 255100 and expose that column as 'time'.
# rename() returns a new frame, so `data` itself is left untouched and this cell can
# be re-run safely.
df = data.loc[data['Timeframe'] < 255100].rename(columns={"Timeframe": "time"})

# Node label "<Atom_Name>_<Residue_ID>", built with vectorised string concatenation
# instead of a per-row apply(axis=1); the resulting strings are the same, but the
# work is done column-wise rather than once per row in Python.
df['node'] = df['Atom_Name'].astype(str) + '_' + df['Residue_ID'].astype(str)

# Display the updated dataframe
df


Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,node
0,12.759892,2.253709,33.260902,C1,CSP,1,cb,255000,C1_1
1,12.862613,3.581455,34.023949,C2,CSP,1,cb,255000,C2_1
2,11.457548,4.321817,34.003315,C3,CSP,1,cb,255000,C3_1
3,10.981806,4.421790,32.537914,C4,CSP,1,cb,255000,C4_1
4,11.038748,3.091581,31.915064,O5,CSP,1,ob,255000,O5_1
...,...,...,...,...,...,...,...,...,...
612595,21.328222,68.733505,52.952137,H8,SFL,14,ha,255099,H8_14
612596,23.051521,68.403755,54.800755,H9,SFL,14,ha,255099,H9_14
612597,20.372837,61.473118,59.333237,H10,SFL,14,ha,255099,H10_14
612598,21.975138,58.681892,56.545349,H11,SFL,14,ha,255099,H11_14


In [20]:
# Atom subsets used by the geometry calculation below:
#   df1 - every atom whose Atom_Name starts with 'O'
#   df2 - atoms of residue 'CSP' with Atom_Type 'n'
#   df3 - atoms of residue 'CSP' with Atom_Type 'hn'
# Each subset is re-indexed from 0.
is_oxygen = df['Atom_Name'].str.startswith('O')
is_csp = df['Residue_Name'] == 'CSP'

df1 = df.loc[is_oxygen].reset_index(drop=True)
df2 = df.loc[is_csp & (df['Atom_Type'] == 'n')].reset_index(drop=True)
df3 = df.loc[is_csp & (df['Atom_Type'] == 'hn')].reset_index(drop=True)

In [21]:
# Helper Functions
def calculate_angle(vec1, vec2):
    """Return the angle between ``vec1`` and ``vec2`` in degrees.

    The cosine is clipped to [-1, 1] before ``arccos`` so rounding error on
    (anti)parallel vectors cannot push it outside the valid domain.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    cosine = np.clip(np.dot(vec1, vec2) / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(cosine))

def calculate_distance(row1, row2):
    """Return the Euclidean distance between two records exposing ``X``, ``Y``, ``Z`` attributes."""
    point_a = np.array([row1.X, row1.Y, row1.Z])
    point_b = np.array([row2.X, row2.Y, row2.Z])
    return np.linalg.norm(point_a - point_b)

def process_results(df):
    """Annotate an edge table in place and return it.

    Adds four columns to ``df`` (which must have string ``src`` and ``dst``
    columns of the form ``"<name>_<id>"``):

    - ``src_mol`` / ``dst_mol``: the integer after the first underscore.
    - ``nh_id``: 1-based code for each distinct ``dst`` value, in order of
      first appearance.
    - ``label``: 2 when ``src_mol`` is in [5, 14] and ``dst_mol`` is in [1, 4],
      otherwise 1.
    """
    def _mol_id(node):
        # Second underscore-separated field of the node label.
        return int(node.split('_')[1])

    df['src_mol'] = df['src'].map(_mol_id)
    df['dst_mol'] = df['dst'].map(_mol_id)

    dst_codes, _ = pd.factorize(df['dst'])
    df['nh_id'] = dst_codes + 1

    src_in_range = df['src_mol'].between(5, 14)
    dst_in_range = df['dst_mol'].between(1, 4)
    df['label'] = np.where(src_in_range & dst_in_range, 2, 1)
    return df

# Build the final integer edge table (idx, src, dst, time, label, ext_roll, nh_id) and save it
def process_and_save_dataframe(df, filename):
    """
    Build an integer edge table from ``df``, write it to CSV and return it.

    The output has the columns ``idx, src, dst, time, label, ext_roll, nh_id``:

    - ``idx`` is the 0-based row position.
    - ``ext_roll`` is a positional split marker: 0 for the first 70% of rows,
      1 for rows from 70% up to 85%, and 2 for the last 15%.
    - every column is converted with ``pd.to_numeric(errors='coerce')``; values
      that cannot be parsed become 0 before the cast to ``int``.

    The input frame is NOT modified. Earlier versions inserted ``idx`` and
    ``ext_roll`` into the caller's frame, so calling the function twice on the
    same frame (or on a copy taken after the first call) raised
    ``ValueError: cannot insert idx, already exists``.

    Parameters:
    - df (pd.DataFrame): Input frame with at least the columns
      ``src, dst, time, label, nh_id``. Any other columns are ignored.
    - filename (str): Output file path; the parent directory must already exist.

    Returns:
    - pd.DataFrame: the processed table that was written to ``filename``.

    Raises:
    - KeyError: if one of the required columns is missing from ``df``.
    """
    # reset_index returns a fresh frame we own: no aliasing of the caller's data,
    # no SettingWithCopyWarning, and a guaranteed 0..n-1 index.
    out = df[['src', 'dst', 'time', 'label', 'nh_id']].reset_index(drop=True)
    num_rows = len(out)

    # Positional split, independent of whatever index labels the input carried.
    ext_roll = np.zeros(num_rows, dtype=int)
    ext_roll[int(num_rows * 0.7):int(num_rows * 0.85)] = 1
    ext_roll[int(num_rows * 0.85):] = 2

    out.insert(0, 'idx', range(num_rows))
    # Position 5 places ext_roll between 'label' and 'nh_id'.
    out.insert(5, 'ext_roll', ext_roll)

    # Convert all columns to integers (unparseable values become 0)
    for col in out.columns:
        out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0).astype(int)

    # Save the updated DataFrame to the specified CSV file
    out.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")

    return out

In [22]:
# Pairwise geometry between every df1 atom (Atom_Name starting with 'O') and every
# (df2, df3) atom pair, frame by frame:
#   distance - between the df1 atom and the df2 ('n') atom
#   angle    - at the df3 ('hn') atom, between the vectors hn->O and hn->n, in degrees
# The i-th df3 row of a frame is paired with the i-th df2 row of that frame; rows
# beyond the shorter of the two are ignored.
# NOTE(review): this positional pairing assumes df2 and df3 list bonded n/hn atoms
# in the same order - confirm against the input file.
#
# Each frame is computed with array operations instead of a Python loop over every
# atom combination. Rows come out in the same order as the nested loops would
# produce them (df1 atom outer, pair index inner); values agree up to
# floating-point rounding.
coord_cols = ['X', 'Y', 'Z']
distance_frames = []
angle_frames = []

for t in df1['time'].unique():
    oxygens = df1[df1['time'] == t]
    nitrogens = df2[df2['time'] == t]
    hydrogens = df3[df3['time'] == t]

    n_pairs = min(len(nitrogens), len(hydrogens))
    if n_pairs == 0:
        continue
    n_oxygens = len(oxygens)

    o_xyz = oxygens[coord_cols].to_numpy(dtype=float)                # (n_oxygens, 3)
    n_xyz = nitrogens[coord_cols].to_numpy(dtype=float)[:n_pairs]    # (n_pairs, 3)
    h_xyz = hydrogens[coord_cols].to_numpy(dtype=float)[:n_pairs]    # (n_pairs, 3)

    # O...n distance for every combination -> (n_oxygens, n_pairs)
    distance = cdist(o_xyz, n_xyz)

    # Angle at hn between hn->O and hn->n -> (n_oxygens, n_pairs)
    h_to_o = o_xyz[:, None, :] - h_xyz[None, :, :]
    h_to_n = n_xyz - h_xyz
    norm_product = np.linalg.norm(h_to_o, axis=2) * np.linalg.norm(h_to_n, axis=1)
    cos_theta = np.einsum('ijk,jk->ij', h_to_o, h_to_n) / norm_product
    angle = np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0)))

    # Row-major flattening: each df1 atom repeated for all of its pairs.
    src = np.repeat(oxygens['node'].to_numpy(), n_pairs)
    distance_frames.append(pd.DataFrame({
        'src': src,
        'dst': np.tile(nitrogens['node'].to_numpy()[:n_pairs], n_oxygens),
        'time': t,
        'distance': distance.ravel(),
    }))
    angle_frames.append(pd.DataFrame({
        'src': src,
        'dst': np.tile(hydrogens['node'].to_numpy()[:n_pairs], n_oxygens),
        'time': t,
        'angle': angle.ravel(),
    }))

# Assemble the per-frame pieces; fall back to an empty, correctly-shaped frame so
# process_results does not fail on a missing 'src' column when nothing was computed.
if distance_frames:
    dist_raw = pd.concat(distance_frames, ignore_index=True)
    angle_raw = pd.concat(angle_frames, ignore_index=True)
else:
    dist_raw = pd.DataFrame(columns=['src', 'dst', 'time', 'distance'])
    angle_raw = pd.DataFrame(columns=['src', 'dst', 'time', 'angle'])

# Add molecule ids, nh_id and label
dist_df = process_results(dist_raw)
angle_df = process_results(angle_raw)

# Display DataFrames
print(dist_df)
print(angle_df)

            src     dst    time   distance  src_mol  dst_mol  nh_id  label
0          O5_1   N11_1  255000   4.435484        1        1      1      1
1          O5_1   N23_1  255000   5.677049        1        1      2      1
2          O5_1   N35_1  255000   6.328246        1        1      3      1
3          O5_1   N55_1  255000   6.667776        1        1      4      1
4          O5_1   N67_1  255000   7.467732        1        1      5      1
...         ...     ...     ...        ...      ...      ...    ...    ...
12873595  O2_14  N727_4  255099  79.479108       14        4    212      2
12873596  O2_14  N739_4  255099  75.781733       14        4    213      2
12873597  O2_14  N759_4  255099  75.197622       14        4    214      2
12873598  O2_14  N771_4  255099  84.087391       14        4    215      2
12873599  O2_14  N783_4  255099  83.915797       14        4    216      2

[12873600 rows x 8 columns]
            src     dst    time       angle  src_mol  dst_mol  nh_id  l

In [23]:
# Keep distance rows with distance <= 3.5
within_cutoff = dist_df['distance'] <= 3.5
filtered_dist_df = dist_df.loc[within_cutoff].reset_index(drop=True)

# Keep angle rows with 135 <= angle < 180 (an angle of exactly 180 is excluded)
angle_in_range = (angle_df['angle'] >= 135) & (angle_df['angle'] < 180)
filtered_angle_df = angle_df.loc[angle_in_range].reset_index(drop=True)

# Show both filtered tables
for heading, table in (
    ("Filtered Distance DataFrame:", filtered_dist_df),
    ("\nFiltered Angle DataFrame:", filtered_angle_df),
):
    print(heading)
    print(table)

Filtered Distance DataFrame:
          src     dst    time  distance  src_mol  dst_mol  nh_id  label
0        O5_1   N79_1  255000  3.043924        1        1      6      1
1        O8_1   N11_1  255000  2.339837        1        1      1      1
2       O10_1   N11_1  255000  2.352643        1        1      1      1
3       O10_1   N79_1  255000  3.010041        1        1      6      1
4       O20_1   N23_1  255000  2.250064        1        1      2      1
...       ...     ...     ...       ...      ...      ...    ...    ...
54293  O780_4  N783_4  255099  2.224148        4        4    216      1
54294  O782_4  N771_4  255099  3.358267        4        4    215      1
54295  O782_4  N783_4  255099  2.294585        4        4    216      1
54296   O2_11  N407_1  255099  2.896783       11        1     28      2
54297   O2_11  N475_1  255099  2.937560       11        1     33      2

[54298 rows x 8 columns]

Filtered Angle DataFrame:
           src     dst    time       angle  src_mol  d

In [24]:
# Keep only the rows whose (time, src, nh_id) key is present in BOTH filtered tables.
key_cols = ['time', 'src', 'nh_id']

# De-duplicate the shared keys: the merges below are meant to act as a filter, and a
# key that occurs more than once in common_keys would multiply the matching rows.
common_keys = pd.merge(
    filtered_dist_df[key_cols],
    filtered_angle_df[key_cols],
    on=key_cols
).drop_duplicates()

# Filter rows in filtered_dist_df based on the common keys
filtered_dist_df = filtered_dist_df.merge(common_keys, on=key_cols)

# Filter rows in filtered_angle_df based on the common keys
filtered_angle_df = filtered_angle_df.merge(common_keys, on=key_cols)

# Display the filtered DataFrames
print("Filtered Distance DataFrame:")
print(filtered_dist_df)

print("\nFiltered Angle DataFrame:")
print(filtered_angle_df)

Filtered Distance DataFrame:
         src     dst    time  distance  src_mol  dst_mol  nh_id  label
0      O10_1   N79_1  255000  3.010041        1        1      6      1
1     O184_1  N255_1  255000  3.252512        1        1     18      1
2     O210_1  N231_1  255000  2.668763        1        1     16      1
3     O284_1  N363_1  255000  3.411373        1        1     25      1
4     O406_1  N451_1  255000  3.048154        1        1     31      1
...      ...     ...     ...       ...      ...      ...    ...    ...
4155  O430_4  N451_4  255099  3.339441        4        4    193      1
4156  O518_4  N539_4  255099  3.117341        4        4    199      1
4157  O626_4  N695_4  255099  2.918977        4        4    210      1
4158   O2_11  N407_1  255099  2.896783       11        1     28      2
4159   O2_11  N475_1  255099  2.937560       11        1     33      2

[4160 rows x 8 columns]

Filtered Angle DataFrame:
         src     dst    time       angle  src_mol  dst_mol  nh_id  

In [25]:
# Build ONE node -> integer mapping shared by both tables and by both the src and
# dst columns, so a given node label always receives the same id.
combined_values = pd.concat([
    filtered_dist_df['src'], filtered_dist_df['dst'],
    filtered_angle_df['src'], filtered_angle_df['dst']
])

# factorize numbers the distinct values 0, 1, 2, ... in order of first appearance;
# shift by one so ids start from 1.
codes, unique_nodes = pd.factorize(combined_values)
numeric_indices = codes + 1
mapping = {node: node_id for node_id, node in enumerate(unique_nodes, start=1)}

# Replace src and dst with their numeric ids.
# The loop variable is deliberately NOT called `df`: that name holds the atom table
# created earlier in the notebook, and rebinding it here would make a later re-run
# of the earlier cells operate on the wrong frame.
for edge_df in (filtered_dist_df, filtered_angle_df):
    edge_df['src'] = edge_df['src'].map(mapping)
    edge_df['dst'] = edge_df['dst'].map(mapping)

# Display the filtered DataFrames
print("Filtered Distance DataFrame:")
print(filtered_dist_df)

print("\nFiltered Angle DataFrame:")
print(filtered_angle_df)

Filtered Distance DataFrame:
      src  dst    time  distance  src_mol  dst_mol  nh_id  label
0       1  101  255000  3.010041        1        1      6      1
1       2  102  255000  3.252512        1        1     18      1
2       3  103  255000  2.668763        1        1     16      1
3       4  104  255000  3.411373        1        1     25      1
4       5  105  255000  3.048154        1        1     31      1
...   ...  ...     ...       ...      ...      ...    ...    ...
4155   54  156  255099  3.339441        4        4    193      1
4156   36  136  255099  3.117341        4        4    199      1
4157   37  138  255099  2.918977        4        4    210      1
4158   39  140  255099  2.896783       11        1     28      2
4159   39  149  255099  2.937560       11        1     33      2

[4160 rows x 8 columns]

Filtered Angle DataFrame:
      src  dst    time       angle  src_mol  dst_mol  nh_id  label
0       1  188  255000  161.759281        1        1      6      1
1    

In [26]:
# Build and write the distance and angle edge tables
final_dist_df = process_and_save_dataframe(filtered_dist_df, 'DATA/HB/edges1.csv')
final_angle_df = process_and_save_dataframe(filtered_angle_df, 'DATA/HB/edges2.csv')

# Print both processed tables (pandas truncates long frames on its own)
for heading, table in (
    ("Updated Distance DataFrame:", final_dist_df),
    ("\nUpdated Angle DataFrame:", final_angle_df),
):
    print(heading)
    print(table)

Saved updated DataFrame to DATA/HB/edges1.csv
Saved updated DataFrame to DATA/HB/edges2.csv
Updated Distance DataFrame:
       idx  src  dst    time  label  ext_roll  nh_id
0        0    1  101  255000      1         0      6
1        1    2  102  255000      1         0     18
2        2    3  103  255000      1         0     16
3        3    4  104  255000      1         0     25
4        4    5  105  255000      1         0     31
...    ...  ...  ...     ...    ...       ...    ...
4155  4155   54  156  255099      1         2    193
4156  4156   36  136  255099      1         2    199
4157  4157   37  138  255099      1         2    210
4158  4158   39  140  255099      2         2     28
4159  4159   39  149  255099      2         2     33

[4160 rows x 7 columns]

Updated Angle DataFrame:
       idx  src  dst    time  label  ext_roll  nh_id
0        0    1  188  255000      1         0      6
1        1    2  189  255000      1         0     18
2        2    3  190  255000      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).a

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so the filtered tables themselves are not modified by this cell.
new_dist_df = filtered_dist_df.copy()
new_angle_df = filtered_angle_df.copy()

# Combine src and dst columns from both DataFrames to ensure consistent mapping
combined_values = pd.concat([
    new_dist_df[['src', 'dst']],
    new_angle_df[['src', 'dst']]
])

# OrdinalEncoder learns a separate category list per column. Fitted on ['src', 'dst']
# without explicit categories, both columns would be numbered from 0 independently,
# so a src id and a dst id with the same number could denote different nodes.
# Supplying the same sorted vocabulary for both columns puts src and dst in one
# shared id space.
node_categories = np.unique(combined_values.to_numpy().ravel())
encoder = OrdinalEncoder(categories=[node_categories, node_categories], dtype=int)
encoder.fit(combined_values)

# Apply the encoder to the src and dst columns in both DataFrames
new_dist_df[['src', 'dst']] = encoder.transform(new_dist_df[['src', 'dst']]) + 1  # Start from 1
new_angle_df[['src', 'dst']] = encoder.transform(new_angle_df[['src', 'dst']]) + 1  # Start from 1

# Display the updated DataFrames
print("Updated new_dist_df:")
print(new_dist_df)

print("\nUpdated new_angle_df:")
print(new_angle_df)

In [None]:
# Build and write the re-encoded distance and angle edge tables
renew_dist_df = process_and_save_dataframe(new_dist_df, 'DATA/HB/edges3.csv')
renew_angle_df = process_and_save_dataframe(new_angle_df, 'DATA/HB/edges4.csv')

# Print both processed tables (pandas truncates long frames on its own)
for heading, table in (
    ("Updated Distance DataFrame:", renew_dist_df),
    ("\nUpdated Angle DataFrame:", renew_angle_df),
):
    print(heading)
    print(table)

In [27]:
# # Extract src and dst column features
# src_feats = filtered_df[['src', 'src_mol']].rename(columns={'src': 'node', 'src_mol': 'mol'})
# dst_feats = filtered_df[['dst', 'dst_mol']].rename(columns={'dst': 'node', 'dst_mol': 'mol'})

# # Concatenate src and dst features
# node_feats = pd.concat([src_feats, dst_feats])

# # Drop duplicates
# node_feats = node_feats.drop_duplicates().reset_index(drop=True)

# # Save to CSV
# node_feats.to_csv('DATA/HB/node_feats.csv', index=False)

# # Display the node features
# print(node_feats)

In [28]:
# # Encode nodes into unique integers
# le_node = LabelEncoder()

# # Fit the encoder on the combined 'src' and 'dst' columns from both dataframes
# all_nodes = pd.concat([filtered_dist_df['src'], filtered_dist_df['dst'], filtered_angle_df['src'], filtered_angle_df['dst']])
# le_node.fit(all_nodes)

# # Transform the 'src' and 'dst' columns in both dataframes
# filtered_dist_df['src'] = le_node.transform(filtered_dist_df['src'])
# filtered_dist_df['dst'] = le_node.transform(filtered_dist_df['dst']) + filtered_dist_df['src'].max() + 1

# filtered_angle_df['src'] = le_node.transform(filtered_angle_df['src'])
# filtered_angle_df['dst'] = le_node.transform(filtered_angle_df['dst']) + filtered_angle_df['src'].max() + 1

# # Display the first few rows of each dataframe
# print(filtered_dist_df)
# print(filtered_angle_df)

In [29]:
# # Get unique nodes from node_1, node_2, and node_3 columns
# unique_nodes_1 = filtered_df['src'].unique()
# unique_nodes_2 = filtered_df['dst'].unique()

# # Assign a color for each group (e.g., tab10 colormap)
# color_1 = plt.cm.tab10(0)  # Color for node_1 group
# color_2 = plt.cm.tab10(1)  # Color for node_2 group
# color_3 = plt.cm.tab10(2)  # Color for node_3 group

# # Create a combined color map based on the node groups
# node_colors = {}
# for node in unique_nodes_1:
#     node_colors[node] = color_1
# for node in unique_nodes_2:
#     node_colors[node] = color_2


# # Create a graph
# G = nx.Graph()

# # Add edges between node_1 and node_2
# edges_1_2 = filtered_df[['node_1', 'node_2']].values.tolist()
# G.add_edges_from(edges_1_2)

# # Add edges between node_1 and node_3
# edges_1_3 = filtered_df[['node_1', 'node_3']].values.tolist()
# G.add_edges_from(edges_1_3)

# # Add edges between node_2 and node_3 (these will be bold)
# edges_2_3 = filtered_df[['node_2', 'node_3']].values.tolist()
# G.add_edges_from(edges_2_3)

# # Define the layout for the graph
# pos = nx.spring_layout(G)

# # Separate edges into normal and bold
# normal_edges = edges_1_2 + edges_1_3
# bold_edges = edges_2_3

# # Plot the graph
# plt.figure(figsize=(8, 4))
# nx.draw(
#     G,
#     pos,
#     with_labels=True,
#     node_size=500,
#     node_color=[node_colors[node] for node in G.nodes()],
#     font_size=10,
#     font_weight='bold',
#     edge_color='gray',
#     edgelist=normal_edges,
#     width=1  # Normal edges width
# )

# # Draw the bold edges separately
# nx.draw_networkx_edges(
#     G,
#     pos,
#     edgelist=bold_edges,
#     width=2,  # Bold edges width
#     edge_color='black'
# )

# plt.title('Graph')
# plt.show()
