In [None]:
# %%
import numpy as np
import pandas as pd
import torch
import logging

In [4]:

# Load the trajectory table. Per the rendered output of this cell the columns are
# X, Y, Z, Atom_Name, Residue_Name, Residue_ID, Atom_Type and Timeframe.
df = pd.read_csv("HB1000frames.csv")
# Rebind rather than rename in place so re-running the cell never acts on a
# half-modified frame.
df = df.rename(columns={"Timeframe": "time"})
# Shift the time axis so that the earliest frame is at 0.
df['time'] = df['time'] - df['time'].min()
# Per-atom label "<Atom_Name>_<Residue_ID>" (e.g. "C1_1.0"). Vectorised string
# concatenation replaces a row-wise apply(), which is very slow on ~1M rows.
df['elem'] = df['Atom_Name'].astype(str) + "_" + df['Residue_ID'].astype(str)
df



Unnamed: 0,X,Y,Z,Atom_Name,Residue_Name,Residue_ID,Atom_Type,time,elem
0,12.759892,2.253709,33.260902,C1,CSP,1.0,cb,0.0,C1_1.0
1,12.862613,3.581455,34.023949,C2,CSP,1.0,cb,0.0,C2_1.0
2,11.457548,4.321817,34.003315,C3,CSP,1.0,cb,0.0,C3_1.0
3,10.981806,4.421790,32.537914,C4,CSP,1.0,cb,0.0,C4_1.0
4,11.038748,3.091581,31.915064,O5,CSP,1.0,ob,0.0,O5_1.0
...,...,...,...,...,...,...,...,...,...
1094121,41.351189,-7.435875,40.460285,C776,CSP,3.0,ca,178.0,C776_3.0
1094122,40.812626,-7.301743,39.170258,C777,CSP,3.0,ca,178.0,C777_3.0
1094123,40.096668,-8.484385,38.549259,C778,CSP,3.0,c3,178.0,C778_3.0
1094124,41.000370,-6.134412,38.372360,C779,CSP,3.0,ca,178.0,C779_3.0


In [5]:
# Atom subsets used by the geometry scan below:
#   df1 -> every atom whose Atom_Type is 'o' or 'os' (any residue)
#   df2 -> Atom_Type 'n'  inside residue 'CSP'
#   df3 -> Atom_Type 'hn' inside residue 'CSP'
# Alternatives tried earlier for df1: restricting to Residue_ID == 5, or taking
# every atom whose Atom_Name starts with 'O'.
atom_type = df['Atom_Type']
in_csp = df['Residue_Name'].eq('CSP')

df1 = df.loc[atom_type.isin(['o', 'os'])].reset_index(drop=True)
df2 = df.loc[in_csp & atom_type.eq('n')].reset_index(drop=True)
df3 = df.loc[in_csp & atom_type.eq('hn')].reset_index(drop=True)

In [7]:
def calculate_distance(row1, row2):
    """
    Return the straight-line (Euclidean) distance between two points.

    Parameters:
        row1, row2: any objects exposing numeric X, Y and Z attributes
            (for example a pandas Series or an itertuples() namedtuple).

    Returns:
        The L2 norm of the coordinate difference (float).
    """
    first, second = (np.array([row.X, row.Y, row.Z]) for row in (row1, row2))
    return np.linalg.norm(first - second)

def calculate_angle(vec1, vec2):
    """
    Return the angle, in degrees, enclosed by two vectors.

    Parameters:
        vec1, vec2: numpy arrays holding the vector components.

    Returns:
        Angle between vec1 and vec2 in degrees (float).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Round-off can push the cosine marginally outside [-1, 1], where arccos
    # is undefined, so clamp it before inverting.
    cosine = np.clip(np.dot(vec1, vec2) / norm_product, -1.0, 1.0)
    return np.degrees(np.arccos(cosine))


In [9]:
# Scan every frame: for each row of df1 and each positionally paired
# (df2 row, df3 row), record the df1-df2 distance when it is <= 3.5 and the
# angle at the df3 atom (between the vectors to the df1 and df2 atoms) when it
# lies in [135, 180).
distance_results = []
angle_results = []


def _pair_record(src, dst, t, metric, value):
    """Build one result row for a source/destination atom pair at time t."""
    return {
        'source': src.elem,
        'src_x': src.X,
        'src_y': src.Y,
        'src_z': src.Z,
        'src_mol': src.Residue_ID,
        'dst': dst.elem,
        'dst_x': dst.X,
        'dst_y': dst.Y,
        'dst_z': dst.Z,
        'dst_mol': dst.Residue_ID,
        'dst_idx': dst.Index,
        'time': t,
        metric: value,
    }


for t in df1['time'].unique():
    # Rows of the current frame. For df1 the previous index is discarded.
    df1_time = df1[df1['time'] == t].reset_index(drop=True)
    # reset_index() without drop renumbers the rows 0..k-1 inside the frame and
    # moves the previous index into an 'index' column. itertuples() exposes the
    # NEW per-frame position as .Index, and that position is what is stored as
    # 'dst_idx' below (not the index of df2/df3).
    df2_time = df2[df2['time'] == t].reset_index()
    df3_time = df3[df3['time'] == t].reset_index()

    # The df2/df3 pairing is identical for every df1 row of this frame, so
    # build it once instead of re-creating both itertuples() iterators per row.
    # zip() pairs rows by position and silently stops at the shorter frame;
    # this assumes the 'n' and 'hn' rows are listed in matching order
    # (TODO confirm against the input file).
    paired_rows = list(zip(df2_time.itertuples(), df3_time.itertuples()))

    for row1 in df1_time.itertuples(index=False):
        for row2, row3 in paired_rows:
            # Distance between the df1 atom and the df2 atom.
            distance = calculate_distance(row1, row2)

            # Angle at the df3 atom between the directions to row1 and row2.
            vec3_to_df1 = np.array([row1.X - row3.X, row1.Y - row3.Y, row1.Z - row3.Z])
            vec3_to_df2 = np.array([row2.X - row3.X, row2.Y - row3.Y, row2.Z - row3.Z])
            angle = calculate_angle(vec3_to_df1, vec3_to_df2)

            # The two criteria are recorded independently; they are joined
            # later on (source, dst_idx, time).
            if distance <= 3.5:
                distance_results.append(_pair_record(row1, row2, t, 'distance', distance))
            if 135 <= angle < 180:
                angle_results.append(_pair_record(row1, row3, t, 'angle', angle))

# One DataFrame per criterion.
df_distance = pd.DataFrame(distance_results)
df_angle = pd.DataFrame(angle_results)

In [10]:
# Inspect the pairs that passed the distance cut-off (distance <= 3.5).
df_distance

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance
0,O8_1.0,12.433211,1.435311,29.606905,1.0,N11_1.0,12.264816,2.733424,27.667477,1.0,0,0.0,2.339837
1,O10_1.0,10.361735,2.108788,28.901592,1.0,N11_1.0,12.264816,2.733424,27.667477,1.0,0,0.0,2.352643
2,O10_1.0,10.361735,2.108788,28.901592,1.0,N79_1.0,8.222926,3.125758,30.759445,1.0,5,0.0,3.010041
3,O20_1.0,11.626869,5.605087,34.612240,1.0,N23_1.0,10.870185,7.574643,35.393948,1.0,1,0.0,2.250064
4,O22_1.0,9.349302,5.952352,34.952969,1.0,N23_1.0,10.870185,7.574643,35.393948,1.0,1,0.0,2.267019
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94044,O758_2.0,21.342960,70.349312,28.589434,2.0,N759_2.0,20.925453,69.299072,26.616503,2.0,105,178.0,2.273713
94045,O768_2.0,25.819065,75.317734,23.585480,2.0,N771_2.0,26.655825,77.417595,23.383596,2.0,106,178.0,2.269436
94046,O770_2.0,24.377110,77.202492,23.360914,2.0,N771_2.0,26.655825,77.417595,23.383596,2.0,106,178.0,2.288958
94047,O780_2.0,27.047836,73.600883,25.792545,2.0,N783_2.0,28.891222,74.527580,26.406046,2.0,107,178.0,2.152492


In [11]:
# Inspect the pairs that passed the angle criterion (135 <= angle < 180 degrees).
df_angle

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle
0,O8_1.0,12.433211,1.435311,29.606905,1.0,H46_1.0,15.508764,3.906857,27.851604,1.0,3,0.0,150.071960
1,O8_1.0,12.433211,1.435311,29.606905,1.0,H66_1.0,8.854777,2.591426,30.175131,1.0,5,0.0,146.449027
2,O8_1.0,12.433211,1.435311,29.606905,1.0,H83_1.0,8.434090,7.601198,32.716743,1.0,6,0.0,158.491808
3,O8_1.0,12.433211,1.435311,29.606905,1.0,H177_1.0,6.069104,16.899189,34.597523,1.0,14,0.0,148.473702
4,O8_1.0,12.433211,1.435311,29.606905,1.0,H194_1.0,7.083150,21.469265,33.176544,1.0,15,0.0,161.601357
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2468009,O770_3.0,41.562897,-4.400577,35.964375,3.0,H510_2.0,31.513956,56.855240,26.840006,2.0,95,178.0,169.549087
2468010,O770_3.0,41.562897,-4.400577,35.964375,3.0,H527_2.0,22.770149,57.246090,31.844971,2.0,96,178.0,145.626246
2468011,O770_3.0,41.562897,-4.400577,35.964375,3.0,H564_2.0,28.084663,64.211357,31.509163,2.0,99,178.0,158.588947
2468012,O770_3.0,41.562897,-4.400577,35.964375,3.0,H584_2.0,21.098440,62.045410,29.847916,2.0,101,178.0,164.938515


In [12]:
#Hasan
# Key columns that identify one (source atom, partner position, frame) combination.
keys = ["source","dst_idx","time"]

# 1) Stack the de-duplicated key triplets of both tables and count them:
#    a triplet that occurs in both tables is counted twice.
union = pd.concat([
    df_distance[keys].drop_duplicates(),
    df_angle   [keys].drop_duplicates()
], ignore_index=True)

counts = union.groupby(keys).size().reset_index(name="count")

# 2) Label = 1 if count==2 (appears in both), else 0
counts["label"] = (counts["count"]==2).astype(int)

# 3) Merge the label back into each DataFrame
df_distance_common = df_distance.merge(counts[keys+["label"]], on=keys, how="left")
df_angle_common    = df_angle.merge(counts[keys+["label"]], on=keys, how="left")

# Every key of either table is present in `counts`, so fillna is only a safety net.
df_distance_common["label"] = df_distance_common["label"].fillna(0).astype(int)
df_angle_common   ["label"] = df_angle_common   ["label"].fillna(0).astype(int)

df_distance_common = df_distance_common.reset_index(drop=True)
df_angle_common    = df_angle_common.reset_index(drop=True)

print(df_distance_common["label"].value_counts())
print(df_angle_common   ["label"].value_counts())

pos_df_distance = df_distance_common[df_distance_common["label"]==1]
neg_df_distance = df_distance_common[df_distance_common["label"]==0]

# Balance the classes: draw N rows from EACH class, where N is the size of the
# smaller distance class (the final table therefore has 2*N rows).
N = min(len(pos_df_distance), len(neg_df_distance))
pos_dist_samp = pos_df_distance.sample(n=N, random_state=0)
neg_dist_samp = neg_df_distance.sample(n=N, random_state=1)

# Concatenate and shuffle
df_distance_final = pd.concat([pos_dist_samp, neg_dist_samp]).sample(frac=1, random_state=2).reset_index(drop=True)

# For angle:
pos_df_angle = df_angle_common[df_angle_common["label"] == 1]
neg_df_angle = df_angle_common[df_angle_common["label"] == 0]

# Reuse N so both tables keep the same size whenever possible, but never ask
# for more rows than an angle class holds: DataFrame.sample raises ValueError
# when n exceeds the population and replace=False.
N_angle = min(N, len(pos_df_angle), len(neg_df_angle))
pos_ang_samp = pos_df_angle.sample(n=N_angle, random_state=0)
neg_ang_samp = neg_df_angle.sample(n=N_angle, random_state=1)

# Concatenate and shuffle
df_angle_final = (
    pd.concat([pos_ang_samp, neg_ang_samp])
      .sample(frac=1, random_state=2)
      .reset_index(drop=True)
)

# Quick sanity check
print("Distance final shape:", df_distance_final.shape)
print("Angle    final shape:", df_angle_final.shape)
print("Distance label counts:\n", df_distance_final["label"].value_counts())
print("Angle    label counts:\n", df_angle_final   ["label"].value_counts())

label
0    86727
1     7322
Name: count, dtype: int64
label
0    2460692
1       7322
Name: count, dtype: int64
Distance final shape: (14644, 14)
Angle    final shape: (14644, 14)
Distance label counts:
 label
0    7322
1    7322
Name: count, dtype: int64
Angle    label counts:
 label
0    7322
1    7322
Name: count, dtype: int64


In [76]:
# Combine src and dst values of both tables so they share one id space
combined_values = pd.concat([
    df_distance_final['source'], df_distance_final['dst'], 
    df_angle_final['source'], df_angle_final['dst']
])

# factorize assigns 0-based integer codes in order of first appearance
numeric_indices, _ = pd.factorize(combined_values)

# Label -> code lookup (repeated labels always carry the same code)
mapping = pd.Series(numeric_indices, index=combined_values).to_dict()

# The loop variable is deliberately NOT named `df`: that would rebind the
# notebook-level raw trajectory frame `df` to df_angle_final and silently break
# any re-run of the earlier selection cells.
for edge_frame in [df_distance_final, df_angle_final]:
    edge_frame['src'] = edge_frame['source'].map(mapping)
    # NOTE: this replaces the textual 'dst' labels with their integer ids in
    # place, so the cell is not idempotent (a second run factorizes the ids).
    edge_frame['dst'] = edge_frame['dst'].map(mapping)





In [14]:
# Inspect the balanced, shuffled distance table with its 'label' and 'src' columns.
df_distance_final

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,distance,label,src
0,O286_1.0,10.087707,29.131914,29.709049,1.0,452,10.695017,29.637152,27.633337,1.0,19,56.0,2.220962,0,0
1,O668_3.0,44.276184,8.112521,32.277905,3.0,453,45.380405,6.847574,33.669685,3.0,153,12.0,2.180929,0,1
2,O372_2.0,23.610010,37.359432,31.614174,2.0,454,23.328083,37.663666,33.788914,2.0,79,77.0,2.213941,0,2
3,O296_1.0,8.632976,26.654459,26.606960,1.0,455,7.381982,28.780010,24.208776,1.0,21,155.0,3.440093,0,3
4,O186_3.0,40.040352,52.975300,32.031624,3.0,456,38.797550,50.223488,32.726196,3.0,125,110.0,3.098299,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14639,O448_4.0,60.069576,31.503553,32.440197,4.0,559,60.283894,31.612774,34.710838,4.0,192,67.0,2.283347,0,144
14640,O694_2.0,20.578228,65.332428,28.653940,2.0,473,22.197504,67.595795,30.246788,2.0,102,159.0,3.206563,0,165
14641,O474_3.0,49.342045,27.067738,30.555721,3.0,479,47.609482,25.552464,32.701405,3.0,141,28.0,3.146710,1,38
14642,O2_11.0,7.834811,37.225082,34.073071,11.0,500,8.923358,39.520660,32.818012,1.0,32,40.0,2.833687,1,51


In [15]:
# Inspect the balanced, shuffled angle table with its 'label' and 'src' columns.
df_angle_final

Unnamed: 0,source,src_x,src_y,src_z,src_mol,dst,dst_x,dst_y,dst_z,dst_mol,dst_idx,time,angle,label,src
0,O736_3.0,42.010494,6.448990,33.834686,3.0,668,33.371723,20.177452,28.259638,2.0,66,156.0,149.336417,0,209
1,O228_1.0,8.192220,22.554585,31.047207,1.0,669,10.780736,12.612014,26.974632,1.0,8,68.0,138.051795,0,249
2,O406_1.0,3.509235,37.836929,31.248281,1.0,670,7.420165,49.074184,34.000916,1.0,36,72.0,165.846307,0,8
3,O406_3.0,48.537575,33.689533,27.678022,3.0,671,43.702091,4.391780,27.674328,3.0,156,164.0,136.518335,0,9
4,O186_3.0,40.040352,52.975300,32.031624,3.0,672,39.449959,50.961864,32.961418,3.0,125,110.0,135.323031,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14639,O360_2.0,28.559193,35.629856,28.558558,2.0,728,41.390923,56.849106,31.396790,3.0,117,114.0,151.475764,0,212
14640,O694_1.0,2.950868,64.435822,30.358313,1.0,728,40.913387,56.920113,31.264936,3.0,117,76.0,151.702111,0,364
14641,O474_3.0,49.342045,27.067738,30.555721,3.0,735,48.263931,25.563564,31.928268,3.0,141,28.0,139.827253,1,38
14642,O2_11.0,7.834811,37.225082,34.073071,11.0,714,8.213047,38.812107,32.957954,1.0,32,40.0,140.704352,1,51


In [16]:


def build_node_mapping(dfs, src_col='source', dst_col='dst'):
    """
    Assign a 0-based integer ID to every distinct node label.

    Labels are gathered from src_col of every DataFrame in `dfs` first, then
    from dst_col of every DataFrame; IDs follow order of first appearance.
    Returns a pd.Series named 'node_id', indexed by label.
    """
    label_columns = [frame[src_col] for frame in dfs]
    label_columns.extend(frame[dst_col] for frame in dfs)
    labels = pd.Index(pd.concat(label_columns).unique())
    node_ids = range(len(labels))
    return pd.Series(data=node_ids, index=labels, name='node_id')

def remap_edges(df, mapping, src_col='source', dst_col='dst'):
    """
    Write integer node IDs into df['src'] and df['dst'] (in place).

    Each ID is looked up in `mapping` from the label stored in src_col /
    dst_col. With the default dst_col the textual 'dst' column is overwritten.
    Returns None.
    """
    for target_col, label_col in (('src', src_col), ('dst', dst_col)):
        df[target_col] = df[label_col].map(mapping).astype(int)

def generate_node_features(df1, df2, mapping, filename):
    """
    Build one integer feature per node and save it as a torch tensor.

    The feature of a node is the 'src_mol' / 'dst_mol' value found next to its
    label in the 'source' / 'dst' columns of df1 and df2 (first occurrence
    wins, scanning df1 sources, df1 dsts, df2 sources, df2 dsts). Nodes of
    `mapping` without a feature get 0. The result is a [1 x num_nodes]
    LongTensor, which is written to `filename` and also returned.
    """
    def node_feature_table(frame, label_col, feat_col):
        # Distinct (label, feature) pairs with the label translated to its node id.
        pairs = frame[[label_col, feat_col]].drop_duplicates()
        pairs = pairs.rename(columns={label_col: 'node', feat_col: 'feat'})
        pairs['node_id'] = pairs['node'].map(mapping).astype(int)
        return pairs[['node_id', 'feat']]

    sources = [
        (df1, 'source', 'src_mol'),
        (df1, 'dst', 'dst_mol'),
        (df2, 'source', 'src_mol'),
        (df2, 'dst', 'dst_mol'),
    ]
    feats = pd.concat(
        [node_feature_table(*spec) for spec in sources],
        ignore_index=True,
    ).drop_duplicates(subset='node_id', keep='first')

    # Left-join onto the full id range so every node gets a slot.
    num_nodes = len(mapping)
    all_feat = pd.DataFrame({'node_id': range(num_nodes)}).merge(
        feats, on='node_id', how='left'
    )
    all_feat['feat'] = all_feat['feat'].fillna(0).astype(int)

    tensor = torch.tensor(all_feat['feat'].values, dtype=torch.long).unsqueeze(0)
    torch.save(tensor, filename)
    logging.info(f"Saved node features ({tensor.shape}) to {filename}")
    return tensor

def process_and_save_dataframe(df, filename, split_fracs=(0.70, 0.85)):
    """
    Repackage an edge table, tag a time-ordered split and write it to CSV.

    The output has the columns [idx, src, dst, time, label, dst_idx, ext_roll].
    Rows are sorted by ascending time, 'idx' numbers them 0..N-1, and
    'ext_roll' marks three consecutive time-ordered segments:
        0 -> rows [0, int(N*split_fracs[0]))
        1 -> rows [int(N*split_fracs[0]), int(N*split_fracs[1]))
        2 -> rows [int(N*split_fracs[1]), N)
    With the default boundaries this is a 70% / 15% / 15% split (not thirds).

    Parameters:
        df: DataFrame with at least the columns src, dst, time, label, dst_idx.
            It is not modified.
        filename: destination CSV path.
        split_fracs: (first_boundary, second_boundary) as fractions of N,
            with 0 <= first_boundary <= second_boundary <= 1.

    Raises:
        ValueError: if split_fracs is not an ordered pair within [0, 1].
    """
    first_frac, second_frac = split_fracs
    if not 0 <= first_frac <= second_frac <= 1:
        raise ValueError(
            f"split_fracs must satisfy 0 <= a <= b <= 1, got {split_fracs!r}"
        )

    edges = df[['src','dst','time','label','dst_idx']].copy()
    edges = edges.sort_values('time', ascending=True, ignore_index=True)

    N = len(edges)
    edges.insert(0, 'idx', range(N))

    # Half-open positional slices make the segment boundaries explicit; the
    # previous label-based .loc slice was end-inclusive and only produced the
    # right result because the following assignment overwrote the extra row.
    first_cut = int(N * first_frac)
    second_cut = int(N * second_frac)
    ext_roll = np.zeros(N, dtype='int64')
    ext_roll[first_cut:second_cut] = 1
    ext_roll[second_cut:] = 2
    edges['ext_roll'] = ext_roll

    print(edges['ext_roll'].value_counts(normalize=True))
    edges.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")

# Step 1: derive the label -> node-id mapping from both edge tables.
edge_frames = [df_distance_final, df_angle_final]
mapping = build_node_mapping(edge_frames, src_col='source', dst_col='dst')

# Step 2: node features have to be produced BEFORE step 3, because
# remap_edges() overwrites the 'dst' column that the features are looked up by.
node_feats = generate_node_features(
    df_distance_final, df_angle_final, mapping, filename='node_features.pt'
)

# Step 3: add the integer 'src' / 'dst' columns to both tables.
for edge_frame in edge_frames:
    remap_edges(edge_frame, mapping, src_col='source', dst_col='dst')

# Step 4: write one CSV per table.
for edge_frame, csv_name in zip(edge_frames, ('edges1.csv', 'edges2.csv')):
    process_and_save_dataframe(edge_frame, csv_name)

ext_roll
0    0.699945
1    0.150027
2    0.150027
Name: proportion, dtype: float64
Saved updated DataFrame to edges1.csv
ext_roll
0    0.699945
1    0.150027
2    0.150027
Name: proportion, dtype: float64
Saved updated DataFrame to edges2.csv
