In [1]:
# %%
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import LabelEncoder

In [2]:

dist_df = pd.read_csv("DATA/HB/dist_df_100time.csv")
angle_df = pd.read_csv("DATA/HB/angle_df_100time.csv")

In [3]:
dist_df

Unnamed: 0,src,dst,time,distance,src_mol,dst_mol,nh_id,label
0,O5_1,N11_1,255000,4.435484,1,1,1,1
1,O5_1,N23_1,255000,5.677049,1,1,2,1
2,O5_1,N35_1,255000,6.328246,1,1,3,1
3,O5_1,N55_1,255000,6.667776,1,1,4,1
4,O5_1,N67_1,255000,7.467732,1,1,5,1
...,...,...,...,...,...,...,...,...
12873595,O2_14,N727_4,255099,79.479108,14,4,212,2
12873596,O2_14,N739_4,255099,75.781733,14,4,213,2
12873597,O2_14,N759_4,255099,75.197622,14,4,214,2
12873598,O2_14,N771_4,255099,84.087391,14,4,215,2


In [4]:
angle_df

Unnamed: 0,src,dst,time,angle,src_mol,dst_mol,nh_id,label
0,O5_1,H9_1,255000,61.040505,1,1,1,1
1,O5_1,H19_1,255000,73.524710,1,1,2,1
2,O5_1,H29_1,255000,88.194570,1,1,3,1
3,O5_1,H46_1,255000,120.540965,1,1,4,1
4,O5_1,H56_1,255000,115.887955,1,1,5,1
...,...,...,...,...,...,...,...,...
12873595,O2_14,H611_4,255099,166.764098,14,4,212,2
12873596,O2_14,H621_4,255099,62.206380,14,4,213,2
12873597,O2_14,H638_4,255099,99.980679,14,4,214,2
12873598,O2_14,H648_4,255099,137.762045,14,4,215,2


In [5]:
dist_df['time'] -= 255000
angle_df['time'] -= 255000


In [6]:
# Filter DataFrames
filtered_dist_df = dist_df[dist_df['distance'] <= 3.5].reset_index(drop=True)
filtered_angle_df = angle_df[(angle_df['angle'] >= 135) & (angle_df['angle'] < 180)].reset_index(drop=True)


# Find common keys
common_keys = pd.merge(
    filtered_dist_df[['time', 'src', 'nh_id']],
    filtered_angle_df[['time', 'src', 'nh_id']],
    on=['time', 'src', 'nh_id']
)

filtered_dist_df = filtered_dist_df.merge(common_keys, on=['time', 'src', 'nh_id'])
filtered_angle_df = filtered_angle_df.merge(common_keys, on=['time', 'src', 'nh_id'])

# Display the filtered DataFrames
print("Filtered Distance DataFrame:")
print(filtered_dist_df)

print("\nFiltered Angle DataFrame:")
print(filtered_angle_df)

Filtered Distance DataFrame:
         src     dst  time  distance  src_mol  dst_mol  nh_id  label
0      O10_1   N79_1     0  3.010041        1        1      6      1
1     O184_1  N255_1     0  3.252512        1        1     18      1
2     O210_1  N231_1     0  2.668763        1        1     16      1
3     O284_1  N363_1     0  3.411373        1        1     25      1
4     O406_1  N451_1     0  3.048154        1        1     31      1
...      ...     ...   ...       ...      ...      ...    ...    ...
4155  O430_4  N451_4    99  3.339441        4        4    193      1
4156  O518_4  N539_4    99  3.117341        4        4    199      1
4157  O626_4  N695_4    99  2.918977        4        4    210      1
4158   O2_11  N407_1    99  2.896783       11        1     28      2
4159   O2_11  N475_1    99  2.937560       11        1     33      2

[4160 rows x 8 columns]

Filtered Angle DataFrame:
         src     dst  time       angle  src_mol  dst_mol  nh_id  label
0      O10_1   H66_1

In [7]:
def map_numeric_indices(dist_df, angle_df):
    """
    Map 'src' and 'dst' columns to numeric indices starting from 1 for consistent mapping.

    Parameters:
    - dist_df (pd.DataFrame): Distance DataFrame
    - angle_df (pd.DataFrame): Angle DataFrame

    Returns:
    - dist_df (pd.DataFrame): Updated Distance DataFrame with numeric indices
    - angle_df (pd.DataFrame): Updated Angle DataFrame with numeric indices
    """
    # Combine src and dst values for consistent mapping
    combined_values = pd.concat([
        dist_df['src'], dist_df['dst'], 
        angle_df['src'], angle_df['dst']
    ])

    # Use factorize to assign numeric indices starting from 0
    numeric_indices, _ = pd.factorize(combined_values)

    # Add 1 to ensure indices start from 1
    numeric_indices += 1

    # Map src and dst directly to numeric indices using factorized output
    mapping = pd.Series(numeric_indices, index=combined_values).to_dict()

    # Replace src and dst with numeric indices starting from 1
    for df in [dist_df, angle_df]:
        df['src'] = df['src'].map(mapping)
        df['dst'] = df['dst'].map(mapping)
    
    return dist_df, angle_df


dist_df, angle_df = map_numeric_indices(dist_df, angle_df)


# Display the updated DataFrames
print("Distance DataFrame:")
print(dist_df)

print("\nAngle DataFrame:")
print(angle_df)



Updated Distance DataFrame:
          src  dst  time   distance  src_mol  dst_mol  nh_id  label
0           1  597     0   4.435484        1        1      1      1
1           1  598     0   5.677049        1        1      2      1
2           1  599     0   6.328246        1        1      3      1
3           1  600     0   6.667776        1        1      4      1
4           1  601     0   7.467732        1        1      5      1
...       ...  ...   ...        ...      ...      ...    ...    ...
12873595  596  808    99  79.479108       14        4    212      2
12873596  596  809    99  75.781733       14        4    213      2
12873597  596  810    99  75.197622       14        4    214      2
12873598  596  811    99  84.087391       14        4    215      2
12873599  596  812    99  83.915797       14        4    216      2

[12873600 rows x 8 columns]

Updated Angle DataFrame:
          src   dst  time       angle  src_mol  dst_mol  nh_id  label
0           1   813     0   61.

In [14]:
filtered_dist_df, filtered_angle_df = map_numeric_indices(filtered_dist_df, filtered_angle_df)

# Display the updated DataFrames
print("Updated Distance DataFrame:")
print(filtered_dist_df)

print("\nUpdated Angle DataFrame:")
print(filtered_angle_df)

Updated Distance DataFrame:
       idx  src  dst  time  distance  src_mol  dst_mol  nh_id  label  ext_roll
0        0    1  101     0  3.010041        1        1      6      1         0
1        1    2  102     0  3.252512        1        1     18      1         0
2        2    3  103     0  2.668763        1        1     16      1         0
3        3    4  104     0  3.411373        1        1     25      1         0
4        4    5  105     0  3.048154        1        1     31      1         0
...    ...  ...  ...   ...       ...      ...      ...    ...    ...       ...
4155  4155   54  156    99  3.339441        4        4    193      1         2
4156  4156   36  136    99  3.117341        4        4    199      1         2
4157  4157   37  138    99  2.918977        4        4    210      1         2
4158  4158   39  140    99  2.896783       11        1     28      2         2
4159  4159   39  149    99  2.937560       11        1     33      2         2

[4160 rows x 10 columns

In [8]:
# Function to process DataFrame and save it to a CSV file
def process_and_save_dataframe(df, filename):
    """
    Process the input DataFrame to add 'ext_roll' and 'idx' columns.
    Convert all columns to integers before saving.
    Save the processed DataFrame to a specified CSV file.

    Parameters:
    - df (pd.DataFrame): Input DataFrame
    - filename (str): Output file path
    """
    num_rows = len(df)

    # Initialize 'ext_roll' column with zeros
    df['ext_roll'] = 0

    # Assign 1 to the middle 15% rows and 2 to the last 15% rows
    df.loc[int(num_rows * 0.7):int(num_rows * 0.85) - 1, 'ext_roll'] = 1
    df.loc[int(num_rows * 0.85):, 'ext_roll'] = 2

    # Insert an 'idx' column at the beginning
    df.insert(0, 'idx', range(len(df)))

    # Reindex and retain only the required columns
    df = df[['idx', 'src', 'dst', 'time', 'label', 'ext_roll', 'nh_id']]

    # # Convert all columns to integers
    # for col in df.columns:
    #     df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Save the updated DataFrame to the specified CSV file
    df.to_csv(filename, index=False)
    print(f"Saved updated DataFrame to {filename}")

    return df

In [9]:

process_and_save_dataframe(dist_df, 'DATA/HB/test_edges1.csv')
process_and_save_dataframe(angle_df, 'DATA/HB/test_edges2.csv')

process_and_save_dataframe(filtered_dist_df, 'DATA/HB/filtered_edges1.csv')
process_and_save_dataframe(filtered_angle_df, 'DATA/HB/filtered_edges2.csv')


Saved updated DataFrame to DATA/HB/test_edges1.csv
Saved updated DataFrame to DATA/HB/test_edges2.csv
Saved updated DataFrame to DATA/HB/filtered_edges1.csv
Saved updated DataFrame to DATA/HB/filtered_edges2.csv


Unnamed: 0,idx,src,dst,time,label,ext_roll,nh_id
0,0,1,188,0,1,0,6
1,1,2,189,0,1,0,18
2,2,3,190,0,1,0,16
3,3,4,191,0,1,0,25
4,4,5,192,0,1,0,31
...,...,...,...,...,...,...,...
4155,4155,54,243,99,1,2,193
4156,4156,36,223,99,1,2,199
4157,4157,37,225,99,1,2,210
4158,4158,39,227,99,2,2,28


In [12]:
final_dist = pd.read_csv("DATA/HB/test_edges1.csv")
final_angle = pd.read_csv("DATA/HB/test_edges2.csv")

# Display the first few rows of both DataFrames
print("Final Distance DataFrame:")
print(final_dist)

print("\nFinal Angle DataFrame:")
print(final_angle)


Updated Distance DataFrame:
               idx  src  dst  time  label  ext_roll  nh_id
0                0    1  597     0      1         0      1
1                1    1  598     0      1         0      2
2                2    1  599     0      1         0      3
3                3    1  600     0      1         0      4
4                4    1  601     0      1         0      5
...            ...  ...  ...   ...    ...       ...    ...
12873595  12873595  596  808    99      2         2    212
12873596  12873596  596  809    99      2         2    213
12873597  12873597  596  810    99      2         2    214
12873598  12873598  596  811    99      2         2    215
12873599  12873599  596  812    99      2         2    216

[12873600 rows x 7 columns]

Updated Angle DataFrame:
               idx  src   dst  time  label  ext_roll  nh_id
0                0    1   813     0      1         0      1
1                1    1   814     0      1         0      2
2                2    1   815

In [13]:
final_filt_dist = pd.read_csv("DATA/HB/filtered_edges1.csv")
final_filt_angle = pd.read_csv("DATA/HB/filtered_edges2.csv")
# Display the first few rows of both DataFrames
print("Filtered Distance DataFrame:")
print(final_filt_dist)

print("\nFiltered Angle DataFrame:")
print(final_filt_angle)

Filtered Distance DataFrame:
       idx  src  dst  time  label  ext_roll  nh_id
0        0    1  101     0      1         0      6
1        1    2  102     0      1         0     18
2        2    3  103     0      1         0     16
3        3    4  104     0      1         0     25
4        4    5  105     0      1         0     31
...    ...  ...  ...   ...    ...       ...    ...
4155  4155   54  156    99      1         2    193
4156  4156   36  136    99      1         2    199
4157  4157   37  138    99      1         2    210
4158  4158   39  140    99      2         2     28
4159  4159   39  149    99      2         2     33

[4160 rows x 7 columns]

Filtered Angle DataFrame:
       idx  src  dst  time  label  ext_roll  nh_id
0        0    1  188     0      1         0      6
1        1    2  189     0      1         0     18
2        2    3  190     0      1         0     16
3        3    4  191     0      1         0     25
4        4    5  192     0      1         0     31
.

In [None]:
# unique_src_numbers = final_angle['src'].unique()
# print(len(unique_src_numbers))

596
