In [5]:
import os
import pandas as pd

def concatenate_saved_data(directory):
    """Load every ``.csv`` file in ``directory`` and stack them into one DataFrame.

    Files are read in ascending order of the integer found after the last
    underscore of the file name and before its first dot (``scores_2.csv``
    is read before ``scores_10.csv``). After stacking, the ``day`` column is
    incremented by one so that days are counted from 1.

    Args:
        directory: Path of the folder that holds the CSV files.

    Returns:
        One DataFrame containing the rows of all files in file order, with a
        fresh ``0..n-1`` index.

    Raises:
        ValueError: If the folder contains no ``.csv`` file, or a file name
            does not yield an integer sort key.
        KeyError: If the combined data has no ``day`` column.
    """
    def file_number(file_name):
        # Parse the bare file name, never the joined path: an underscore in
        # a directory name must not leak into the numeric sort key.
        return int(os.path.basename(file_name).split('_')[-1].split('.')[0])

    csv_names = [f for f in os.listdir(directory) if f.endswith('.csv')]
    if not csv_names:
        # pd.concat([]) would fail with an unhelpful message; name the folder.
        raise ValueError(f'No .csv files found in {directory!r}')
    csv_names.sort(key=file_number)

    data_frames = [pd.read_csv(os.path.join(directory, f)) for f in csv_names]
    concatenated_data = pd.concat(data_frames, ignore_index=True)
    concatenated_data['day'] = concatenated_data['day'] + 1  # Count days from 1
    return concatenated_data

# Stack the per-day KS score files into one table of edges.
similarity_data = concatenate_saved_data('data/preprocessed/daily_ks_scores/')

# Report how many rows were loaded and which day labels are present.
print(
    f'Similarity Data: {similarity_data.shape[0]} edges',
    f'Days: {sorted(similarity_data["day"].unique())}',
    sep='\n',
)

Similarity Data: 12598529 edges
Days: [1, 2]


In [9]:
def get_top_n_data(df, n):
    """Select the ``n`` undirected edges with the smallest mean ``ks_dist``.

    Each row is keyed by the sorted pair of its ``unique_from`` and
    ``unique_to`` values, so both directions of an edge share one key.
    ``ks_dist`` is averaged per key and the ``n`` keys with the lowest
    average are kept.

    Args:
        df: DataFrame with ``unique_from``, ``unique_to`` and ``ks_dist``
            columns.
        n: Number of edges to select.

    Returns:
        A ``(top_edges, filtered_df)`` tuple: ``top_edges`` is the list of
        selected edge tuples in ascending order of average distance, and
        ``filtered_df`` holds every row of ``df`` belonging to one of those
        edges, re-indexed from 0.

    Note:
        ``df`` is modified in place: a ``normalized_edge`` column is added
        (or overwritten). ``get_negative_samples`` in this notebook reads
        that column, so the side effect is kept deliberately.
    """
    # Pair the two columns directly instead of df.apply(..., axis=1): the
    # row-wise apply builds a Series per row (very slow on millions of rows)
    # and casts each row to one common dtype, which can turn integer node
    # ids into floats inside the tuple.
    df['normalized_edge'] = [
        tuple(sorted(pair)) for pair in zip(df['unique_from'], df['unique_to'])
    ]
    edge_avg_ks = (
        df.groupby('normalized_edge')['ks_dist']
        .mean()
        .reset_index(name='avg_ks_dist')
    )
    top_edges = edge_avg_ks.nsmallest(n, 'avg_ks_dist')['normalized_edge'].tolist()
    filtered_df = df[df['normalized_edge'].isin(top_edges)].reset_index(drop=True)
    return top_edges, filtered_df

# Number of lowest-distance edges to keep.
total_edges = 10000
top_n_edges, top_n_df = get_top_n_data(similarity_data, total_edges)

# Every node that is an endpoint of at least one selected edge.
top_n_nodes = set().union(*top_n_edges)

top_n_df.to_csv(
    f'data/preprocessed/averaged_ks_scores/top_{total_edges}.csv',
    index=False,
)

print(
    f'Edge Count in Top {total_edges}: {top_n_df.shape[0]} edges',
    f'Node Count in Top {total_edges}: {len(top_n_nodes)} nodes',
    sep='\n',
)

Edge Count in Top 10000: 11440 edges
Node Count in Top 10000: 2881 nodes


In [10]:
def get_complete_n_data(df, valid_nodes):
    """Keep only the rows whose two endpoints both belong to ``valid_nodes``.

    Args:
        df: DataFrame with ``unique_from`` and ``unique_to`` columns.
        valid_nodes: Collection of node values that are allowed as endpoints.

    Returns:
        A new DataFrame, re-indexed from 0, with the rows of ``df`` where
        both ``unique_from`` and ``unique_to`` are in ``valid_nodes``.
    """
    # Materialise once so a one-shot iterable can be used for both columns.
    node_values = list(valid_nodes)
    # Column-wise membership tests replace a per-row Python lambda, which is
    # far slower on large frames and ill-defined on an empty frame.
    mask = df['unique_from'].isin(node_values) & df['unique_to'].isin(node_values)
    return df[mask].reset_index(drop=True)

# All rows of the full table whose endpoints are both top-N nodes.
complete_n_df = get_complete_n_data(df=similarity_data, valid_nodes=top_n_nodes)
complete_n_df.to_csv(
    f'data/preprocessed/averaged_ks_scores/complete_graph_top_{total_edges}.csv',
    index=False,
)

print(f'Complete Edge Count in Top {total_edges}: {complete_n_df.shape[0]} edges')

Complete Edge Count in Top 10000: 4309796 edges


In [11]:
def get_negative_samples(df, top_edges):
    """Return the rows of ``df`` whose edge is not one of ``top_edges``.

    Edges are compared as unordered node sets, so ``(a, b)`` and ``(b, a)``
    are treated as the same edge.

    Args:
        df: DataFrame that already carries a ``normalized_edge`` column of
            node pairs.
        top_edges: Iterable of node pairs to exclude.

    Returns:
        A new DataFrame, re-indexed from 0, holding only the rows whose
        ``normalized_edge`` does not match any entry of ``top_edges``.
    """
    excluded = set(map(frozenset, top_edges))
    row_edges = df['normalized_edge'].apply(frozenset)
    is_top_edge = row_edges.isin(excluded)
    return df[~is_top_edge].reset_index(drop=True)

# Rows between top-N nodes that are not themselves top-N edges.
negative_samples = get_negative_samples(df=complete_n_df, top_edges=top_n_edges)
negative_samples.to_csv(
    f'data/preprocessed/averaged_ks_scores/negative_top_{total_edges}.csv',
    index=False,
)

print(f'Negative Edge Count in Top {total_edges}: {negative_samples.shape[0]} edges')

Negative Edge Count in Top 10000: 4298356 edges
