In [None]:
import pandas as pd
import requests
import json
import yaml
import glob
import os

# Read the project settings from the YAML file that sits one directory above
# this notebook; `config` is consulted by the cells below (e.g. config["DOMAINS"]).
config_path = os.path.join(os.pardir, "config.yaml")
with open(config_path, mode="rt") as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Load the full edge table and the benchmark edge list (src, dst, ts; no header row).
full_df = pd.read_csv('./predictions/full_data.csv')
edges_dir = os.path.join("..", "FOS_Benchmark", "_".join(config["DOMAINS"]), "edges")
edges = pd.read_csv(os.path.join(edges_dir, "all_edges.csv"), header=None, names=["src", "dst", "ts"])

# The cut-off is the 85th percentile of the benchmark timestamps.
test_start = edges["ts"].quantile(0.85)

# Step 1: split the DataFrame into rows before test_start and rows from test_start onward.
pre_test_start = full_df[full_df['ts'] < test_start]
from_test_start = full_df[full_df['ts'] >= test_start]

# Step 2: collect the unique (src, dst) pairs of each period.
# zip over the two columns builds the tuples directly instead of a row-wise apply.
pre_test_start_edges = set(zip(pre_test_start['src'], pre_test_start['dst']))
from_test_start_edges = set(zip(from_test_start['src'], from_test_start['dst']))

# Step 3: edges that occur from test_start onward but never before it.
new_edges = from_test_start_edges - pre_test_start_edges

# Step 4: keep only the rows from test_start onward whose edge is new.
# The pairs live in a standalone Series aligned on from_test_start's index, so
# no temporary column is written onto a filtered slice of full_df (which would
# raise SettingWithCopyWarning) and nothing has to be dropped afterwards.
from_test_start_pairs = pd.Series(
    list(zip(from_test_start['src'], from_test_start['dst'])),
    index=from_test_start.index,
    dtype=object,
)
result_df = from_test_start[from_test_start_pairs.isin(new_edges)]

# result_df contains only edges from test_start onward that did not exist before test_start.

In [None]:
# (src, dst) pairs of result_df as a set, for fast membership checks below.
result_edges = set(zip(result_df['src'], result_df['dst']))

# One filtered DataFrame per prediction file.
filtered_dfs = []

# NOTE(review): the pattern ends in ".csv.csv" — confirm the prediction files
# really carry the doubled extension, otherwise nothing will match.
file_pattern = r"predictions/*_prediction_output.csv.csv"
# glob.glob returns matches in a filesystem-dependent order; sorting makes the
# row order of final_df reproducible between runs and machines.
csv_files = sorted(glob.glob(file_pattern))

if not csv_files:
    # Without this message an empty match only surfaces later as a KeyError
    # on the missing 'predict' column.
    print("warning: no prediction files match", file_pattern)

for file in csv_files:
    df = pd.read_csv(file)
    # Pair up the two node-id columns and keep the rows whose pair is in result_edges.
    edge_pairs = pd.Series(
        list(zip(df['src_node_id'], df['dst_node_id'])),
        index=df.index,
        dtype=object,
    )
    filtered_dfs.append(df[edge_pairs.isin(result_edges)])

# Combine all filtered DataFrames; fall back to an empty DataFrame when no file was read.
final_df = pd.concat(filtered_dfs, ignore_index=True) if filtered_dfs else pd.DataFrame()

# final_df now contains all rows from the CSV files whose (src, dst) pair occurs in result_df.

In [None]:
# The decision threshold is the first cell of threshold.csv, which is read from
# the same "predictions" directory as the other prediction files.
threshold = pd.read_csv("predictions/threshold.csv").values.flatten()[0]
# Keep only predictions strictly above the threshold; .copy() gives an
# independent frame, so later column assignments do not act on a slice.
final_df = final_df[final_df['predict'] > threshold].copy()

In [None]:
# Display the predictions that remain after the threshold filter.
final_df

In [None]:
# to_dict() yields {column: {row label: value}}, so idx2id['node_id'] maps the
# row label of node_id_mapping.csv to the node id stored in that row.
# NOTE(review): this assumes the row position in node_id_mapping.csv equals the
# integer node index used in the prediction files — confirm.
idx2id = pd.read_csv("node_id_mapping.csv").to_dict()

# Translate both endpoint columns. assign() returns a new frame instead of
# overwriting columns in place, which avoids chained-assignment warnings when
# final_df is a filtered view.
# Caution: running this cell twice maps the already translated ids again;
# values that are not keys of the mapping become NaN.
final_df = final_df.assign(
    src_node_id=final_df["src_node_id"].map(idx2id['node_id']),
    dst_node_id=final_df["dst_node_id"].map(idx2id['node_id']),
)

In [None]:
# Display final_df after src_node_id / dst_node_id were mapped through idx2id.
final_df

In [None]:
# For every configured domain, read its node file and keep all values as a set.
field_nodes = {
    field: set(pd.read_csv(f"../OpenAlex_Knowledge_Graph/nodes/{field}.csv").values.flatten())
    for field in config["DOMAINS"]
}
# Every node that belongs to at least one configured domain.
nodes = set.union(*field_nodes.values())

def domain(node):
    """Return the first domain whose node set contains ``node``.

    Domains are checked in the iteration order of the module-level
    ``field_nodes`` mapping. Returns ``None`` when no domain contains the node.
    """
    matching_fields = (
        field for field, members in field_nodes.items() if node in members
    )
    return next(matching_fields, None)

# Collect the predicted edges whose two endpoints belong to different domains.
edge_list = []
# Iterating the three columns directly keeps each column's own dtype;
# iterrows() would first convert every row to a single common dtype.
for index1, index2, pred in zip(
    final_df["src_node_id"], final_df["dst_node_id"], final_df["predict"]
):
    if index1 in nodes and index2 in nodes:
        if domain(index1) != domain(index2):
            edge_list.append({"source": index1, "destination": index2, "pred": pred})
    else:
        # At least one endpoint is not part of any configured domain.
        print("warning", index1, index2)

# Explicit columns keep the schema intact when no cross-domain edge was found,
# so later steps such as sorting by "pred" still work on an empty result.
new_df = pd.DataFrame(edge_list, columns=["source", "destination", "pred"])

In [None]:
# Display the cross-domain edges (source, destination, pred).
new_df

In [None]:
# Sort by pred in descending order, so the highest predictions come first.
new_df = new_df.sort_values(by="pred", ascending=False)
new_df

In [None]:
# Pull both endpoint columns out as arrays, in new_df's current row order.
source, destination = new_df['source'].values, new_df['destination'].values

In [None]:
# For every predicted edge, ask OpenAlex for works that carry both concepts.
url = "https://api.openalex.org/works"
request_timeout = 30  # seconds; without a timeout requests.get can block indefinitely
all_results = []  # 'results' entries of every successful response (one request per edge, no pagination)
counts = []       # meta.count per edge, or None when the request failed

for i, (src, dst) in enumerate(zip(source, destination), start=1):
    params = {
        "filter": f"concepts.id:{src},concepts.id:{dst}"
    }

    # Stays None when the request fails, so counts always receives exactly one
    # entry per edge and remains aligned with source / destination.
    count = None
    try:
        # Send a GET request to the OpenAlex API
        response = requests.get(url, params=params, timeout=request_timeout)
        response.raise_for_status()  # Raises HTTPError for bad status codes

        data = response.json()  # Parse JSON response

        all_results.extend(data.get('results', []))
        count = data.get('meta', {}).get('count', 0)

        print(f"Request {i}: {count} results")

    except requests.exceptions.RequestException as e:
        print(f"Request {i} failed: {e}")
    except ValueError as e:
        print(f"Failed to parse JSON for request {i}: {e}")

    counts.append(count)

# Save all results into a JSON file.
# The file name is built outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
results_path = "_".join(config["DOMAINS"]) + "_OpenAlex_Results.json"
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=4)

print(f"All results have been saved to '{results_path}'.")

In [None]:
# Attach the per-edge work counts as a new column; a plain list is matched to
# the rows by position, not by index label.
new_df = new_df.assign(counts=counts)
new_df

In [None]:
# Write the table to "<domains joined by _>_discussion.csv".
# The file name is built outside an f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
# to_csv keeps its default index=True, so the row labels are written as well.
discussion_path = "_".join(config["DOMAINS"]) + "_discussion.csv"
new_df.to_csv(discussion_path)

In [None]:
# Number of result records collected across all requests.
len(all_results)