In [3]:
#stl
import os
import warnings

#data handling
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import missingno as mso

#stats
import scipy
import sklearn

#network
import networkx as nx

#vis
import matplotlib.pyplot as plt
import seaborn as sns
import os.path as osp
import time
sns.set(font_scale = 1)
sns.set_style("whitegrid")

#os
import importlib.metadata
import json
import logging
import os
import re
import tempfile
import time
import ast
from pathlib import Path
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, TypeVar, Union

In [4]:
import torch

def donor_recipient_network_creation(subject: Optional[pd.DataFrame] = None,
                                     object: Optional[pd.DataFrame] = None,
                                     link_col: Optional[str] = "CASEID") -> nx.Graph:
    r"""
    Creates donor-recipient network. 

    ARGS: 
        subject: node 1 (feature matrix in pd.dataframe of initial node, node1 x feature)
        predicate: edge label
        object: node 2 (feature matrix in pd.dataframe of target node, node2 x feature)
        link_col: column that is shared between 

    RETURNS:
        nx.Graph of  network
    """ 
    print("------Constructing Donor-Recipient Network------")
    #make all the nodes into an index
    subject_index = [i for i in range(len(subject))]
    object_index = [i + (len(subject) - 1) for i in range(len(object))] #(subject starts at 0, so last index is len(subject) - 1
    subject["NODE_ID"] = subject_index
    object["NODE_ID"] = object_index

    #create edge index
    subject_index = []
    object_index = []

    #filter, so that lookup is < O(n^2)
    shared_links = list(set(subject[link_col].tolist()) & set(object[link_col].tolist()))
    subject_shared = subject[subject[link_col].isin(shared_links)].reset_index(drop=True)
    object_shared = object[object[link_col].isin(shared_links)].reset_index(drop=True)
    
    #match donor to recipient, iterate through donors
    for donor_i in tqdm(range(len(subject_shared)), desc = "Donor-Recipient Matching", bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}'):
        for recipient_i in range(len(object_shared)):
            caseid = subject_shared[link_col][donor_i]

            if object_shared[link_col][recipient_i] == caseid:
                #add edge to graph
                subject_index.append(subject_shared["NODE_ID"][donor_i])
                object_index.append(object_shared["NODE_ID"][recipient_i])

    edge_index = torch.LongTensor(np.array([subject_index, object_index]))
    edge_type = torch.LongTensor(np.array([0 for i in range(len(subject))]))
    edge_weight = torch.LongTensor(np.array([1 for i in range(len(subject))]))

    #stats
    print("Donor-Recipient Network Statistics:")
    print("\tnodes:", edge_index.shape[1] * 2)
    print("\tedges:", edge_index.shape[1])
    assert edge_index.shape[1] == len(shared_links)
    return edge_index, edge_weight, edge_type, subject, object


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/miniconda3/envs/pyg_CUDA/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/miniconda3/envs/pyg_CUDA/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/miniconda3/envs/pyg_CUDA/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start(

In [29]:
# Load in dataset
df = pd.read_csv("/Users/stevenswee/Desktop/BE M227/Processed_Data/encoded_recipient_donor_op_comp_df.csv")

In [30]:
# Get donor features
donor_cols = [col for col in df.columns if col.startswith("d_") or col.startswith("dd_")]
donor = df[donor_cols]

# Rename the columns by dropping the prefixes
donor.rename(
    columns={col: col.lstrip("d_").lstrip("d_") for col in donor_cols}, 
    inplace=True
)

# Find shared features between donor and recipient
shared_columns = donor.columns
shared_columns = [col for col in shared_columns if col in df.columns]

# Filter for shared features
recipient = df[shared_columns]
recipient["CASEID"] = df["CASEID"]
donor = donor[shared_columns]
donor["CASEID"] = df["CASEID"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  donor.rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipient["CASEID"] = df["CASEID"]


In [31]:
dr_edge_index, dr_edge_weight, dr_edge_type, subject, object = donor_recipient_network_creation(subject=donor, object = recipient)


------Constructing Donor-Recipient Network------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  object["NODE_ID"] = object_index
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for donor_i in tqdm(range(len(subject_shared)), desc = "Donor-Recipient Matching", bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}'):


Donor-Recipient Matching:   0%|          | 0/2354 [00:00<?, ?it/s]

Donor-Recipient Network Statistics:
	nodes: 4708
	edges: 2354
