In [1]:
import torch_geometric
import torch

In [2]:
# One edge per i in 0..9, pairing node i with node i + 10, stored as a
# [2, 10] integer tensor (row 0 holds 0..9, row 1 holds 10..19).
# NOTE(review): the next cell reassigns edge_index before anything reads it,
# so this value appears to be unused.
edge_index = torch.stack([torch.arange(10), torch.arange(10, 20)])

In [3]:
# Complete bipartite pairing: every node in 0..9 is paired with every node in
# 10..19, one [a, b] pair per row, giving a tensor of shape [100, 2].
edge_index_list = [[a, b] for a in range(10) for b in range(10, 20)]
edge_index = torch.as_tensor(edge_index_list)

In [4]:
# Wrap the edges in a homogeneous graph object. This assumes edge_index is the
# [num_edges, 2] pair tensor from the previous cell, so it is transposed to
# [2, num_edges] and copied into a contiguous layout before being stored.
data = torch_geometric.data.Data(edge_index=edge_index.t().contiguous())

In [5]:
# Display the stored edge tensor: row 0 is the first node of each edge,
# row 1 the second.
# NOTE(review): the tensor was already made contiguous when the Data object
# was built, so this extra .contiguous() looks redundant.
data.edge_index.contiguous()

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,
          3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,
          5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,
          7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
          9,  9,  9,  9,  9,  9,  9,  9,  9,  9],
        [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15,
         16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13,
         14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 10, 11,
         12, 13, 14, 15, 16, 17, 18, 19, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
         10, 11, 12, 13, 14, 15, 16, 17, 18, 19]])

In [6]:
from torch_geometric.loader import NeighborLoader

In [7]:
# Mini-batch neighbour sampler over the homogeneous graph.
train_loader = NeighborLoader(
    data,
    num_neighbors=[2, 2],   # two entries -> two hops, up to 2 neighbours per node each (per PyG docs)
    input_nodes=range(20),  # seed from every node id 0..19
    directed=True,
    replace=False,          # sample without replacement
    batch_size=5,           # 5 seed nodes per mini-batch
)



In [8]:
# Pull only the first mini-batch from the loader.
# NOTE(review): the loader is built without a shuffle argument, so this is
# presumably the batch seeded with nodes 0-4 — confirm. Those nodes only ever
# appear in the first row of edge_index, never in the second, which would
# explain why the batch printed below contains no edges.
d = next(iter(train_loader))

In [9]:
# Inspect the sampled mini-batch (its summary repr is shown below).
d

Data(edge_index=[2, 0], n_id=[5], e_id=[0], input_id=[5], batch_size=5)

In [10]:
# Edges of the sampled mini-batch; the output below shows an empty [2, 0] tensor.
d.edge_index

tensor([], size=(2, 0), dtype=torch.int64)

# Hetero

We have a heterogeneous graph whose connections encode "students" performing "tasks". We predict link labels (student outcomes on the tasks) using GraphSAGE.

We get unexpected behavior from the NeighborLoader: we want to sample neighbors with "students" as the starting nodes; however, although we set input_nodes=('student', [0]) in the NeighborLoader constructor, the neighborhoods appear to be sampled from the "tasks".

In [65]:
import numpy as np
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

# Edge list as (student_id, task_id) pairs: every student is linked to every task.
n_students, n_tasks = 10, 5
edge_index_list = [
    [student_id, task_id]
    for student_id in range(n_students)
    for task_id in range(n_tasks)
]
# Transpose to [2, num_edges]: row 0 = student ids, row 1 = task ids.
edge_index = torch.as_tensor(edge_index_list).T

data = HeteroData()

# Keep the original node ids on each node type.
data['student'].node_id = torch.arange(n_students)
data['task'].node_id = torch.arange(n_tasks)

# Direct connections from students to tasks.
data['student', "takes", "task"].edge_index = edge_index.contiguous()

# T.ToUndirected() adds the reverse edge type (task, rev_takes, student) so the
# GNN can pass messages in both directions.
data = T.ToUndirected()(data)
# NOTE(review): no edge_attr is assigned anywhere in this cell, so this delete
# looks like a leftover from a template — confirm it is still needed.
del data['task', 'rev_takes', 'student'].edge_attr

# One hop, up to 10 neighbours per edge type, seeded from student 0 only
# (alternative seed set: data['student'].node_id).
train_loader = NeighborLoader(
    data,
    num_neighbors={edge_type: [10] for edge_type in data.edge_types},
    input_nodes=('student', [0]),
    directed=True,
    replace=False,
    batch_size=1,
)

# First mini-batch. In the output below the seed is the single student (it
# carries input_id/batch_size), 5 tasks are pulled in, and all 5 sampled edges
# have type (task, rev_takes, student).
# NOTE(review): that matches sampling the edges that point *into* the seed
# node, which would be the expected behaviour rather than sampling "from the
# tasks" — confirm against the NeighborLoader documentation.
d = next(iter(train_loader))
d

HeteroData(
  [1mstudent[0m={
    node_id=[1],
    n_id=[1],
    input_id=[1],
    batch_size=1
  },
  [1mtask[0m={
    node_id=[5],
    n_id=[5]
  },
  [1m(student, takes, task)[0m={
    edge_index=[2, 0],
    e_id=[0]
  },
  [1m(task, rev_takes, student)[0m={
    edge_index=[2, 5],
    e_id=[5]
  }
)

In [11]:
import numpy as np
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [28]:
# Graph size: number of student nodes and number of task ("code") nodes.
n_students, n_tasks = 10, 5

In [29]:
# One [student_id, task_id] row per edge; every student is paired with every task.
edge_index_list = [
    [student_id, task_id]
    for student_id in range(n_students)
    for task_id in range(n_tasks)
]
# Shape [n_students * n_tasks, 2]; not transposed here.
edge_index = torch.as_tensor(edge_index_list)

In [49]:
data = HeteroData()

# For each node type store the original node ids and a one-hot feature matrix
# (identity matrix: one row per node).
# (Original note: there seem to be students with different mother tongue and
# gender on different occasions.)
for node_type, node_count in (("student", n_students), ("code", n_tasks)):
    data[node_type].node_id = torch.arange(node_count)
    data[node_type].x = torch.eye(node_count)

# Attach the student -> code edges. This assumes edge_index holds one
# (student, code) pair per row, so it is transposed to [2, num_edges] first.
data["student", "takes", "code"].edge_index = edge_index.T.contiguous()

# TODO: edge labels are not attached yet; the disabled line was
#   data['student', "takes", "code"].y = torch.from_numpy(np.array(y)).to(torch.long)

# T.ToUndirected() adds the reverse edge type (code, rev_takes, student) so the
# GNN can pass messages in both directions.
data = T.ToUndirected()(data)
# NOTE(review): no edge_attr is assigned in this cell, so this delete looks like
# a leftover from a template — confirm it is still needed.
del data["code", "rev_takes", "student"].edge_attr
# TODO: once labels exist, the reverse copy would need removing too:
#   del data['code', 'rev_takes', 'student'].y

In [50]:
# Display the forward (student, takes, code) edges; in the output below row 0
# runs over student ids 0..9 and row 1 over code ids 0..4.
data['student', "takes", "code"].edge_index

tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4,
         4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
         9, 9],
        [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3,
         4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2,
         3, 4]])

In [62]:
# Two-hop sampler seeded from student 0 only
# (alternative seed set: data['student'].node_id).
train_loader = NeighborLoader(
    data,
    # Two entries per edge type -> two hops: up to 10 neighbours in the first
    # hop and 1 in the second (per PyG docs).
    num_neighbors={edge_type: [10, 1] for edge_type in data.edge_types},
    input_nodes=('student', [0]),
    directed=True,
    replace=False,
    batch_size=1,
)

In [59]:
# Draw the first mini-batch and display it.
# NOTE(review): the output shown below puts input_id/batch_size on the 'code'
# nodes, while the loader cell above seeds from ('student', [0]), and this
# cell's execution count (59) is lower than the loader cell's (62) — the output
# looks stale; re-run to confirm.
d = next(iter(train_loader))
d

HeteroData(
  [1mstudent[0m={
    node_id=[10],
    x=[10, 10],
    n_id=[10]
  },
  [1mcode[0m={
    node_id=[4],
    x=[4, 5],
    n_id=[4],
    input_id=[1],
    batch_size=1
  },
  [1m(student, takes, code)[0m={
    edge_index=[2, 10],
    e_id=[10]
  },
  [1m(code, rev_takes, student)[0m={
    edge_index=[2, 10],
    e_id=[10]
  }
)

In [34]:
# Display the batch again.
# NOTE(review): execution count 34 predates the cells above, and the output
# below lists only the (student, takes, code) edge type with no reverse edges —
# it appears to come from an earlier version of the graph; re-run to confirm.
d

HeteroData(
  [1mstudent[0m={
    node_id=[1],
    x=[1, 10],
    n_id=[1],
    input_id=[1],
    batch_size=1
  },
  [1mcode[0m={
    node_id=[0],
    x=[0, 5],
    n_id=[0]
  },
  [1m(student, takes, code)[0m={
    edge_index=[2, 0],
    e_id=[0]
  }
)

In [1]:
import pandas as pd

In [2]:
# Load the full Mindsteps CSV from a path relative to the working directory.
# NOTE(review): the score counts shown further down add up to at least ~33.8
# million rows, so this read is likely slow and memory-hungry — consider
# usecols/dtype if only a few columns are needed.
df = pd.read_csv('data/mindsteps_set_full.csv')

In [7]:
# Class balance of the score column: in the output below roughly 66% of the
# scores are 1.0 and 34% are 0.0 (value_counts skips missing values by default).
df['score'].value_counts()

score
1.0    22378592
0.0    11388916
Name: count, dtype: int64