In [1]:
import os
import csv
from time import time
from itertools import product
import networkx as nx
import numpy as np
import pandas as pd

In [2]:
mooc_csv = os.path.join("..", "data", "mooc.csv")
mooc_dir = os.path.join("..", "data", "mooc")
os.makedirs(mooc_dir, exist_ok=True)

In [3]:
df = pd.read_csv(mooc_csv, header=None, skiprows=1)
num_cols = len(df.columns)
edge_entities = ["user_idx", "item_idx", "timestamp", "user_label"]
num_feats = num_cols - len(edge_entities)  # Exclude user_id, item_id, timestamp and user label
df.columns = edge_entities + ["feat_{}".format(i) for i in range(num_feats)]
df["timestamp"] = df["timestamp"].astype(int)
df.sort_values(by="timestamp", inplace=True)
df.head(10)

Unnamed: 0,user_idx,item_idx,timestamp,user_label,feat_0,feat_1,feat_2,feat_3
0,0,0,0,0,-0.319991,-0.435701,0.106784,-0.067309
1,0,1,6,0,-0.319991,-0.435701,0.106784,-0.067309
2,0,2,41,0,-0.319991,-0.435701,0.106784,-0.067309
3,0,1,49,0,-0.319991,-0.435701,0.106784,-0.067309
4,0,2,51,0,-0.319991,-0.435701,0.106784,-0.067309
5,0,3,55,0,-0.319991,-0.435701,0.106784,-0.067309
6,0,4,59,0,-0.319991,-0.435701,0.106784,-0.067309
7,0,5,62,0,-0.319991,-0.435701,0.106784,-0.067309
8,0,6,65,0,-0.319991,-0.435701,0.106784,-0.067309
9,0,7,113,0,-0.319991,-0.435701,1.108826,12.777235


In [4]:
user_idx = sorted(df["user_idx"].unique())
item_idx = sorted(df["item_idx"].unique())
timestamps = set(df["timestamp"])

print("{} Users: {} - {}".format(len(user_idx), min(user_idx), max(user_idx)))
print("{} Items: {} - {}".format(len(item_idx), min(item_idx), max(item_idx)))
print("{} Timestamps: {} - {}".format(len(timestamps), min(timestamps), max(timestamps)))

7047 Users: 0 - 7046
97 Items: 0 - 96
345600 Timestamps: 0 - 2572086


In [5]:
def get_user_id(idx):
    return "u{}".format(idx)

def get_item_id(idx):
    return "i{}".format(idx)

def second_to_days(sec):
    return sec // (60*60*24)

user_ids = [get_user_id(idx) for idx in user_idx]
item_ids = [get_item_id(idx) for idx in item_idx]

df["user_id"] = df["user_idx"].map(get_user_id)
df["item_id"] = df["item_idx"].map(get_item_id)
df["days"] = df["timestamp"].map(second_to_days)
df.head()

Unnamed: 0,user_idx,item_idx,timestamp,user_label,feat_0,feat_1,feat_2,feat_3,user_id,item_id,days
0,0,0,0,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i0,0
1,0,1,6,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i1,0
2,0,2,41,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i2,0
3,0,1,49,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i1,0
4,0,2,51,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i2,0


In [6]:
orig_g = nx.Graph()  # Original bipartite graph
line_g = nx.Graph()  # Line graph of orig_g

orig_g.add_nodes_from(user_ids + item_ids)

all_possible_edges = list(product(user_ids, item_ids))  # All node pair list
edge2idx = {e: i for i, e in enumerate(all_possible_edges)}  # Edge (node pair) -> index

line_g.add_nodes_from(all_possible_edges)  # Add all bipartite graph edges as line graph nodes
nx.set_node_attributes(line_g, 0, "exists")  # Existence labels

In [7]:
day0_df = df[df["days"] == 0]
day0_df.shape

(4953, 11)

In [8]:
edge_feats = np.zeros((len(all_possible_edges), num_feats))  # All edge features (node features of line graph)

In [10]:
for _, row in day0_df.iterrows():
    user, item = row["user_id"], row["item_id"]
    edge_id = (user, item)
    edge_idx = edge2idx[edge_id]
    for i in range(num_feats):
        edge_feats[edge_idx, i] = row["feat_{}".format(i)]  # Set edge features
    
    if orig_g.degree(user) > 0:  # This user is already connected
        for i in orig_g.neighbors(user):
            line_g.add_edge((user, item), (user, i))
    if orig_g.degree(item) > 0:  # This item is already connected
        for u in orig_g.neighbors(item):
            line_g.add_edge((user, item), (u, item))
    orig_g.add_edge(user, item, timestamp=0)
    line_g.nodes[(user, item)]["exists"] = 1

In [11]:
num_possible_edges = len(all_possible_edges)
num_existing_edges = orig_g.number_of_edges()
print("Number of bipartite graph edges: {} / {}".format(num_existing_edges, num_possible_edges))
print("Number of line graph edges:", line_g.number_of_edges())

Number of bipartite graph edges: 3195 / 683559
Number of line graph edges: 432863


In [19]:
def get_daily_graphs(start, end):
    orig_g = nx.Graph()  # Original bipartite graph
    line_g = nx.Graph()  # Line graph of orig_g (node ID: tuple of the original node IDs)
    idx_line_g = nx.Graph()  # Line graph of orig_g (node ID: int)
    
    orig_g.add_nodes_from(user_ids + item_ids)
    line_g.add_nodes_from(all_possible_edges)  # Add all bipartite graph edges as line graph nodes
    idx_line_g.add_nodes_from(range(len(all_possible_edges)))  # Node ID type is int
    nx.set_node_attributes(line_g, 0, "exists")  # Existence labels
    
    day_df = df[(start <= df["days"]) & (df["days"] <= end)]
    num_edges, _ = day_df.shape
    
    # All edge features (node features of line graph)
    edge_feats = np.zeros((len(all_possible_edges), num_feats))
    
    for _, row in day_df.iterrows():
        user, item, ts = row["user_id"], row["item_id"], row["days"]
        edge_id = (user, item)
        edge_idx = edge2idx[edge_id]
        for i in range(num_feats):
            edge_feats[edge_idx, i] = row["feat_{}".format(i)]  # Set edge features

        if orig_g.degree(user) > 0:  # This user is already connected
            for i in orig_g.neighbors(user):
                nb = (user, i)
                nb_idx = edge2idx[nb]
                line_g.add_edge(edge_id, nb)
                idx_line_g.add_edge(edge_idx, nb_idx)
        if orig_g.degree(item) > 0:  # This item is already connected
            for u in orig_g.neighbors(item):
                nb = (u, item)
                nb_idx = edge2idx[nb]
                line_g.add_edge(edge_id, nb)
                idx_line_g.add_edge(edge_idx, nb_idx)
        orig_g.add_edge(user, item, timestamp=ts)
        line_g.nodes[(user, item)]["exists"] = 1
    
    num_possible_edges = len(all_possible_edges)
    num_existing_edges = orig_g.number_of_edges()
    print("Number of bipartite graph edges: {} / {}".format(num_existing_edges, num_possible_edges))
    print("Number of line graph edges:", line_g.number_of_edges())
    
    edge_labels = [line_g.nodes[e]["exists"] for e in all_possible_edges]
    
    return orig_g, line_g, idx_line_g, edge_feats, edge_labels

In [20]:
%%time
d1_orig_g, d1_line_g, d1_idx_line_g, d1_edge_feats, d1_edge_labels = get_daily_graphs(1, 1)

Number of bipartite graph edges: 10027 / 683559
Number of line graph edges: 3371797
CPU times: user 31 s, sys: 1.04 s, total: 32 s
Wall time: 32.2 s


In [21]:
%%time
d2_orig_g, d2_line_g, d2_idx_line_g, d2_edge_feats, d2_edge_labels = get_daily_graphs(2, 2)

Number of bipartite graph edges: 8387 / 683559
Number of line graph edges: 2322958
CPU times: user 23 s, sys: 800 ms, total: 23.8 s
Wall time: 23.9 s


In [29]:
%%time
edge_df = nx.to_pandas_edgelist(d2_idx_line_g, "src", "dst")
edge_df.head()
edge_csv = os.path.join(mooc_dir, "edge_2.csv")
edge_df.to_csv(edge_csv, header=False, index=False)

CPU times: user 18.4 s, sys: 210 ms, total: 18.6 s
Wall time: 18.7 s


In [23]:
label_csv = os.path.join(mooc_dir, "label_2.csv")
with open(label_csv, "w") as wf:
    writer = csv.writer(wf)
    for label in d2_edge_labels:
        writer.writerow([label])

In [28]:
feat_csv = os.path.join(mooc_dir, "feat_2.csv")
#norm_values = d2_edge_feats / np.sum(d2_edge_feats, axis=-1, keepdims=True)  # Normalize
np.savetxt(feat_csv, d2_edge_feats, delimiter=",", fmt="%.17f")  # Same format as the original data

In [30]:
def write_snapshot_csv(out_dir, step, idx_line_g, labels, feats):
    edge_csv = os.path.join(out_dir, "edge_{}.csv".format(step))
    edge_df = nx.to_pandas_edgelist(idx_line_g, "src", "dst")
    edge_df.to_csv(edge_csv, header=False, index=False)
    
    label_csv = os.path.join(out_dir, "label_{}.csv".format(step))
    with open(label_csv, "w") as wf:
        writer = csv.writer(wf)
        for label in labels:
            writer.writerow([label])
    
    feat_csv = os.path.join(mooc_dir, "feat_{}.csv".format(step))
    np.savetxt(feat_csv, feats, delimiter=",", fmt="%.17f")  # Same format as the original data

In [34]:
%%time
num_days = df["days"].max() + 1

for d in range(num_days):
    st = time()
    d_orig_g, d_line_g, d_idx_line_g, d_feats, d_labels = get_daily_graphs(d, d)
    tm = time()
    write_snapshot_csv(mooc_dir, d, d_idx_line_g, d_labels, d_feats)
    ed = time()
    print("Day {}, {:.2f} {:.2f}[s]".format(d, tm-st, ed-tm))

Number of bipartite graph edges: 3195 / 683559
Number of line graph edges: 432863
Day 0, 5.84 5.95[s]
Number of bipartite graph edges: 10027 / 683559
Number of line graph edges: 3371797
Day 1, 35.46 35.14[s]
Number of bipartite graph edges: 8387 / 683559
Number of line graph edges: 2322958
Day 2, 23.54 24.37[s]
Number of bipartite graph edges: 7196 / 683559
Number of line graph edges: 1725676
Day 3, 16.94 21.01[s]
Number of bipartite graph edges: 6383 / 683559
Number of line graph edges: 1399622
Day 4, 14.58 13.26[s]
Number of bipartite graph edges: 5256 / 683559
Number of line graph edges: 976777
Day 5, 10.69 13.27[s]
Number of bipartite graph edges: 4335 / 683559
Number of line graph edges: 698536
Day 6, 8.64 8.34[s]
Number of bipartite graph edges: 5007 / 683559
Number of line graph edges: 682760
Day 7, 8.89 8.28[s]
Number of bipartite graph edges: 9525 / 683559
Number of line graph edges: 1750125
Day 8, 21.45 21.61[s]
Number of bipartite graph edges: 8950 / 683559
Number of line gr