In [1]:
import os
import csv
from time import time
from itertools import product
import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Path of the MOOC interaction CSV: <parent dir>/data/mooc.csv
mooc_csv = os.path.join(os.pardir, "data", "mooc.csv")

# Disabled preview of the first raw rows (kept for reference only; the first
# four fields are user_id, item_id, ts, state and the rest are edge features):
# with open(mooc_csv, "r") as rf:
#     reader = csv.reader(rf)
#     next(reader)
#     for idx, row in enumerate(reader):
#         print(row)
#         [user_id, item_id, ts, state] = row[:4]
#         edge_feats = row[4:]
#         print(edge_feats)
#         if idx > 3:
#             break

['0', '0', '0.0', '0', '-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['0', '1', '6.0', '0', '-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['0', '2', '41.0', '0', '-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['0', '1', '49.0', '0', '-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['0', '2', '51.0', '0', '-0.3199914794575269', '-0.4357014334929225', '0.10678377884769008', '-0.06730923976451772']
['-0.3199914794575269', '-0.4357014334929

In [3]:
# The first four columns identify an interaction; everything after them is an
# edge feature.
edge_entities = ["user_idx", "item_idx", "timestamp", "user_label"]

# The file's own header row is skipped and replaced by the names built below.
df = pd.read_csv(mooc_csv, header=None, skiprows=1)
num_cols = df.shape[1]
num_feats = num_cols - len(edge_entities)  # Exclude user_id, item_id, timestamp and user label
df.columns = edge_entities + ["feat_{}".format(k) for k in range(num_feats)]

# Integer timestamps, rows ordered chronologically.
df["timestamp"] = df["timestamp"].astype(int)
df = df.sort_values(by="timestamp")
df.head(10)

Unnamed: 0,user_idx,item_idx,timestamp,user_label,feat_0,feat_1,feat_2,feat_3
0,0,0,0,0,-0.319991,-0.435701,0.106784,-0.067309
1,0,1,6,0,-0.319991,-0.435701,0.106784,-0.067309
2,0,2,41,0,-0.319991,-0.435701,0.106784,-0.067309
3,0,1,49,0,-0.319991,-0.435701,0.106784,-0.067309
4,0,2,51,0,-0.319991,-0.435701,0.106784,-0.067309
5,0,3,55,0,-0.319991,-0.435701,0.106784,-0.067309
6,0,4,59,0,-0.319991,-0.435701,0.106784,-0.067309
7,0,5,62,0,-0.319991,-0.435701,0.106784,-0.067309
8,0,6,65,0,-0.319991,-0.435701,0.106784,-0.067309
9,0,7,113,0,-0.319991,-0.435701,1.108826,12.777235


In [15]:
# Distinct users: count and index range
user_idx = sorted(df["user_idx"].unique())
print("{} Users: {} - {}".format(len(user_idx), min(user_idx), max(user_idx)))

# Distinct items: count and index range
item_idx = sorted(df["item_idx"].unique())
print("{} Items: {} - {}".format(len(item_idx), min(item_idx), max(item_idx)))

# Distinct timestamps: count and value range
timestamps = set(df["timestamp"])
print("{} Timestamps: {} - {}".format(len(timestamps), min(timestamps), max(timestamps)))

7047 Users: 0 - 7046
97 Items: 0 - 96
345600 Timestamps: 0 - 2572086


In [16]:
def get_user_id(idx):
    """Return the user node name for ``idx``: the index prefixed with ``u``."""
    return "u" + str(idx)

def get_item_id(idx):
    """Return the item node name for ``idx``: the index prefixed with ``i``."""
    return "i" + str(idx)

def second_to_days(sec):
    """Convert a duration in seconds to a count of whole days (floor division)."""
    seconds_per_day = 24 * 60 * 60
    return sec // seconds_per_day

# Node names for every distinct user / item index, in sorted index order.
user_ids = list(map(get_user_id, user_idx))
item_ids = list(map(get_item_id, item_idx))

# Per-interaction derived columns: node names and the whole-day bucket of the
# timestamp.
df["days"] = df["timestamp"].map(second_to_days)
df["user_id"] = df["user_idx"].map(get_user_id)
df["item_id"] = df["item_idx"].map(get_item_id)
# Restore the column order user_id, item_id, days (an existing column keeps
# its position on reassignment, so only the trailing order needs fixing).
df = df[[c for c in df.columns if c != "days"] + ["days"]]
df.head()

Unnamed: 0,user_idx,item_idx,timestamp,user_label,feat_0,feat_1,feat_2,feat_3,user_id,item_id,days
0,0,0,0,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i0,0
1,0,1,6,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i1,0
2,0,2,41,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i2,0
3,0,1,49,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i1,0
4,0,2,51,0,-0.319991,-0.435701,0.106784,-0.067309,u0,i2,0


In [21]:
# Original bipartite graph: one node per user and per item, no edges yet.
orig_g = nx.Graph()
orig_g.add_nodes_from(user_ids + item_ids)

# Every (user, item) pair, user-major, and its position in that list.
all_possible_edges = [(u_id, i_id) for u_id in user_ids for i_id in item_ids]  # All node pair list
edge2idx = dict(zip(all_possible_edges, range(len(all_possible_edges))))  # Edge (node pair) -> index

# Line graph of orig_g: every possible bipartite edge becomes a node, created
# with the existence label "exists" = 0.
line_g = nx.Graph()
line_g.add_nodes_from(all_possible_edges, exists=0)

In [20]:
# Interactions whose day bucket is 0
day0_df = df.loc[df["days"] == 0]
day0_df.shape

(4953, 11)

In [23]:
# All edge features (node features of line graph): one zero-initialised row
# per possible (user, item) pair, one column per feature.
edge_feats = np.zeros(shape=(len(all_possible_edges), num_feats))

In [27]:
# Replay the day-0 interactions in row order, growing the bipartite graph
# (orig_g) and its line graph (line_g) one interaction at a time.
feat_cols = ["feat_{}".format(i) for i in range(num_feats)]
for _, row in day0_df.iterrows():
    user, item = row["user_id"], row["item_id"]
    edge_id = (user, item)
    edge_idx = edge2idx[edge_id]
    # A pair that occurs several times keeps the features of its last row.
    for col_idx, col in enumerate(feat_cols):
        edge_feats[edge_idx, col_idx] = row[col]  # Set edge features

    # Connect this line-graph node to every bipartite edge already present
    # that shares its user or its item. Iterating the neighbours of a node
    # without edges is a no-op, so no degree check is needed.
    # NOTE(review): when (user, item) was already added by an earlier row, it
    # is among these neighbours and the node gets a self-loop in line_g, which
    # a strict line graph would not have -- confirm whether that is intended.
    for other_item in orig_g.neighbors(user):
        line_g.add_edge(edge_id, (user, other_item))
    for other_user in orig_g.neighbors(item):
        line_g.add_edge(edge_id, (other_user, item))

    # Use this row's own timestamp. No `ts` variable is defined by any live
    # cell, so `timestamp=ts` only worked on leftover kernel state.
    orig_g.add_edge(user, item, timestamp=row["timestamp"])
    line_g.nodes[edge_id]["exists"] = 1

In [32]:
# How many of the possible (user, item) pairs are present after the build,
# and how large the resulting line graph is.
num_existing_edges = orig_g.number_of_edges()
num_possible_edges = len(all_possible_edges)
print("Number of bipartite graph edges: {} / {}".format(num_existing_edges, num_possible_edges))

num_line_graph_edges = line_g.number_of_edges()
print("Number of line graph edges:", num_line_graph_edges)

Number of bipartite graph edges: 3195 / 683559
Number of line graph edges: 432863


In [37]:
def get_daily_graphs(start, end, allow_self_loops=True):
    """Build the bipartite interaction graph and its line graph for a day range.

    Rows of the notebook-level ``df`` whose ``days`` value lies in
    ``[start, end]`` (both bounds inclusive) are replayed in the order they
    appear in ``df``. Besides ``df`` the function reads the notebook globals
    ``user_ids``, ``item_ids``, ``all_possible_edges``, ``edge2idx`` and
    ``num_feats``. It prints the resulting edge counts.

    Parameters
    ----------
    start, end : int
        First and last day bucket to include.
    allow_self_loops : bool, optional
        When True (the default, matching the historical behaviour) a
        (user, item) pair that occurs more than once in the range gets a
        self-loop in the line graph, because the pair is already a neighbour
        of its own endpoints when it is seen again. Pass False to skip those
        self-loops and obtain a strict line graph.

    Returns
    -------
    orig_g : nx.Graph
        Graph over all user and item nodes with one edge per interacting
        pair; the edge attribute ``timestamp`` holds the timestamp of the
        last processed row for that pair.
    line_g : nx.Graph
        One node per entry of ``all_possible_edges`` with an ``exists``
        attribute (1 if the pair occurs in the range, else 0); edges join
        occurring pairs that share a user or an item.
    edge_feats : np.ndarray
        Array of shape ``(len(all_possible_edges), num_feats)``; rows of
        pairs that never occur stay zero, repeated pairs keep the features of
        their last processed row.
    edge_labels : list
        The ``exists`` value of every pair, in ``all_possible_edges`` order.
    """
    orig_g = nx.Graph()  # Original bipartite graph
    line_g = nx.Graph()  # Line graph of orig_g
    orig_g.add_nodes_from(user_ids + item_ids)
    line_g.add_nodes_from(all_possible_edges)  # Add all bipartite graph edges as line graph nodes
    nx.set_node_attributes(line_g, 0, "exists")  # Existence labels

    day_df = df[(start <= df["days"]) & (df["days"] <= end)]

    # All edge features (node features of line graph)
    edge_feats = np.zeros((len(all_possible_edges), num_feats))
    feat_cols = ["feat_{}".format(i) for i in range(num_feats)]

    for _, row in day_df.iterrows():
        user, item = row["user_id"], row["item_id"]
        edge_id = (user, item)
        edge_idx = edge2idx[edge_id]
        for col_idx, col in enumerate(feat_cols):
            edge_feats[edge_idx, col_idx] = row[col]  # Set edge features

        # Connect this line-graph node to every bipartite edge already present
        # that shares its user or its item. Iterating the neighbours of a node
        # without edges is a no-op, so no degree check is needed.
        for other_item in orig_g.neighbors(user):
            if allow_self_loops or other_item != item:
                line_g.add_edge(edge_id, (user, other_item))
        for other_user in orig_g.neighbors(item):
            if allow_self_loops or other_user != user:
                line_g.add_edge(edge_id, (other_user, item))

        # Use this row's own timestamp. No `ts` variable is defined by any
        # live cell, so `timestamp=ts` only worked on leftover kernel state.
        orig_g.add_edge(user, item, timestamp=row["timestamp"])
        line_g.nodes[edge_id]["exists"] = 1

    num_possible_edges = len(all_possible_edges)
    num_existing_edges = orig_g.number_of_edges()
    print("Number of bipartite graph edges: {} / {}".format(num_existing_edges, num_possible_edges))
    print("Number of line graph edges:", line_g.number_of_edges())

    edge_labels = [line_g.nodes[e]["exists"] for e in all_possible_edges]

    return orig_g, line_g, edge_feats, edge_labels

In [39]:
%%time
# Build and time the graphs for day 1 only (start == end == 1, both inclusive).
d1_orig_g, d1_line_g, d1_edge_feats, d1_edge_labels = get_daily_graphs(1, 1)

Number of bipartite graph edges: 10027 / 683559
Number of line graph edges: 3371797
CPU times: user 16.3 s, sys: 674 ms, total: 16.9 s
Wall time: 17 s


In [40]:
%%time
# Build and time the graphs for day 2 only (start == end == 2, both inclusive).
d2_orig_g, d2_line_g, d2_edge_feats, d2_edge_labels = get_daily_graphs(2, 2)

Number of bipartite graph edges: 8387 / 683559
Number of line graph edges: 2322958
CPU times: user 17.6 s, sys: 1.07 s, total: 18.7 s
Wall time: 19 s


In [42]:
# Measure how long a single-day graph build takes for each of days 0..9.
# Each iteration overwrites the d_* results of the previous one.
for d in range(10):
    st = time()
    (d_orig_g, d_line_g, d_feats, d_labels) = get_daily_graphs(start=d, end=d)
    ed = time()
    elapsed = ed - st
    print("Day {}, {:.2f}[s]".format(d, elapsed))

Number of bipartite graph edges: 3195 / 683559
Number of line graph edges: 432863
Day 0, 3.99[s]
Number of bipartite graph edges: 10027 / 683559
Number of line graph edges: 3371797
Day 1, 17.83[s]
Number of bipartite graph edges: 8387 / 683559
Number of line graph edges: 2322958
Day 2, 16.62[s]
Number of bipartite graph edges: 7196 / 683559
Number of line graph edges: 1725676
Day 3, 11.63[s]
Number of bipartite graph edges: 6383 / 683559
Number of line graph edges: 1399622
Day 4, 10.08[s]
Number of bipartite graph edges: 5256 / 683559
Number of line graph edges: 976777
Day 5, 7.37[s]
Number of bipartite graph edges: 4335 / 683559
Number of line graph edges: 698536
Day 6, 6.43[s]
Number of bipartite graph edges: 5007 / 683559
Number of line graph edges: 682760
Day 7, 5.85[s]
Number of bipartite graph edges: 9525 / 683559
Number of line graph edges: 1750125
Day 8, 13.29[s]
Number of bipartite graph edges: 8950 / 683559
Number of line graph edges: 1481080
Day 9, 10.45[s]
