In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored under that path.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
# Google Drive directory where the training cell writes the model checkpoint
# and the export cell writes the course embedding tensor.
SAVE_DIR = '/content/drive/MyDrive/BigData/Data/Model_Data'

In [None]:
import pandas as pd
from torch_geometric.data import HeteroData
from collections import defaultdict
import torch

In [None]:
# Load the knowledge-graph triples: a tab-separated file without a header row,
# whose three columns are named h, r and t here.
kg_df = pd.read_csv(
    "/content/drive/MyDrive/BigData/Data/Model_Data/kg_final.txt",
    sep='\t',
    names=['h', 'r', 't'],
    header=None,
)
kg_df.head()

Unnamed: 0,h,r,t
0,1518,0,2855
1,580,0,2854
2,1782,0,2837
3,2401,0,2847
4,636,0,2828


In [None]:
# Load the user-course interaction table (comma-separated, header row present).
interaction_df = pd.read_csv(
    "/content/drive/MyDrive/BigData/Data/Model_Data/mapped_user_course.csv",
)
interaction_df.head()

Unnamed: 0,user,course,timestamp
0,27868,884,2020-02-06 10:31:52
1,70491,123,2020-03-02 23:41:20
2,33794,674,2019-11-14 19:19:44
3,33794,901,2020-07-17 10:22:11
4,22923,910,2020-03-27 18:47:47


In [None]:
# Build the heterogeneous graph: every KG triple and every user-course
# interaction becomes a typed edge whose source node is a course.
data = HeteroData()
edge_dict = defaultdict(list)

# Relation id (column `r` of kg_df) -> edge relation name.
relation_mapping = {
    0: 'has_field',
    1: 'has_concept',
    2: 'taught_by',
    3: 'belongs_to_school',
    4: 'has_topic',
}

# Relation id -> node type of the tail entity. Any relation id not listed here
# falls back to 'topic', as the original if/elif chain did; the lookup in
# relation_mapping below still raises KeyError for ids outside 0-4.
relation_entity_types = {
    0: 'field',
    1: 'concept',
    2: 'teacher',
    3: 'school',
}

# Per node type: raw entity id from the KG file -> compact 0-based index.
# These maps were referenced but never created, so the cell raised NameError
# on a fresh kernel.
entity_id_maps = defaultdict(dict)

for course_id, relation, entity_id in kg_df[['h', 'r', 't']].itertuples(index=False, name=None):
    course_id, relation, entity_id = int(course_id), int(relation), int(entity_id)
    entity_type = relation_entity_types.get(relation, 'topic')

    # Assign the next free index of this node type the first time an entity is seen.
    id_map = entity_id_maps[entity_type]
    normalized_entity_id = id_map.setdefault(entity_id, len(id_map))

    edge_type = ('course', relation_mapping[relation], entity_type)
    # Store the per-type index rather than the raw id: the normalized id was
    # previously computed and then discarded. Raw ids inflate the node count
    # inferred for each entity type. Course ids are left as they are.
    edge_dict[edge_type].append((course_id, normalized_entity_id))

# Number of distinct entities seen per node type (kept under its original name).
entity_id_counters = defaultdict(int, {t: len(m) for t, m in entity_id_maps.items()})

# User-course interactions become course -> user edges. The columns are
# converted in bulk instead of iterating row by row with iterrows.
registration_edge = ('course', 'registered_by', 'user')
course_ids = interaction_df['course'].astype(int).tolist()
user_ids = interaction_df['user'].astype(int).tolist()
if course_ids:
    edge_dict[registration_edge].extend(zip(course_ids, user_ids))

for edge_type, edges in edge_dict.items():
    if not edges:
        continue
    # (num_edges, 2) list of pairs -> (2, num_edges) edge_index tensor.
    data[edge_type].edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

In [None]:
# Display the assembled graph to inspect its edge types and edge counts.
data

HeteroData(
  (course, has_field, field)={ edge_index=[2, 472] },
  (course, has_concept, concept)={ edge_index=[2, 63303] },
  (course, taught_by, teacher)={ edge_index=[2, 273] },
  (course, belongs_to_school, school)={ edge_index=[2, 2319] },
  (course, has_topic, topic)={ edge_index=[2, 2807] },
  (course, registered_by, user)={ edge_index=[2, 1996390] }
)

In [None]:
from torch_geometric.nn import MetaPath2Vec
from torch_geometric.transforms import ToUndirected

# Apply ToUndirected so the graph contains the `rev_*` edge types that the
# metapath below refers to.
transform = ToUndirected()
data = transform(data)

# The walk schema: leave a course through one relation and come back through
# its reverse, for users, then concepts, then fields.
metapaths = [
    ('course', 'registered_by', 'user'),
    ('user', 'rev_registered_by', 'course'),
    ('course', 'has_concept', 'concept'),
    ('concept', 'rev_has_concept', 'course'),
    ('course', 'has_field', 'field'),
    ('field', 'rev_has_field', 'course'),
]

model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=256,
    metapath=metapaths,
    walk_length=20,
    context_size=5,
    walks_per_node=10,
    num_negative_samples=5,
    sparse=True,
)

In [None]:
from tqdm import tqdm
import os

# Batches of positive/negative random walks produced by the model itself.
loader = model.loader(
    batch_size=256,
    shuffle=True,
    num_workers=2,
)
# The model was built with sparse=True, hence the sparse optimizer.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

# Early-stopping state: lowest epoch loss so far, number of non-improving
# epochs tolerated, and the current count of non-improving epochs.
best_loss, patience, wait = float('inf'), 5, 0

def train():
    """Run one pass over the global `loader` and return the mean batch loss.

    Uses the module-level `model`, `loader` and `optimizer`.
    """
    model.train()
    running_loss = 0
    for pos_rw, neg_rw in tqdm(loader, desc='Training'):
        optimizer.zero_grad()
        batch_loss = model.loss(pos_rw, neg_rw)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

# Train for at most 50 epochs; stop once the epoch loss has failed to improve
# for `patience` epochs in a row.
for epoch in range(1, 51):
    loss = train()
    improved = loss < best_loss
    if improved:
        best_loss, wait = loss, 0
        # Save the best checkpoint seen so far.
        torch.save(model.state_dict(), os.path.join(SAVE_DIR, 'best_metapath2vec.pt'))
    else:
        wait += 1
    if not improved and wait >= patience:
        print(f"Early stopping at epoch {epoch}")
        break
    print(f'Epoch: {epoch}, Loss: {loss:.4f}')

Training: 100%|██████████| 12/12 [01:35<00:00,  7.99s/it]


Epoch: 1, Loss: 9.2351


Training: 100%|██████████| 12/12 [01:25<00:00,  7.12s/it]


Epoch: 2, Loss: 8.5967


Training: 100%|██████████| 12/12 [01:26<00:00,  7.17s/it]


Epoch: 3, Loss: 8.2123


Training: 100%|██████████| 12/12 [01:25<00:00,  7.13s/it]


Epoch: 4, Loss: 7.8611


Training: 100%|██████████| 12/12 [01:26<00:00,  7.17s/it]


Epoch: 5, Loss: 7.5256


Training: 100%|██████████| 12/12 [01:26<00:00,  7.23s/it]


Epoch: 6, Loss: 7.2323


Training: 100%|██████████| 12/12 [01:25<00:00,  7.14s/it]


Epoch: 7, Loss: 6.9214


Training: 100%|██████████| 12/12 [01:33<00:00,  7.80s/it]


Epoch: 8, Loss: 6.6114


Training: 100%|██████████| 12/12 [01:27<00:00,  7.31s/it]


Epoch: 9, Loss: 6.2944


Training: 100%|██████████| 12/12 [01:26<00:00,  7.22s/it]


Epoch: 10, Loss: 5.9832


Training: 100%|██████████| 12/12 [01:25<00:00,  7.16s/it]


Epoch: 11, Loss: 5.6826


Training: 100%|██████████| 12/12 [01:26<00:00,  7.23s/it]


Epoch: 12, Loss: 5.3450


Training: 100%|██████████| 12/12 [01:26<00:00,  7.25s/it]


Epoch: 13, Loss: 5.0647


Training: 100%|██████████| 12/12 [01:25<00:00,  7.15s/it]


Epoch: 14, Loss: 4.7545


Training: 100%|██████████| 12/12 [01:33<00:00,  7.77s/it]


Epoch: 15, Loss: 4.4713


Training: 100%|██████████| 12/12 [01:26<00:00,  7.19s/it]


Epoch: 16, Loss: 4.2075


Training: 100%|██████████| 12/12 [01:26<00:00,  7.22s/it]


Epoch: 17, Loss: 3.9415


Training: 100%|██████████| 12/12 [01:27<00:00,  7.31s/it]


Epoch: 18, Loss: 3.6991


Training: 100%|██████████| 12/12 [01:28<00:00,  7.34s/it]


Epoch: 19, Loss: 3.4685


Training: 100%|██████████| 12/12 [01:32<00:00,  7.71s/it]


Epoch: 20, Loss: 3.2468


Training: 100%|██████████| 12/12 [01:34<00:00,  7.91s/it]


Epoch: 21, Loss: 3.0473


Training: 100%|██████████| 12/12 [01:27<00:00,  7.30s/it]


Epoch: 22, Loss: 2.8612


Training: 100%|██████████| 12/12 [01:26<00:00,  7.18s/it]


Epoch: 23, Loss: 2.6849


Training: 100%|██████████| 12/12 [01:26<00:00,  7.22s/it]


Epoch: 24, Loss: 2.5254


Training: 100%|██████████| 12/12 [01:27<00:00,  7.31s/it]


Epoch: 25, Loss: 2.3717


Training: 100%|██████████| 12/12 [01:27<00:00,  7.26s/it]


Epoch: 26, Loss: 2.2357


Training: 100%|██████████| 12/12 [01:25<00:00,  7.16s/it]


Epoch: 27, Loss: 2.1086


Training: 100%|██████████| 12/12 [01:33<00:00,  7.79s/it]


Epoch: 28, Loss: 1.9943


Training: 100%|██████████| 12/12 [01:26<00:00,  7.18s/it]


Epoch: 29, Loss: 1.8908


Training: 100%|██████████| 12/12 [01:27<00:00,  7.30s/it]


Epoch: 30, Loss: 1.8014


Training: 100%|██████████| 12/12 [01:26<00:00,  7.21s/it]


Epoch: 31, Loss: 1.7183


Training: 100%|██████████| 12/12 [01:26<00:00,  7.21s/it]


Epoch: 32, Loss: 1.6430


Training: 100%|██████████| 12/12 [01:27<00:00,  7.26s/it]


Epoch: 33, Loss: 1.5724


Training: 100%|██████████| 12/12 [01:26<00:00,  7.25s/it]


Epoch: 34, Loss: 1.5178


Training: 100%|██████████| 12/12 [01:34<00:00,  7.87s/it]


Epoch: 35, Loss: 1.4603


Training: 100%|██████████| 12/12 [01:27<00:00,  7.28s/it]


Epoch: 36, Loss: 1.4079


Training: 100%|██████████| 12/12 [01:25<00:00,  7.16s/it]


Epoch: 37, Loss: 1.3655


Training: 100%|██████████| 12/12 [01:26<00:00,  7.22s/it]


Epoch: 38, Loss: 1.3216


Training: 100%|██████████| 12/12 [01:27<00:00,  7.30s/it]


Epoch: 39, Loss: 1.2846


Training: 100%|██████████| 12/12 [01:26<00:00,  7.21s/it]


Epoch: 40, Loss: 1.2487


Training: 100%|██████████| 12/12 [01:26<00:00,  7.19s/it]


Epoch: 41, Loss: 1.2133


Training: 100%|██████████| 12/12 [01:34<00:00,  7.86s/it]


Epoch: 42, Loss: 1.1850


Training: 100%|██████████| 12/12 [01:26<00:00,  7.18s/it]


Epoch: 43, Loss: 1.1567


Training: 100%|██████████| 12/12 [01:27<00:00,  7.32s/it]


Epoch: 44, Loss: 1.1292


Training: 100%|██████████| 12/12 [01:26<00:00,  7.22s/it]


Epoch: 45, Loss: 1.1055


Training: 100%|██████████| 12/12 [01:25<00:00,  7.14s/it]


Epoch: 46, Loss: 1.0786


Training: 100%|██████████| 12/12 [01:27<00:00,  7.28s/it]


Epoch: 47, Loss: 1.0580


Training: 100%|██████████| 12/12 [01:26<00:00,  7.24s/it]


Epoch: 48, Loss: 1.0337


Training: 100%|██████████| 12/12 [01:33<00:00,  7.81s/it]


Epoch: 49, Loss: 1.0164


Training: 100%|██████████| 12/12 [01:27<00:00,  7.27s/it]


Epoch: 50, Loss: 0.9928


In [None]:
# Export the course embeddings with two extra rows: an all-zero padding row at
# index 0 and a mask row (the mean course embedding) at the end.

# Restore the best checkpoint written by the training loop, if there is one.
# Otherwise, when early stopping fires, the in-memory weights are `patience`
# epochs past the best epoch.
best_ckpt_path = os.path.join(SAVE_DIR, 'best_metapath2vec.pt')
if os.path.exists(best_ckpt_path):
    model.load_state_dict(torch.load(best_ckpt_path, map_location='cpu'))

model.eval()

# Build the table without autograd tracking so the saved tensor is a plain
# value, and move it to CPU so it loads on machines without the training device.
with torch.no_grad():
    course_emb = model('course').cpu()

    embed_dim = course_emb.size(1)

    # new_zeros keeps the dtype and device of course_emb so torch.cat cannot mismatch.
    padding_emb = course_emb.new_zeros(1, embed_dim)

    mask_emb = torch.mean(course_emb, dim=0, keepdim=True)

    # Row layout: [padding, course rows in their original order, mask].
    full_course_emb = torch.cat([padding_emb, course_emb, mask_emb], dim=0)

torch.save(full_course_emb, os.path.join(SAVE_DIR, 'course_embeddings.pt'))