In [1]:
import json
import numpy as np
import pandas as pd
import tqdm
import sklearn
import wandb
from sklearn.model_selection import KFold

import torch
from torch.optim import Adam

from src.scrapped_data_decoders.multi_decoder import MultiDecoder
from src.preprocessing.user_featurizer import UserFeaturizer
from src.preprocessing.edge_builder import EdgeBuilder
from src.preprocessing.community_subscription_matrix_builder import CommunitySubscriptionMatrixBuilder
from src.metrics import Metrics, compute_score
from src.losses import Loss
from src.gnn import GNN, ConvType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dump file into memory. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of falling back to the platform default.
with open("mongo_dump.json", encoding="utf-8") as file:
    data = json.load(file)

In [3]:
# Dense positional index for every record in data['communities'], keyed by the
# record's '_id' (a repeated '_id' keeps the position of its last occurrence).
user_id_to_ordered_id = dict(
    (record['_id'], position)
    for position, record in enumerate(data['communities'])
)

## Build user features

In [4]:
# Keep only the user records whose '_id' has an entry in user_id_to_ordered_id.
user_info = [
    user
    for user in data['users']
    if user['_id'] in user_id_to_ordered_id
]

In [5]:
# Decode every retained user record, then build the features object that is
# later handed to the model as its `graph` input.
decoder = MultiDecoder()
user_preferences = list(map(decoder.decode, user_info))
user_features = UserFeaturizer().build_feature_matrix(
    user_preferences,
    user_id_to_ordered_id,
)

## Build edges between users

In [6]:
# Build the edges from the raw topology records. The builder receives the
# id -> index mapping, presumably so that edges are expressed in ordered
# indices (TODO confirm against EdgeBuilder).
edge_builder = EdgeBuilder(user_id_to_ordered_id)
edges = edge_builder.build(data['topology'])

100%|██████████| 116641/116641 [00:01<00:00, 82099.31it/s]


## Build target matrix

In [7]:
# Build the target matrix from the community records and the id -> index
# mapping; it is later converted to a float32 tensor and used as y_true.
matrix_builder = CommunitySubscriptionMatrixBuilder()
matrix = matrix_builder.build(data['communities'], user_id_to_ordered_id)

100%|██████████| 92153/92153 [00:01<00:00, 52553.97it/s]
100%|██████████| 92153/92153 [00:04<00:00, 19519.49it/s]


# Training

In [8]:
def train_step(
    nn: GNN,
    graph: torch.Tensor,
    edge_index: torch.Tensor,
    target: torch.Tensor,
    train_indeces: np.ndarray,
    optimizer,
    compute_loss: Loss,
) -> float:
    """Perform one optimisation step using only the training rows.

    Args:
        nn: model, called as ``nn(graph, edge_index)``.
        graph: first positional input of the model.
        edge_index: second positional input of the model.
        target: ground-truth tensor, indexed by ``train_indeces``.
        train_indeces: rows of ``target`` / the prediction that feed the loss.
        optimizer: optimiser holding the model parameters.
        compute_loss: callable invoked as ``compute_loss(target_rows, predicted_rows)``.

    Returns:
        The training loss of this step as a Python float.
    """
    # Gradients are cleared up front; the forward pass does not read them.
    optimizer.zero_grad()
    predictions = nn(graph, edge_index)
    loss = compute_loss(target[train_indeces], predictions[train_indeces])
    loss.backward()
    optimizer.step()
    return loss.item()

In [9]:
def train(config: dict, graph, edge_index, y_true, train_indeces, test_indeces):
    """Train a fresh GNN on one train/test split and log per-epoch metrics to wandb.

    The model is sized from the ``graph`` and ``y_true`` arguments rather than
    from notebook-level globals, so the function works for whatever inputs the
    caller passes in.

    Args:
        config: experiment settings; reads ``'conv_type'``, ``'backbone_config'``
            (forwarded to ``GNN`` as keyword arguments), ``'metric'`` and ``'epochs'``.
        graph: features object exposing ``.numerical`` and ``.categorical``;
            passed to the model as its first argument.
        edge_index: passed to the model as its second argument.
        y_true: target tensor; its last dimension sets the model output size.
        train_indeces: row indices used for optimisation and the train score.
        test_indeces: row indices used for the test loss and test score.
    """
    nn = GNN(
        input_dim=graph.numerical.shape[1],
        output_dim=y_true.shape[-1],
        graph_conv=ConvType.from_string(config['conv_type']),
        # Per-column maximum + 1; presumably the embedding table sizes
        # (TODO confirm against GNN).
        embeddings=graph.categorical.max(axis=0) + 1,
        **config['backbone_config'],
    )
    score_type = Metrics.from_string(config["metric"])
    compute_loss = Loss(score_type)
    optimizer = Adam(nn.parameters())
    for _ in range(config['epochs']):
        # train
        # Assumes GNN is a torch.nn.Module (it exposes .parameters()); switch
        # back to training mode because evaluation below puts it in eval mode.
        nn.train()
        train_loss = train_step(nn, graph, edge_index, y_true, train_indeces, optimizer, compute_loss)
        # evaluate
        # Eval mode keeps layers such as dropout / batch-norm (if the backbone
        # has any) from perturbing the reported metrics.
        nn.eval()
        with torch.no_grad():
            y_pred = nn(graph, edge_index)
            # .item() so both losses are logged as plain floats.
            test_loss = compute_loss(y_true[test_indeces], y_pred[test_indeces]).item()
            train_score = compute_score(y_true[train_indeces], y_pred[train_indeces], score_type)
            test_score = compute_score(y_true[test_indeces], y_pred[test_indeces], score_type)
        wandb.log({
            "loss/train": train_loss,
            "loss/test": test_loss,
            f"{config['metric']}/train": train_score,
            f"{config['metric']}/test": test_score,
        })

In [10]:
def build_experiment_name(config: dict):
    """Build the wandb group name for an experiment config.

    The name has the form ``<conv_type>_depth<depth>_dim<hidden_dim>``. The
    ``dim`` part is taken from ``hidden_dim`` so that configs with different
    hidden sizes are not merged into the same group.

    Args:
        config: experiment config with ``'conv_type'`` and a ``'backbone_config'``
            dict holding ``'depth'`` and ``'hidden_dim'``.

    Returns:
        The group name as a string.
    """
    backbone = config['backbone_config']
    return (
        f"{config['conv_type']}"
        f"_depth{backbone['depth']}"
        f"_dim{backbone['hidden_dim']}"
    )

In [11]:
def run_experiment(graph, edge_index, y_true, config: dict):
    """Run one config over a shuffled 3-fold split, one wandb run per fold.

    All folds of a config share the wandb group returned by
    ``build_experiment_name(config)``.

    Args:
        graph: model input forwarded to ``train``.
        edge_index: model input forwarded to ``train``.
        y_true: target tensor; its length defines the rows that are split.
        config: experiment config, also stored as the wandb run config.
    """
    kfold = KFold(3, random_state=42, shuffle=True)
    for train_indeces, test_indeces in kfold.split(range(len(y_true))):
        # The run is used as a context manager so it is always closed, even
        # when training raises or the sweep is interrupted by hand.
        with wandb.init(
            project="thesis",
            group=build_experiment_name(config),
            config=config,
        ):
            train(config, graph, edge_index, y_true, train_indeces, test_indeces)

In [12]:
from sklearn.model_selection import ParameterGrid
class ConfigGenerator:
    """Yield one experiment config per point of a hyper-parameter grid.

    Called without arguments it sweeps ``DEFAULT_GRID`` with 5 epochs and an
    MLP depth of 2. The grid, the epoch count and the MLP depth can be
    overridden to run a different sweep without editing the class.
    """

    # Grid swept when no explicit grid is supplied.
    DEFAULT_GRID = {
        "hidden_dim": [64, 256],
        "depth": [1, 2],
        "conv_type": ["gcn", "gat", "res", "gin"],
        "metric": ["balanced_accuracy", "map_at_k"],
    }

    def __init__(self, param_grid=None, epochs=5, mlp_depth=2):
        """Store the sweep definition.

        Args:
            param_grid: grid accepted by sklearn's ``ParameterGrid``; every point
                must provide ``hidden_dim``, ``depth``, ``conv_type`` and
                ``metric``. ``None`` selects ``DEFAULT_GRID``.
            epochs: value written to each config's ``'epochs'`` entry.
            mlp_depth: value written to each config's backbone ``'mlp_depth'``.
        """
        grid = self.DEFAULT_GRID if param_grid is None else param_grid
        self.param_grid = ParameterGrid(grid)
        self.epochs = epochs
        self.mlp_depth = mlp_depth

    def generate(self):
        """Yield a nested config dict for every point of ``self.param_grid``."""
        for pure_config in self.param_grid:
            yield {
                "metric": pure_config["metric"],
                "backbone_config": {
                    "hidden_dim": pure_config["hidden_dim"],
                    "depth": pure_config["depth"],
                    "mlp_depth": self.mlp_depth,
                },
                "conv_type": pure_config["conv_type"],
                "epochs": self.epochs,
            }

In [13]:
def main(seed: int = 42):
    """Run every generated config through the cross-validated experiment.

    Reads the notebook-level ``user_features``, ``edges`` and ``matrix`` built
    by the earlier cells, converts the latter two to tensors and calls
    ``run_experiment`` once per config.

    Args:
        seed: seed for torch's random number generator, set once before the
            sweep so that weight initialisation is reproducible across
            re-runs (the KFold split is already seeded).
    """
    torch.manual_seed(seed)
    graph = user_features
    edge_index = torch.tensor(edges)
    y_true = torch.tensor(matrix, dtype=torch.float32)
    for config in ConfigGenerator().generate():
        print(config)
        run_experiment(graph, edge_index, y_true, config)

In [14]:
# Launch the sweep. The captured output below ends in a KeyboardInterrupt,
# i.e. the recorded run was stopped before all configs were processed.
main()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 64, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'gcn', 'epochs': 5}


[34m[1mwandb[0m: Currently logged in as: [33mimplausible_deniability[0m ([33myort[0m). Use [1m`wandb login --relogin`[0m to force relogin


0,1
loss/test,█▆▄▃▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36296
loss/train,1.36895
score/test,0.54243
score/train,0.54381


0,1
loss/test,█▆▄▃▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36594
loss/train,1.37001
score/test,0.5408
score/train,0.54347


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 128, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'gcn', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.34034
loss/train,1.34868
score/test,0.57048
score/train,0.57164


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.32724
loss/train,1.33543
score/test,0.58465
score/train,0.58725


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 256, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'gcn', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▄▆▇█
score/train,▁▄▆▇█

0,1
loss/test,1.28287
loss/train,1.29608
score/test,0.62409
score/train,0.6271


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▄▆▇█
score/train,▁▄▅▇█

0,1
loss/test,1.28163
loss/train,1.2943
score/test,0.62843
score/train,0.63162


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 64, 'depth': 2, 'mlp_depth': 2}, 'conv_type': 'gcn', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36455
loss/train,1.37088
score/test,0.5411
score/train,0.54249


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36873
loss/train,1.37368
score/test,0.53604
score/train,0.53645


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 128, 'depth': 2, 'mlp_depth': 2}, 'conv_type': 'gcn', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.3232
loss/train,1.33406
score/test,0.59112
score/train,0.59248


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.33582
loss/train,1.34257
score/test,0.57641
score/train,0.57966


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 256, 'depth': 2, 'mlp_depth': 2}, 'conv_type': 'gcn', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▄▅▇█
score/train,▁▄▅▇█

0,1
loss/test,1.27598
loss/train,1.29013
score/test,0.63403
score/train,0.63845


0,1
loss/test,█▆▄▂▁
loss/train,█▅▃▂▁
score/test,▁▄▆▇█
score/train,▁▄▆▇█

0,1
loss/test,1.28245
loss/train,1.29299
score/test,0.62609
score/train,0.62982


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 64, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'gat', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36434
loss/train,1.36966
score/test,0.54098
score/train,0.54229


0,1
loss/test,█▆▄▃▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36656
loss/train,1.36973
score/test,0.53942
score/train,0.54207


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 128, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'gat', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.33538
loss/train,1.34525
score/test,0.57665
score/train,0.57798


0,1
loss/test,█▆▄▃▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.33858
loss/train,1.34567
score/test,0.56844
score/train,0.57153


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 256, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'gat', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▃▂▁
score/test,▁▄▅▇█
score/train,▁▄▅▇█

0,1
loss/test,1.30526
loss/train,1.31677
score/test,0.60741
score/train,0.60935


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.29282
loss/train,1.3047
score/test,0.6194
score/train,0.62357


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 64, 'depth': 2, 'mlp_depth': 2}, 'conv_type': 'gat', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36542
loss/train,1.37199
score/test,0.54178
score/train,0.54284


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.3576
loss/train,1.3627
score/test,0.55193
score/train,0.55344


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 128, 'depth': 2, 'mlp_depth': 2}, 'conv_type': 'gat', 'epochs': 5}


0,1
loss/test,█▆▄▃▁
loss/train,█▅▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▇█

0,1
loss/test,1.33102
loss/train,1.34186
score/test,0.58344
score/train,0.58522


0,1
loss/test,█▆▄▂▁
loss/train,█▆▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.34509
loss/train,1.35067
score/test,0.56538
score/train,0.56817


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 256, 'depth': 2, 'mlp_depth': 2}, 'conv_type': 'gat', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.28674
loss/train,1.30167
score/test,0.62136
score/train,0.62333


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▄▆▇█
score/train,▁▄▆▇█

0,1
loss/test,1.28497
loss/train,1.29724
score/test,0.62535
score/train,0.62803


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 64, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'res', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▆█

0,1
loss/test,1.36021
loss/train,1.36377
score/test,0.54587
score/train,0.54838


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.36771
loss/train,1.37047
score/test,0.53859
score/train,0.53918


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 128, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'res', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.3505
loss/train,1.35729
score/test,0.56304
score/train,0.56419


0,1
loss/test,█▆▄▂▁
loss/train,█▅▄▂▁
score/test,▁▃▅▆█
score/train,▁▃▅▆█

0,1
loss/test,1.34774
loss/train,1.35222
score/test,0.56266
score/train,0.56563


{'metric': 'balanced_accuracy', 'backbone_config': {'hidden_dim': 256, 'depth': 1, 'mlp_depth': 2}, 'conv_type': 'res', 'epochs': 5}


0,1
loss/test,█▆▄▂▁
loss/train,█▅▃▂▁
score/test,▁▃▅▇█
score/train,▁▃▅▇█

0,1
loss/test,1.32036
loss/train,1.32777
score/test,0.60262
score/train,0.60554


KeyboardInterrupt: 