In [1]:
import json
import numpy as np
import pandas as pd
import tqdm
import sklearn
import wandb
from sklearn.model_selection import KFold

import torch
from torch.optim import Adam

from src.scrapped_data_decoders.multi_decoder import MultiDecoder
from src.preprocessing.user_featurizer import UserFeaturizer
from src.preprocessing.edge_builder import EdgeBuilder
from src.preprocessing.community_subscription_matrix_builder import CommunitySubscriptionMatrixBuilder
from src.metrics import Metrics, compute_score
from src.losses import Loss
from src.gnn import GNN, ConvType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw dump. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding (which differs across OSes).
with open("mongo_dump.json", encoding="utf-8") as file:
    data = json.load(file)

In [3]:
# Map the '_id' of every record in data['communities'] to its position in that
# list; the position serves as the dense index used by the cells below.
# NOTE(review): presumably each record corresponds to one user — confirm.
user_id_to_ordered_id = dict(
    (record['_id'], position)
    for position, record in enumerate(data['communities'])
)

## Build user features

In [4]:
# Keep only the user records whose '_id' has an entry in user_id_to_ordered_id.
user_info = [
    user
    for user in data['users']
    if user['_id'] in user_id_to_ordered_id
]

In [5]:
decoder = MultiDecoder()
# Decode every retained user record, preserving the order of user_info.
user_preferences = list(map(decoder.decode, user_info))
# Assemble the feature matrix from the decoded preferences and the id -> index mapping.
user_features = UserFeaturizer().build_feature_matrix(
    user_preferences,
    user_id_to_ordered_id,
)

## Build edges between users

In [6]:
# Build the edges from data['topology']. The builder receives the id -> index
# mapping; presumably it uses it to re-index edge endpoints — confirm in EdgeBuilder.
edge_builder = EdgeBuilder(user_id_to_ordered_id)
edges = edge_builder.build(data['topology'])

100%|██████████| 116641/116641 [00:01<00:00, 81018.14it/s]


## Build target matrix

In [None]:
# Build the target matrix from the community records and the id -> index mapping.
# NOTE(review): the layout (rows = users, columns = communities?) is decided by
# CommunitySubscriptionMatrixBuilder — confirm there.
matrix_builder = CommunitySubscriptionMatrixBuilder()
matrix = matrix_builder.build(data['communities'], user_id_to_ordered_id)

100%|██████████| 92153/92153 [00:01<00:00, 51932.78it/s]
  2%|▏         | 2036/92153 [00:00<00:09, 9599.75it/s] 

# Training

In [None]:
def train_step(nn: GNN, graph: torch.Tensor, edge_index: torch.Tensor, target: torch.Tensor, train_indeces: np.ndarray, optimizer, compute_loss: Loss) -> float:
    """Run one full-graph optimisation step and return the training loss.

    The model is evaluated on the whole graph, but only the rows selected by
    ``train_indeces`` contribute to the loss that is back-propagated.

    Args:
        nn: model called as ``nn(graph, edge_index)``.
        graph: node input forwarded to the model unchanged.
        edge_index: connectivity forwarded to the model unchanged.
        target: ground truth, indexed by ``train_indeces``.
        train_indeces: indices of the rows used for the loss.
        optimizer: optimiser exposing ``zero_grad()`` and ``step()``.
        compute_loss: callable invoked as ``compute_loss(target_rows, predicted_rows)``.

    Returns:
        The loss value as a Python float.
    """
    # Drop gradients left over from the previous step before building a new graph.
    optimizer.zero_grad()
    predictions = nn(graph, edge_index)
    loss = compute_loss(target[train_indeces], predictions[train_indeces])
    loss.backward()
    optimizer.step()
    return loss.item()

In [None]:
def train(config: dict, graph, edge_index, y_true, train_indeces, test_indeces):
    """Train a freshly built GNN on one split and log per-epoch metrics to wandb.

    The model dimensions are derived from the ``graph`` and ``y_true`` arguments
    rather than from notebook-level globals, so the function only depends on
    what it is given.

    Args:
        config: experiment settings; the keys ``conv_type``, ``backbone_config``,
            ``metric`` and ``epochs`` are read.
        graph: node features; must expose ``numerical`` (its second dimension is
            used as the model input size) and ``categorical`` (its per-column
            maximum + 1 is passed as ``embeddings``).
        edge_index: connectivity forwarded to the model.
        y_true: target tensor; its last dimension is the model output size.
        train_indeces: row indices used for optimisation and the train metrics.
        test_indeces: row indices used for the test metrics only.
    """
    nn = GNN(
        input_dim=graph.numerical.shape[1],
        output_dim=y_true.shape[-1],
        graph_conv=ConvType.from_string(config['conv_type']),
        embeddings=graph.categorical.max(axis=0) + 1,
        **config['backbone_config'],
    )
    score_type = Metrics.from_string(config["metric"])
    compute_loss = Loss(score_type)
    optimizer = Adam(nn.parameters())
    for _ in range(config['epochs']):
        # train
        nn.train()  # make sure mode-dependent layers (dropout, batch norm) are in training mode
        train_loss = train_step(nn, graph, edge_index, y_true, train_indeces, optimizer, compute_loss)
        # evaluate
        nn.eval()  # evaluation must not use training-mode behaviour
        with torch.no_grad():
            y_pred = nn(graph, edge_index)
            # .item() so both losses are logged as plain floats, like train_loss
            test_loss = compute_loss(y_true[test_indeces], y_pred[test_indeces]).item()
        # y_pred was produced under no_grad, so it carries no autograd history
        train_score = compute_score(y_true[train_indeces], y_pred[train_indeces], score_type)
        test_score = compute_score(y_true[test_indeces], y_pred[test_indeces], score_type)
        wandb.log({
            "loss/train": train_loss,
            "loss/test": test_loss,
            f"{config['metric']}/train": train_score,
            f"{config['metric']}/test": test_score,
        })

# Experiments

In [None]:
def build_experiment_name(config: dict):
    """Return the run-group label ``<conv_type>_depth<depth>_dim<hidden_dim>``.

    ``depth`` and ``hidden_dim`` are read from ``config['backbone_config']``.
    """
    conv_type = config['conv_type']
    backbone = config['backbone_config']
    depth = backbone['depth']
    hidden_dim = backbone['hidden_dim']
    return "{}_depth{}_dim{}".format(conv_type, depth, hidden_dim)

In [None]:
def run_experiment(graph, edge_index, y_true, config: dict):
    """Run one config under 3-fold cross-validation, one wandb run per fold.

    All folds of a config share the wandb group returned by
    ``build_experiment_name(config)``. The split is shuffled with a fixed seed
    (42), so the folds are the same for every config.

    Args:
        graph: node features forwarded to ``train``.
        edge_index: connectivity forwarded to ``train``.
        y_true: target tensor; its length defines the samples being split.
        config: experiment settings, also stored as the wandb run config.

    Raises:
        Whatever ``train`` raises; the active wandb run is closed with a
        non-zero exit code before the exception propagates.
    """
    kfold = KFold(3, random_state=42, shuffle=True)
    group_name = build_experiment_name(config)  # identical for every fold
    for train_indeces, test_indeces in kfold.split(range(len(y_true))):
        wandb.init(
            project="thesis",
            group=group_name,
            config=config,
        )
        try:
            train(config, graph, edge_index, y_true, train_indeces, test_indeces)
        except BaseException:
            # Also covers KeyboardInterrupt: close the run as failed so it is
            # not left open and later folds do not log into it.
            wandb.finish(exit_code=1)
            raise
        else:
            wandb.finish()

In [None]:
from sklearn.model_selection import ParameterGrid
class ConfigGenerator:
    """Enumerates experiment configs over a fixed hyper-parameter grid."""

    def __init__(self):
        # Every combination of these values becomes one experiment config.
        search_space = {
            "hidden_dim": [64, 256],
            "depth": [1, 2],
            "conv_type": ["gcn", "gat", "res", "gin"],
            "metric": ["balanced_accuracy", "map_at_k"],
        }
        self.param_grid = ParameterGrid(search_space)

    def generate(self):
        """Yield one nested config dict per point of ``self.param_grid``.

        Backbone-related values are grouped under ``backbone_config``;
        ``mlp_depth`` (2) and ``epochs`` (100) are the same for every config.
        """
        for point in self.param_grid:
            experiment = {"metric": point["metric"]}
            experiment["backbone_config"] = {
                "hidden_dim": point["hidden_dim"],
                "depth": point["depth"],
                "mlp_depth": 2,
            }
            experiment["conv_type"] = point["conv_type"]
            experiment["epochs"] = 100
            yield experiment

In [None]:
def main():
    """Run every generated config on the notebook-level features, edges and targets.

    Reads ``user_features``, ``edges`` and ``matrix`` from the notebook namespace,
    converts the latter two to tensors, then prints each config and passes it
    to ``run_experiment``.
    """
    features = user_features
    connectivity = torch.tensor(edges)
    targets = torch.tensor(matrix, dtype=torch.float32)
    for experiment_config in ConfigGenerator().generate():
        print(experiment_config)
        run_experiment(features, connectivity, targets, experiment_config)

In [None]:
import os

# Ask wandb to keep its console output quiet for the runs started below
# (WANDB_SILENT is an environment variable read by wandb).
os.environ.update(WANDB_SILENT="true")

In [None]:
# Launch the sweep: main() iterates over every generated config and runs it.
main()