In [1]:
import json
import pandas as pd
import tqdm
import sklearn

from src.scrapped_data_decoders.multi_decoder import MultiDecoder
from src.preprocessing.user_featurizer import UserFeaturizer
from src.preprocessing.edge_builder import EdgeBuilder
from src.preprocessing.community_subscription_matrix_builder import CommunitySubscriptionMatrixBuilder
from src.metrics import Metrics, compute_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("mongo_dump.json") as file:
    data = json.load(file)

In [3]:
user_id_to_ordered_id = {
    user['_id']: i 
    for i, user in enumerate(data['communities'])
}

## Build user features

In [4]:
user_info = list(filter(lambda user: user['_id'] in user_id_to_ordered_id, data['users']))

In [5]:
decoder = MultiDecoder()
user_preferences = [decoder.decode(user) for user in user_info]
user_features = UserFeaturizer().build_feature_matrix(user_preferences, user_id_to_ordered_id)

## Build edges between users

In [6]:
edge_builder = EdgeBuilder(user_id_to_ordered_id)
edges = edge_builder.build(data['topology'])

100%|██████████| 116641/116641 [00:01<00:00, 83603.93it/s]


In [7]:
edges.shape

(2, 770052)

In [8]:
matrix_builder = CommunitySubscriptionMatrixBuilder()
matrix = matrix_builder.build(data['communities'], user_id_to_ordered_id)

100%|██████████| 92153/92153 [00:01<00:00, 54253.90it/s]
100%|██████████| 92153/92153 [00:04<00:00, 19579.94it/s]


## GNN

In [9]:
from enum import Enum
class ConvType(Enum):
    gcn = 1
    gat = 2
    res = 3
    gin = 4

In [10]:
import torch
from torch.nn import ReLU, Dropout, Embedding
from torch_geometric.nn import MLP, Sequential, BatchNorm
from torch_geometric.nn.conv import GCNConv, ResGatedGraphConv, GATConv, GINConv

class NN(torch.nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        graph_conv: ConvType,
        depth: int,
        mlp_depth: int,
        embeddings: list[int],
        use_batchnorm: bool=False,
        dropout: float=0,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dims = [4, 4, 2, 4, 4, 4, 4]
        self.use_batchnorm = False
        self.dropout = dropout
        self.input_mlp = MLP([input_dim + sum(self.embedding_dims)] + [hidden_dim] * mlp_depth, plain_last=False)
        self.conv_layers = Sequential(
            "x, edge_index",
            [
                self._build_conv_layer(graph_conv)
                for _ in range(depth)
            ]
        )
        self.output_mlp = MLP([hidden_dim] * mlp_depth + [output_dim])
        self.sigmoid = torch.nn.Sigmoid()
        self._build_embeddings(embeddings)
        
    def _build_conv_layer(self, conv_type: ConvType):
        if conv_type == ConvType.gcn:
            conv = (
                GCNConv(self.hidden_dim, self.hidden_dim),
                'x, edge_index -> x',
            )
        elif conv_type == ConvType.gat:
            conv = (
                GATConv(self.hidden_dim, self.hidden_dim),
                'x, edge_index -> x',
            )
        elif conv_type == ConvType.res:
            conv = (
                ResGatedGraphConv(self.hidden_dim, self.hidden_dim),
                'x, edge_index -> x',
            )
        elif conv_type == ConvType.gin:
            conv = (
                GINConv(MLP([self.hidden_dim, self.hidden_dim], train_eps=True)),
                'x, edge_index -> x',
            )
        else:
            raise NotImplementedError()
        conv_layer = [conv]
        if self.use_batchnorm:
            conv_layer.append((BatchNorm(self.hidden_dim), 'x -> x'))
        conv_layer.append(ReLU(inplace=True))
        if self.dropout is not None:
            conv_layer.append((Dropout(p=self.dropout), 'x -> x'))
        conv_layer = Sequential("x, edge_index", conv_layer)
        return conv
    
    def _build_embeddings(self, embedding_sizes):
        self.city_embedding = Embedding(embedding_sizes[0], embedding_dim=self.embedding_dims[0], scale_grad_by_freq=True)
        self.country_embedding = Embedding(embedding_sizes[1], embedding_dim=self.embedding_dims[1], scale_grad_by_freq=True)
        self.sex_embedding = Embedding(embedding_sizes[2], embedding_dim=self.embedding_dims[2], scale_grad_by_freq=True)
        self.politics_embedding = Embedding(embedding_sizes[3], embedding_dim=self.embedding_dims[3], scale_grad_by_freq=True)
        self.life_embedding = Embedding(embedding_sizes[4], embedding_dim=self.embedding_dims[4], scale_grad_by_freq=True)
        self.people_embedding = Embedding(embedding_sizes[5], embedding_dim=self.embedding_dims[5], scale_grad_by_freq=True)
        self.alcohol_embedding = Embedding(embedding_sizes[6], embedding_dim=self.embedding_dims[6], scale_grad_by_freq=True)
        self.embeddings = [
            self.city_embedding,
            self.country_embedding,
            self.sex_embedding,
            self.politics_embedding,
            self.life_embedding,
            self.people_embedding,
            self.alcohol_embedding,
        ]
    
    def forward(self, features, edge_index):
        numerical_features = torch.Tensor(features.numerical)
        categorical_embeddings = self._extract_embeddings(torch.tensor(features.categorical, dtype=torch.long))
        x = torch.cat([numerical_features, categorical_embeddings], dim=-1)
        x = self.input_mlp(x)
        x = self.conv_layers(x, edge_index)
        x = self.output_mlp(x)
        probs = self.sigmoid(x)
        return probs
    
    def _extract_embeddings(self, categorical_features) -> torch.Tensor:
        feature_embeds = []
        for feature_idx in range(categorical_features.shape[1]):
            feature = categorical_features[:, feature_idx]
            feature_embed = self.embeddings[feature_idx](feature)
            feature_embeds.append(feature_embed)
        feature_embeds = torch.cat(feature_embeds, dim=-1)
        return feature_embeds

# Training

In [59]:
nn = NN(
    input_dim=user_features.numerical.shape[1],
    hidden_dim=200,
    output_dim=matrix.shape[-1],
    graph_conv=ConvType.res,
    depth=1,
    mlp_depth=2,
    embeddings=user_features.categorical.max(axis=0) + 1,
)

In [60]:
edge_index = torch.tensor(edges)

In [61]:
positive_percent = matrix.mean()
positive_percent

0.012672150300767926

In [62]:
y_true = torch.tensor(matrix, dtype=torch.float)

In [63]:
from sklearn.model_selection import train_test_split
train_indeces, test_indeces = train_test_split(list(range(len(matrix))))

In [64]:
def compute_loss(y_true: torch.Tensor, y_pred: torch.Tensor, loss_type: Metrics):
    elementwise_loss = torch.nn.functional.binary_cross_entropy(y_pred, y_true, reduction='none')
    assert len(elementwise_loss.shape) == 2
    with torch.no_grad():
        if loss_type == Metrics.columnwise_balanced_accuracy:
            positive_to_negative_ratio = y_true.mean(axis=0) / (1 - y_true.mean(axis=0))
            positive_coefficients = (y_true > 0.5) / positive_to_negative_ratio
            negative_coefficients = (y_true < 0.5) #* positive_to_negative_ratio
            coefficients = positive_coefficients + negative_coefficients
        elif loss_type == Metrics.balanced_accuracy:
            positive_to_negative_ratio = y_true.mean() / (1 - y_true.mean())
            positive_coefficients = (y_true > 0.5) / positive_to_negative_ratio
            negative_coefficients = (y_true < 0.5) #* positive_to_negative_ratio
            coefficients = positive_coefficients + negative_coefficients
        else:
            raise NotImplementedError()
    balanced_loss = elementwise_loss * coefficients
    loss = balanced_loss.mean()
    return loss

In [76]:
test_indeces[2]

11497

In [65]:
from torch.optim import Adam
N_EPOCHS = 1000
score_type = Metrics.columnwise_balanced_accuracy
optimizer = Adam(nn.parameters())
# compute_loss = torch.nn.BCELoss(reduction='none')
for _ in range(N_EPOCHS):
    # train
    y_pred = nn(user_features, edge_index)
    train_loss = compute_loss(y_true[train_indeces], y_pred[train_indeces], score_type)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    score = compute_score(y_true[train_indeces], y_pred[train_indeces].detach(), score_type)
    print("Train score", score)
    print("Train loss", train_loss.detach().item())
    
    # test
    with torch.no_grad():
        y_pred = nn(user_features, edge_index)
        # unbalanced_loss = compute_loss(y_pred, y_true)
        # positive_coefs = (y_true > 0.5) / positive_percent
        # negative_coefs = (y_true < 0.5) 
        # coefs = positive_coefs + negative_coefs
        # balanced_loss = (unbalanced_loss * coefs)
        # test_loss = (balanced_loss[test_indeces]).mean()
        score = compute_score(y_true[test_indeces], y_pred[test_indeces].detach(), score_type)
    print("Test score", score)
    # print("Test loss", test_loss.detach().item())
    print("----")

Train score 0.501526799599685
Train loss 1.4030890464782715
Test score 0.5199558577332363
----
Train score 0.5215136423844995
Train loss 1.3790507316589355
Test score 0.5329579669976603
----
Train score 0.5342341102111383
Train loss 1.367807388305664
Test score 0.5443956082090738
----
Train score 0.5464865288803594
Train loss 1.359178900718689
Test score 0.5539553381564494
----
Train score 0.5559397856929721
Train loss 1.3519244194030762
Test score 0.5641842037742234
----
Train score 0.5657480849076028
Train loss 1.3451318740844727
Test score 0.5737884837129332
----
Train score 0.5747792326819688
Train loss 1.3384828567504883
Test score 0.582034007834216
----
Train score 0.5833871803853435
Train loss 1.3317434787750244
Test score 0.5901898660369068
----
Train score 0.5909873549458116
Train loss 1.325021505355835
Test score 0.596957442777514
----
Train score 0.5983454434794915
Train loss 1.3175126314163208
Test score 0.6044243046849676
----
Train score 0.6064505060514446
Train loss 1.30

KeyboardInterrupt: 

In [None]:
y_pred.detach().numpy().reshape(-1).max()