In [1]:
import json
import pandas as pd
import tqdm

from src.scrapped_data_decoders.multi_decoder import MultiDecoder
from src.preprocessing.user_featurizer import UserFeaturizer
from src.preprocessing.edge_builder import EdgeBuilder
from src.preprocessing.community_subscription_matrix_builder import CommunitySubscriptionMatrixBuilder

In [2]:
with open("mongo_dump.json") as file:
    data = json.load(file)

In [5]:
user_id_to_ordered_id = {
    user['_id']: i 
    for i, user in enumerate(data['communities'])
}

## Build user features

In [6]:
user_info = list(filter(lambda user: user['_id'] in user_id_to_ordered_id, data['users']))

In [7]:
decoder = MultiDecoder()
user_preferences = [decoder.decode(user) for user in user_info]
user_features = UserFeaturizer().build_feature_matrix(user_preferences, user_id_to_ordered_id)

## Build edges between users

In [8]:
edge_builder = EdgeBuilder(user_id_to_ordered_id)
edges = edge_builder.build(data['topology'])

100%|██████████| 116641/116641 [00:01<00:00, 76933.98it/s]


In [9]:
edges.shape

(2, 770052)

In [10]:
matrix_builder = CommunitySubscriptionMatrixBuilder()
matrix = matrix_builder.build(data['communities'], user_id_to_ordered_id)

100%|██████████| 92153/92153 [00:02<00:00, 44055.58it/s]
100%|██████████| 92153/92153 [00:04<00:00, 19600.10it/s]


## GNN

In [11]:
from enum import Enum
class ConvType(Enum):
    gcn = 1
    gat = 2
    res = 3
    gin = 4

In [12]:
import torch
from torch.nn import ReLU, Dropout, Embedding
from torch_geometric.nn import MLP, Sequential, BatchNorm
from torch_geometric.nn.conv import GCNConv, ResGatedGraphConv, GATConv, GINConv

class NN(torch.nn.Module):
    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        graph_conv: ConvType,
        depth: int,
        mlp_depth: int,
        embeddings: list[int],
        use_batchnorm: bool=False,
        dropout: float=0,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dims = [4, 4, 2, 4, 4, 4, 4]
        self.use_batchnorm = False
        self.dropout = dropout
        self.input_mlp = MLP([input_dim + sum(self.embedding_dims)] + [hidden_dim] * mlp_depth, plain_last=False)
        self.conv_layers = Sequential(
            "x, edge_index",
            [
                self._build_conv_layer(graph_conv)
                for _ in range(depth)
            ]
        )
        self.output_mlp = MLP([hidden_dim] * mlp_depth + [output_dim])
        self.sigmoid = torch.nn.Sigmoid()
        self._build_embeddings(embeddings)
        
    def _build_conv_layer(self, conv_type: ConvType):
        if conv_type == ConvType.gcn:
            conv = (
                GCNConv(self.hidden_dim, self.hidden_dim),
                'x, edge_index -> x',
            )
        elif conv_type == ConvType.gat:
            conv = (
                GATConv(self.hidden_dim, self.hidden_dim),
                'x, edge_index -> x',
            )
        elif conv_type == ConvType.res:
            conv = (
                ResGatedGraphConv(self.hidden_dim, self.hidden_dim),
                'x, edge_index -> x',
            )
        elif conv_type == ConvType.gin:
            conv = (
                GINConv(MLP([self.hidden_dim, self.hidden_dim], train_eps=True)),
                'x, edge_index -> x',
            )
        else:
            raise NotImplementedError()
        conv_layer = [conv]
        if self.use_batchnorm:
            conv_layer.append((BatchNorm(self.hidden_dim), 'x -> x'))
        conv_layer.append(ReLU(inplace=True))
        if self.dropout is not None:
            conv_layer.append((Dropout(p=self.dropout), 'x -> x'))
        conv_layer = Sequential("x, edge_index", conv_layer)
        return conv
    
    def _build_embeddings(self, embedding_sizes):
        self.city_embedding = Embedding(embedding_sizes[0], embedding_dim=self.embedding_dims[0], scale_grad_by_freq=True)
        self.country_embedding = Embedding(embedding_sizes[1], embedding_dim=self.embedding_dims[1], scale_grad_by_freq=True)
        self.sex_embedding = Embedding(embedding_sizes[2], embedding_dim=self.embedding_dims[2], scale_grad_by_freq=True)
        self.politics_embedding = Embedding(embedding_sizes[3], embedding_dim=self.embedding_dims[3], scale_grad_by_freq=True)
        self.life_embedding = Embedding(embedding_sizes[4], embedding_dim=self.embedding_dims[4], scale_grad_by_freq=True)
        self.people_embedding = Embedding(embedding_sizes[5], embedding_dim=self.embedding_dims[5], scale_grad_by_freq=True)
        self.alcohol_embedding = Embedding(embedding_sizes[6], embedding_dim=self.embedding_dims[6], scale_grad_by_freq=True)
        self.embeddings = [
            self.city_embedding,
            self.country_embedding,
            self.sex_embedding,
            self.politics_embedding,
            self.life_embedding,
            self.people_embedding,
            self.alcohol_embedding,
        ]
    
    def forward(self, features, edge_index):
        numerical_features = torch.Tensor(features.numerical)
        categorical_embeddings = self._extract_embeddings(torch.tensor(features.categorical, dtype=torch.long))
        x = torch.cat([numerical_features, categorical_embeddings], dim=-1)
        x = self.input_mlp(x)
        x = self.conv_layers(x, edge_index)
        x = self.output_mlp(x)
        probs = self.sigmoid(x)
        return probs
    
    def _extract_embeddings(self, categorical_features) -> torch.Tensor:
        feature_embeds = []
        for feature_idx in range(categorical_features.shape[1]):
            feature = categorical_features[:, feature_idx]
            feature_embed = self.embeddings[feature_idx](feature)
            feature_embeds.append(feature_embed)
        feature_embeds = torch.cat(feature_embeds, dim=-1)
        return feature_embeds

  from .autonotebook import tqdm as notebook_tqdm


# Training

In [45]:
nn = NN(
    input_dim=user_features.numerical.shape[1],
    hidden_dim=200,
    output_dim=matrix.shape[-1],
    graph_conv=ConvType.res,
    depth=1,
    mlp_depth=2,
    embeddings=user_features.categorical.max(axis=0) + 1,
)

In [46]:
edge_index = torch.tensor(edges)

In [47]:
nn(user_features, edge_index)
pass

In [48]:
positive_percent = matrix.mean()
positive_percent

0.012672150300767926

In [49]:
y_true = torch.tensor(matrix, dtype=torch.float)

In [None]:
from torch.optim import Adam
from sklearn.metrics import balanced_accuracy_score
N_EPOCHS = 1000
optimizer = Adam(nn.parameters())
compute_loss = torch.nn.BCELoss(reduction='none')
for _ in range(N_EPOCHS):
    y_pred = nn(user_features, edge_index)
    unbalanced_loss = compute_loss(y_pred, y_true)
    positive_coefs = (y_true > 0.5) / positive_percent
    negative_coefs = (y_true < 0.5) 
    coefs = positive_coefs + negative_coefs
    loss = (unbalanced_loss * coefs).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    accuracy = balanced_accuracy_score(y_true.reshape(-1) > 0.5, y_pred.detach().numpy().reshape(-1) > 0.5)
    print("accuracy", accuracy)
    print("loss", loss.detach().item())
    print((y_pred.detach().numpy().reshape(-1) > 0.5).mean())
    print("----")

accuracy 0.4994098183543454
loss 1.4108797311782837
0.47874245367305823
----
accuracy 0.5307962969277773
loss 1.3812289237976074
0.4562474001569853
----
accuracy 0.5506338203779615
loss 1.365121603012085
0.4337192856806977
----
accuracy 0.568524231438401
loss 1.3525575399398804
0.4122101649792555
----
accuracy 0.5836480684469402
loss 1.3416506052017212
0.3957501835715242
----
accuracy 0.5965838661447225
loss 1.3319389820098877
0.37856419939303837
----
accuracy 0.6073804592053107
loss 1.322317361831665
0.3648659656585606
----
accuracy 0.6160670518350762
loss 1.312400221824646
0.354358042241345
----
accuracy 0.6228290274573207
loss 1.3024804592132568
0.345678165659284
----
accuracy 0.6288493901164498
loss 1.2928543090820312
0.34127433000915147
----
accuracy 0.6347701044641083
loss 1.2822420597076416
0.3385772573871713
----


In [16]:
y_pred.detach().numpy().reshape(-1).max()

NameError: name 'y_pred' is not defined