In [67]:
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score
import numpy as np

In [138]:
small_train_df = pd.read_json('data/small_dataset_train.json')
large_train_df = pd.read_json('data/large_dataset_train.json')

In [135]:
large_train_embedding = torch.load('data/large_dataset_train_embedding.pt')
small_train_embedding = torch.load('data/small_dataset_train_embedding.pt')

In [137]:
test_df = pd.read_json('data/test.json')
test_embedding = torch.load('data/test_embedding.pt')

In [139]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]
    

def pad_tensor_sequence(sequence, max_length, embedding_dim, padding_value =1):
    if sequence.size(0)>max_length:
        sequence = sequence[:max_length,:] ## Take the first max _length vector
    padding = torch.full((max_length - sequence.size(0), embedding_dim), 0)
    
    
    padded_sequence = torch.cat((sequence, padding), dim=0)
    
    attn_mask = torch.tensor(sequence.size(0)*[0]+padding.size(0)*[padding_value],dtype=torch.float) ##1 if it is a pad token
    
    return padded_sequence,attn_mask


def data_collator_with_padding(batch, embedding_dim, padding_value=0,max_length=128):
   
    batch_data_attn_mask = [pad_tensor_sequence(item[0], max_length, embedding_dim, padding_value) for item in batch]
    batch_labels = [torch.nn.functional.one_hot(torch.tensor(item[1],dtype=torch.long),7) for item in batch]
   
    batch_data = [item[0] for item in batch_data_attn_mask]
    batch_attention_mask = torch.stack([item[1] for item in batch_data_attn_mask])
    
    batch_data_tensor = torch.stack(batch_data)
    #print(batch_labels)
    batch_labels_tensor = torch.stack(batch_labels)
    
    #batch_labels_tensor = torch.tensor(batch_labels)

    return batch_data_tensor, batch_labels_tensor.float(), batch_attention_mask
    

# Example usage

#data = [v for v in neg_post_embedding_dict.values()]
#data.extend([p for p in adhd_post_embedding_dict.values()])
#labels = [0]*len(neg_post_embedding_dict)
#labels.extend([1]*len(adhd_post_embedding_dict))


#dataloader = DataLoader(dataset, batch_size=16, collate_fn=lambda batch: data_collator_with_padding(batch, 768))

#for batch_data, batch_labels,batch_attention_mask in a:
    
    #pass#print("Batch data shape:", batch_data.shape)
   # print("Batch labels:", batch_labels)

def create_dataloader_from_post_embedding(post_embedding,df,batch_size):
    data = []
    labels = []
    for author,frame in df.groupby('author'):
        data.append(post_embedding[frame.index])
        labels.append(frame['label'].iloc[0])
        
    dataset = CustomDataset(data,labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=lambda batch: data_collator_with_padding(batch, 768))
    
    return dataloader
        
        
    

In [140]:
def compute_centroids(df,userembedder,embedding_matrix):
    author2label = {}
    for group,frame in df.groupby('author'):
        author2label[group] = frame.label.iloc[0]
   
    author2embedding = {}
    
    for user,frame in df.groupby('author'):
        
        user_post_embedding = embedding_matrix[frame.index] ## This is the current user's post embedding
        inp,attn = pad_tensor_sequence(user_post_embedding,128,768)
        with torch.no_grad():
            inp = inp.unsqueeze(0).to(device)
            attn = attn.unsqueeze(0).to(device)
            out = userembedder(inp,attn)
        
            author2embedding[user] = out.squeeze()
        
        
    label2embedding = {i:[] for i in author2label.values()}
    
    for author,label in author2label.items():
        emb = author2embedding[author]
        label2embedding[label].append(emb)
        
    centroids = {key:torch.mean(torch.stack(value),axis=0) for key,value in label2embedding.items()}
    return label2embedding


In [141]:
def evaluate(df,userembedder,embedding_matrix,centroid):
    centroid_matrix = torch.tensor(torch.stack([v for v in centroids.values()]))
    all_labels = []
    all_pred = []
    for user,frame in df.groupby('author'):
        user_post_embedding = embedding_matrix[frame.index] ## This is the current user's post embedding
        label = frame.label.iloc[0]
        all_labels.append(label)
        inp,attn = pad_tensor_sequence(user_post_embedding,128,768)
        with torch.no_grad():
            inp = inp.unsqueeze(0).to(device)
            attn = attn.unsqueeze(0).to(device)
            out = userembedder(inp,attn)
            
            score = torch.cosine_similarity(out,centroid_matrix)
            pred = torch.argmax(score)
            all_pred.append(pred)
            
    return torch.tensor(all_pred),torch.tensor(all_labels)

In [143]:
large_train_dataloader = create_dataloader_from_post_embedding(large_train_embedding,large_train_df,32)
small_train_dataloader =  create_dataloader_from_post_embedding(small_train_embedding,small_train_df,32)

In [146]:
test_dataloader = create_dataloader_from_post_embedding(test_embedding,small_test_df,32)

In [9]:
from torch import nn
import torch

class UserEmbedder(nn.Module):
    
    def __init__(self,n_layer=4):
        super().__init__()
        self.layers = nn.ModuleList([nn.MultiheadAttention(768, 6,batch_first=True) for _ in range(n_layer)])
        self.layer_norm = nn.ModuleList([nn.LayerNorm(768) for _ in range(n_layer)])
        
    def forward(self,x,key_padding_mask=None):
        residual = x
        for multihead_attention,layer_norm in zip(self.layers,self.layer_norm):
           
            x,_ = multihead_attention(x,x,x,key_padding_mask=key_padding_mask)
            x = residual+x
            x = layer_norm(x)
            residual = x
        x = torch.mean(x,axis=1)
        
        return x
    
class Classifier(nn.Module):
    
    def __init__(self,n_layer=4,userembedder=None):
        super().__init__()
        if userembedder:
            self.userembedder = userembedder
        else:
            self.userembedder = UserEmbedder(4)
        self.fc = nn.Linear(768,64)
        self.dropout = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(64,7)
        
    def forward(self,x,src_mask=None):
        x = self.userembedder(x,key_padding_mask=src_mask)
        x = self.fc(x)
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [10]:
import torch
import torch.nn as nn

from math import log
class SupConLoss(nn.Module):
    def __init__(self, temperature=0.07):
        """
        Implementation of the loss described in the paper Supervised Contrastive Learning :
        https://arxiv.org/abs/2004.11362
        :param temperature: int
        """
        super(SupConLoss, self).__init__()
        self.temperature = temperature
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, projections, targets):
        """
        :param projections: torch.Tensor, shape [batch_size, projection_dim]
        :param targets: torch.Tensor, shape [batch_size]
        :return: torch.Tensor, scalar
        """
        device = torch.device("cuda") if projections.is_cuda else torch.device("cpu")

        # dot_product_tempered = torch.mm(projections, projections.T) / self.temperature
        dot_product_tempered = self.cos(projections.unsqueeze(1), projections.unsqueeze(0)) / self.temperature
        # Minus max for numerical stability with exponential. Same done in cross entropy. Epsilon added to avoid log(0)
        exp_dot_tempered = (
            torch.exp(dot_product_tempered - torch.max(dot_product_tempered, dim=1, keepdim=True)[0]) + 1e-5
        )

        mask_similar_class = (targets.unsqueeze(1).repeat(1, targets.shape[0]) == targets).to(device)
        mask_anchor_out = (1 - torch.eye(exp_dot_tempered.shape[0])).to(device)
        mask_combined = mask_similar_class * mask_anchor_out
        cardinality_per_samples = torch.sum(mask_combined, dim=1)
        ## to avoid nan value of the loss if there is only one sample of a category on the batch
        for i in range(cardinality_per_samples.size(0)):
            if cardinality_per_samples[i]==0:
                cardinality_per_samples[i] = 1

        log_prob = -torch.log(exp_dot_tempered / (torch.sum(exp_dot_tempered * mask_anchor_out, dim=1, keepdim=True)))
        supervised_contrastive_loss_per_sample = torch.sum(log_prob * mask_combined, dim=1) / cardinality_per_samples
        supervised_contrastive_loss = torch.mean(supervised_contrastive_loss_per_sample)

        return supervised_contrastive_loss


In [11]:
def evaluate(model,val_dataloader,device):
    
    print("----- Evaluating ------")
    
    model.eval()
   
    all_predictions = torch.tensor([])
    all_labels = torch.tensor([])
   
    model = model.to(device)
    
    with torch.no_grad(): ## Disable gradient
         for inputs, labels,attn_mask in tqdm(val_dataloader):

            #inputs,labels = batch
            #print(inputs,attn_mask)
            inputs = inputs.to(device)
            labels = labels.to(device)
            attn_mask = attn_mask.to(device)
            
            logits = model(inputs,attn_mask)
            
            
            
            max_index = torch.argmax(logits,axis=-1).cpu()
            pred = torch.nn.functional.one_hot(max_index,7)
            
            labels = labels.cpu()
            all_predictions = torch.cat((all_predictions,pred),axis=0)
            all_labels = torch.cat((all_labels,labels),axis=0)

    class_indices = torch.argmax(all_labels,axis=1)
    prediction_indices = torch.argmax(all_predictions,axis=1)
    f1 = f1_score(all_predictions,all_labels,average=None)
    acc = torch.sum(class_indices==prediction_indices)/len(prediction_indices)
    print(f1)
    print("ACC",acc)
    return {"pred":all_predictions,"labels":all_labels}

In [14]:
model = Classifier(4)
#model = Naive()
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)
loss_fn = nn.CrossEntropyLoss()
device = 'cuda:1'
loss_fn.to(device)
for i in tqdm(range(7)):
    model.train()
    for inp,label,attn in train_dataloader:
        
        optimizer.zero_grad()

        inp,label,attn = inp.to(device),label.to(device),attn.to(device)
        model = model.to(device)
        logits = model(inp,attn)
        loss = loss_fn(logits,label)
        loss.backward()
        
        optimizer.step()
    evaluate(model,test_dataloader,'cuda:1')
    #print(loss.item())

  0%|                                                                                                                                                                                 | 0/7 [00:00<?, ?it/s]

----- Evaluating ------



  1 if key_padding_mask is not None else 0 if attn_mask is not None else None)

 16%|███████████████████████████                                                                                                                                             | 5/31 [00:00<00:00, 45.13it/s][A
 35%|███████████████████████████████████████████████████████████▎                                                                                                           | 11/31 [00:00<00:00, 49.99it/s][A
 55%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                           | 17/31 [00:00<00:00, 51.66it/s][A
 74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 23/31 [00:00<00:00, 52.25it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████

[0.14285714 0.11049724 0.23003195 0.         0.4159132  0.
 0.65700483]
ACC tensor(0.3226)


 14%|████████████████████████▏                                                                                                                                                | 1/7 [00:09<00:58,  9.68s/it]


KeyboardInterrupt: 

In [None]:
evaluate(model,test_dataloader,"cuda:1")