In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"]="false"
import torch.nn.functional as F
import torch.nn as nn
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import DataLoader, Dataset
import os
from dataclasses import dataclass
import numpy as np
import random
from tqdm import tqdm
import time
from datetime import timedelta
import pandas as pd
import errno
from torch.cuda.amp import autocast
import gc
import re
import pickle
import psutil
from collections import Counter, defaultdict

In [2]:
@dataclass
class Configuration:
    """Inference settings for the multi-model retrieval ensemble.

    Only the annotated attributes are dataclass fields (and therefore
    constructor arguments). The path tuples below carry no annotation, so
    they are plain class attributes shared by every instance. The path
    tuples are index-aligned: entry ``i`` of each tuple is used together
    with entry ``i`` of the others.
    """

    # Transformer (directories passed to AutoConfig / AutoTokenizer)
    transformer =  (
                        '/kaggle/input/transformer-offline-no-weights/LaBSE',
                        '/kaggle/input/transformer-offline-no-weights/mcontriever-msmarco',
                        '/kaggle/input/transformer-offline-no-weights/paraphrase-multilingual-mpnet-base-v2',
                        '/kaggle/input/transformer-offline-no-weights/stsb-xlm-r-multilingual',
                        '/kaggle/input/transformer-offline-no-weights/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
                    )

    # Weights (state dicts loaded into Net with strict=True)
    checkpoints =  (
                        '/kaggle/input/labse-gpu/weights_end.pth',
                        '/kaggle/input/mcontriever-msmarco-gpu/weights_end.pth',
                        '/kaggle/input/mpnet-gpu/weights_end.pth',
                        '/kaggle/input/stsb-xlm-r-gpu/weights_end.pth',
                        '/kaggle/input/xlm-r-100langs-gpu/weights_end.pth'
                    )

    # Known content IDs
    content_ids = (
                       '/kaggle/input/labse-gpu/content_ids.pt',
                       '/kaggle/input/mcontriever-msmarco-gpu/content_ids.pt',
                       '/kaggle/input/mpnet-gpu/content_ids.pt',
                       '/kaggle/input/stsb-xlm-r-gpu/content_ids.pt',
                       '/kaggle/input/xlm-r-100langs-gpu/content_ids.pt'
                  )

    # Known content features
    content_features =  (
                         '/kaggle/input/labse-gpu/content_features.pt',
                         '/kaggle/input/mcontriever-msmarco-gpu/content_features.pt',
                         '/kaggle/input/mpnet-gpu/content_features.pt',
                         '/kaggle/input/stsb-xlm-r-gpu/content_features.pt',
                         '/kaggle/input/xlm-r-100langs-gpu/content_features.pt'
                        )

    # Known content language
    content_language =  (
                         '/kaggle/input/labse-gpu/content_language.pt',
                         '/kaggle/input/mcontriever-msmarco-gpu/content_language.pt',
                         '/kaggle/input/mpnet-gpu/content_language.pt',
                         '/kaggle/input/stsb-xlm-r-gpu/content_language.pt',
                         '/kaggle/input/xlm-r-100langs-gpu/content_language.pt'
                        )


    # Predict
    max_len: int = 96             # max len of tokenized topic and content
    batch_size: int = 64          # batch size (keep small for max performance/speed)
    margin: float = 0.18          # dynamic threshold margin

    # set num_workers
    # psutil.cpu_count(logical=False) may return None when the core count
    # cannot be determined; fall back to 0 so the value handed to DataLoader
    # is always an int (0 = load in the main process).
    num_workers: int = psutil.cpu_count(logical=False) or 0  # CPU Cores

    # use GPU
    device: str = 'cuda'
    gpu_ids: tuple = (0,)          # (0,1) if T4

    # Testing
    verbose: bool     = False      # show progress bar
    speed_test: bool  = False      # encode 10000 known contents for speed testing
    input_test: bool  = False      # check if all ids are aligned of known content
    output_test: bool = False      # use 5000 topics instead of sample submission and evaluate
    

In [3]:
# Single settings instance (all defaults) that the cells below read from.
config = Configuration() 

In [4]:
class Net(nn.Module):
    """Transformer encoder that returns the first-token embedding of every sequence."""

    def __init__(self, transformer_name):
        """Create the backbone from the configuration found at ``transformer_name``.

        The model is instantiated from its configuration object; the trained
        weights are loaded afterwards by the caller via ``load_state_dict``.
        ``logit_scale`` is not used in ``forward`` but is a registered
        parameter, so it is part of this module's state dict.
        """
        super().__init__()

        self.config = AutoConfig.from_pretrained(transformer_name)
        print(self.config)
        self.transformer = AutoModel.from_config(config=self.config)

        self.logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

    @autocast()
    def forward(self, ids, mask):
        """Encode a batch and return ``last_hidden_state[:, 0, :]`` (the first token's vector)."""
        hidden_states = self.transformer(ids, mask).last_hidden_state
        return hidden_states[:, 0, :]

In [5]:
class EqualDatasetEval(Dataset):
    """Evaluation dataset: tokenizes one text per item and carries its id and language along."""

    def __init__(self, text_list, ids_list, language_list, tokenizer, max_len):
        """Store the parallel lists (text, id, language) and the tokenizer settings."""
        super().__init__()

        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id
        self.max_len = max_len

        self.text_list = text_list
        self.ids_list = ids_list
        self.language_list = language_list

    def __len__(self):
        """Number of items, taken from the id list."""
        return len(self.ids_list)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, text_id, language)`` for one item.

        The text is padded/truncated to ``max_len``; the two tensors are
        returned exactly as the tokenizer produces them with ``return_tensors='pt'``.
        """
        text_id = self.ids_list[index]
        language = self.language_list[index]

        encoded = self.tokenizer.encode_plus(
            self.text_list[index],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
            return_tensors='pt',
        )

        return encoded['input_ids'], encoded['attention_mask'], text_id, language

    def smart_batching_collate(self, batch):
        """Stack the items of a batch and trim trailing columns beyond the longest sequence.

        The cutoff is the largest row sum of the attention mask in the batch,
        so the batch is only as wide as its longest unpadded sequence.
        """
        token_rows = []
        mask_rows = []
        text_id = []
        language = []
        for item in batch:
            token_rows.append(item[0])
            mask_rows.append(item[1])
            text_id.append(item[2])
            language.append(item[3])

        input_ids = torch.cat(token_rows, dim=0)
        mask = torch.cat(mask_rows, dim=0)

        # smart cutoff
        longest = mask.sum(-1).max().to(torch.long)

        return input_ids[:, :longest], mask[:, :longest], text_id, language

In [6]:
def predict(config, model, dataloader):
    """Encode every batch of ``dataloader`` with ``model`` and collect the results.

    Args:
        config: object providing ``verbose`` (wrap the loader in a tqdm bar)
            and ``device`` (where the input tensors are moved).
        model: callable as ``model(ids, mask)``; switched to eval mode here.
        dataloader: yields ``(ids, mask, text_id, language_id)`` batches.

    Returns:
        ``(features, ids_list, language_list)``: the L2-normalized features
        as one float16 tensor on the CPU, plus the ids and languages as
        numpy arrays in loader order.
    """
    model.eval()

    bar = tqdm(dataloader, total=len(dataloader)) if config.verbose else dataloader

    feature_chunks = []
    collected_ids = []
    collected_languages = []

    t0 = time.time()

    with torch.no_grad():

        for ids, mask, text_id, language_id in bar:

            collected_ids.extend(text_id)
            collected_languages.extend(language_id)

            with autocast():
                ids = ids.to(config.device)
                mask = mask.to(config.device)
                feature = F.normalize(model(ids, mask), dim=-1)

            # normalize output is fp32 with autocast
            feature_chunks.append(feature.to(torch.float16))

        # chunks are concatenated first and moved to the CPU in one go
        features = torch.cat(feature_chunks, dim=0).to("cpu")

    if config.verbose:
        bar.close()

    ids_list = np.array(collected_ids)
    language_list = np.array(collected_languages)

    t1 = time.time()

    print(f"Time for feature extraction: {t1-t0:.3f} sec")

    return features, ids_list, language_list

In [7]:
def clean(x):
    """Convert ``x`` to ``str`` and trim surrounding whitespace.

    Strings of length 0 or 1 are returned untouched (so a lone space or
    newline stays as it is).
    """
    text = str(x)
    if len(text) <= 1:
        return text
    # str.strip() without arguments already removes tabs and newlines
    return text.strip()
    
def clean_and_cut(x):
    """Convert ``x`` to ``str``, trim it, drop URLs and keep a short prefix.

    The prefix is the first 32 space-separated tokens, further limited to
    256 characters. Strings of length 0 or 1 are returned untouched.
    """
    text = str(x)
    if len(text) <= 1:
        return text

    # NOTE(review): as written this replace() is a no-op (empty pattern and
    # empty replacement); possibly an invisible character was lost - confirm.
    text = text.strip().replace("", "")

    # remove anything that starts with "http" up to the next whitespace
    text = re.sub(r'http\S+', '', text)

    leading_tokens = text.split(" ")[:32]
    return " ".join(leading_tokens)[:256]

In [8]:
# Submission template and topic table, both indexed by their first column.
df_sample_submission = pd.read_csv(
    "/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv",
    index_col=0,
)
df_topics = pd.read_csv(
    "/kaggle/input/learning-equality-curriculum-recommendations/topics.csv",
    index_col=0,
).fillna({"title": "", "description": ""})

# topic id -> language; ids missing from topics.csv fall back to "unk"
topic2language = dict(zip(df_topics.index, df_topics["language"]))

df_sample_submission["language"] = df_sample_submission.index.map(lambda x : topic2language.get(x, "unk"))

# Replace line breaks by spaces (Windows style first, then plain newlines)
for line_break in (r'\r\n', r'\n'):
    df_topics = df_topics.replace(to_replace=line_break, value=' ', regex=True)

for column in ("title", "description"):
    df_topics[column] = df_topics[column].map(clean)

In [9]:
class Topic:
    """View on one row of the global ``df_topics`` frame, addressed by topic id.

    Only the id is stored on the instance; every other attribute is read
    from ``df_topics`` on access, so the frame must still exist when a
    ``Topic`` is used.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    def __getattr__(self, name):
        # Reached only when normal lookup fails: treat the name as a column
        # of this topic's row.
        return df_topics.loc[self.id][name]

    def __eq__(self, other):
        """Topics are equal when both are ``Topic`` instances with the same id."""
        return isinstance(other, Topic) and self.id == other.id

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"

    @property
    def parent(self):
        """Parent topic, or ``None`` when the ``parent`` column is NA."""
        parent_id = df_topics.loc[self.id].parent
        return None if pd.isna(parent_id) else Topic(parent_id)

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        child_rows = df_topics[df_topics.parent == self.id]
        return [Topic(child_id) for child_id in child_rows.index]

    @property
    def ancestors(self):
        """Chain of parents, nearest first, ending at the topic without a parent."""
        chain = []
        node = self.parent
        while node is not None:
            chain.append(node)
            node = node.parent
        return chain

    @property
    def siblings(self):
        """Other children of this topic's parent (empty list without a parent)."""
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join titles along the ancestor chain with ``separator``.

        The order is the one of ``ancestors`` (this topic first when
        ``include_self`` is set, then parents towards the root); it is not
        reversed. ``include_root=False`` drops the last element of the chain.
        """
        chain = self.ancestors
        if include_self:
            chain = [self] + chain
        if not include_root:
            chain = chain[:-1]
        titles = [node.title for node in chain]
        return separator.join(titles)

In [10]:
# Choose the topics to predict for, together with their languages.
if not config.output_test:
    # Regular run: every topic listed in the sample submission
    topic_test = df_sample_submission.index.values
    language_t = df_sample_submission["language"].values.tolist()
else:
    # Evaluation run: the first 5000 topics of correlations.csv
    df_correlations = pd.read_csv(
        "/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv",
        index_col=0,
    )
    df_correlations["language"] = df_correlations.index.map(lambda x : topic2language.get(x, "unk"))
    topic_test = df_correlations.index.values[:5000]
    language_t = df_correlations["language"].values[:5000].tolist()
    del df_correlations
     
def get_topic2string(topic_test):
    """Map every topic id to its model input string.

    The string is the topic's breadcrumb titles joined by " # " (topic
    itself included), followed by " # " and the topic description. Reads the
    global ``config`` (progress bar switch) and uses ``Topic`` for lookups.
    """
    if config.verbose:
        bar = tqdm(topic_test, total=len(topic_test))
    else:
        bar = topic_test

    topic2string = {}
    for t in bar:
        to = Topic(t)
        breadcrumbs = to.get_breadcrumbs(separator=" # ", include_self=True)
        topic2string[t] = "{} # {}".format(breadcrumbs, to.description)

    return topic2string


topic2string = get_topic2string(topic_test)

# Parallel lists for the topic dataset: ids and their input strings
ids_t = topic_test.tolist()
text_t = [topic2string[t] for t in ids_t]

# The frames are no longer needed once the strings are built
del df_topics, df_sample_submission
gc.collect()

124

In [11]:
if config.input_test:

    # Check if all pre-extracted features have same order
    # (model 0 is the reference every other model is compared against)
    content_ids_0 = np.array(torch.load(config.content_ids[0]))
    content_language_0 = np.array(torch.load(config.content_language[0]))

    for i in range(1, len(config.transformer)):

        print("-"*30, i, "-"*30)

        content_ids = np.array(torch.load(config.content_ids[i]))
        content_language = np.array(torch.load(config.content_language[i]))
        content_features = torch.load(config.content_features[i])

        # np.array_equal is False for arrays of different length, whereas
        # an element-wise `==` does not yield a per-element result then.
        if np.array_equal(content_ids_0, content_ids):
            print("Sucess: Same order of Content and Content")
        else:
            print("Error: Content and Conten have not the same order!!!")

        if np.array_equal(content_language_0, content_language):
            print("Sucess: Same order of Content Language and Content Language")
        else:
            print("Error: Content Language and Content Language have not the same order!!!")

        # The features are later indexed with a mask built from the ids,
        # so both must have the same number of rows.
        if len(content_features) != len(content_ids):
            print("Error: Content Features and Content have not the same length!!!")

        # Free per-model data inside the loop; this also keeps the cleanup
        # valid when only a single model is configured.
        del content_ids, content_language, content_features

    del content_ids_0, content_language_0
    gc.collect()

In [12]:
df_content = pd.read_csv(
    "/kaggle/input/learning-equality-curriculum-recommendations/content.csv",
    index_col=0,
).fillna({"title": "", "description": "", "text": ""})

# Ids / languages of the pre-encoded content (model 0 is the reference order)
content_ids = np.array(torch.load(config.content_ids[0]))
content_language = np.array(torch.load(config.content_language[0]))

# For speed Test on Content: the last 10000 ids are left out of the known
# set, so they are treated as unknown content below
known_content = set(content_ids[:-10000]) if config.speed_test else set(content_ids)

df_content["known"] = df_content.index.map(lambda x : x in known_content)

unknown_content_df = df_content[df_content["known"] == False].copy()
known_content_df = df_content[df_content["known"] == True]

print("Lenght all content:    ", len(df_content))
print("Lenght known content:  ", len(known_content_df))
print("Lenght unknown content:", len(unknown_content_df))

known_content_ids =  set(known_content_df.index)

# Boolean mask over the pre-encoded ids: True where the id is present in
# content.csv and counted as known
known_content_selector = np.array([c in known_content_ids for c in content_ids])

Lenght all content:     154047
Lenght known content:   154047
Lenght unknown content: 0


In [13]:
# New Content
unknown_content_df = unknown_content_df.replace(to_replace= r'\r\n', value= ' ', regex=True)
unknown_content_df = unknown_content_df.replace(to_replace= r'\n', value= ' ', regex=True)

unknown_content_df["title"] = unknown_content_df["title"].map(clean)
unknown_content_df["description"] = unknown_content_df["description"].map(clean)
unknown_content_df["text"] = unknown_content_df["text"].map(clean_and_cut)

unknown_content_df["text_cut"] = unknown_content_df["text"].map(lambda x : " ".join(x.split(" ")[:32]))
unknown_content_df["input"] = unknown_content_df["title"] + " # " + unknown_content_df["description"] + " # " +  unknown_content_df["text_cut"]

text_c = unknown_content_df["input"].values.tolist()
ids_c = unknown_content_df.index.tolist()
language_c = unknown_content_df["language"].values.tolist()

print(f"Lenght of unknown content to process: {len(text_c)}")

del df_content, unknown_content_df, known_content_df
gc.collect()

Lenght of unknown content to process: 0


83

In [14]:
def _build_eval_loader(config, tokenizer, text_list, ids_list, language_list):
    """Wrap the given texts in an ``EqualDatasetEval`` and return an unshuffled DataLoader."""
    dataset = EqualDatasetEval(text_list=text_list,
                               ids_list=ids_list,
                               language_list=language_list,
                               tokenizer=tokenizer,
                               max_len=config.max_len)

    return DataLoader(dataset=dataset,
                      batch_size=config.batch_size,
                      shuffle=False,
                      num_workers=config.num_workers,
                      pin_memory=True,
                      collate_fn=dataset.smart_batching_collate)


def extract_features_model(config, text_t, ids_t, language_t, text_c, ids_c, language_c, known_content_selector):
    """Encode the topics (and any new content) with every configured model.

    For model ``i`` this writes ``topic_features_{i}.pt`` and
    ``content_features_{i}.pt``. Ids and languages (``topic_ids.pt``,
    ``topic_language.pt``, ``content_ids.pt``, ``content_language.pt``) are
    written once, for model 0 only.

    Args:
        config: settings object (model paths, checkpoints, pre-computed
            content files, batch settings, device).
        text_t, ids_t, language_t: parallel lists describing the topics.
        text_c, ids_c, language_c: parallel lists describing content that has
            no pre-computed features; may be empty.
        known_content_selector: boolean mask applied to the pre-computed
            content ids / languages / features.

    Returns:
        None. All results are written to files in the working directory.
    """
    has_new_content = len(text_c) > 0

    for i, t in enumerate(config.transformer):

        print("\n{}[Model: {}]{}".format(20*"-", t, 20*"-"))

        model = Net(transformer_name=t).eval().to(torch.device(config.device))

        tokenizer = AutoTokenizer.from_pretrained(t)

        print("Loading Checkpoint:", config.checkpoints[i])
        model.load_state_dict(torch.load(config.checkpoints[i], map_location=torch.device(config.device)), strict=True)

        # Topics
        topic_loader = _build_eval_loader(config, tokenizer, text_t, ids_t, language_t)
        topic_features, topic_ids, topic_language = predict(config, model, topic_loader)

        torch.save(topic_features, f"topic_features_{i}.pt")

        # Pre-computed content features of this model
        content_features = torch.load(config.content_features[i])[known_content_selector]

        # if new content update content
        if has_new_content:
            content_loader = _build_eval_loader(config, tokenizer, text_c, ids_c, language_c)
            content_features_new, content_ids_new, content_language_new = predict(config, model, content_loader)

            # Add new content behind the known content
            content_features = torch.cat([content_features, content_features_new], dim=0)

        torch.save(content_features, f"content_features_{i}.pt")

        if i == 0:
            # Save only once cause we checked that for all models the same order.
            # Ids / languages are therefore only loaded for this model.
            torch.save(topic_ids, "topic_ids.pt")
            torch.save(topic_language, "topic_language.pt")

            content_ids = np.array(torch.load(config.content_ids[i]))[known_content_selector]
            content_language = np.array(torch.load(config.content_language[i]))[known_content_selector]

            if has_new_content:
                # same order as the feature concatenation above
                content_ids = np.concatenate([content_ids, content_ids_new])
                content_language = np.concatenate([content_language, content_language_new])

            torch.save(content_ids, "content_ids.pt")
            torch.save(content_language, "content_language.pt")

        # Release the model before the next one is created
        del model, content_features, topic_features
        torch.cuda.empty_cache()
        gc.collect()

In [15]:
# Encode the topics (and any unknown content) with every configured model;
# the results are written to .pt files that the next cell loads.
extract_features_model(config, text_t, ids_t, language_t, text_c, ids_c, language_c, known_content_selector)
gc.collect()


--------------------[Model: /kaggle/input/transformer-offline-no-weights/LaBSE]--------------------
BertConfig {
  "_name_or_path": "/kaggle/input/transformer-offline-no-weights/LaBSE",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 501153
}

Loading Checkpoint: /kaggle/inpu

0

In [16]:
# Ids and languages as written by extract_features_model
topic_ids = torch.load("topic_ids.pt")
topic_language = torch.load("topic_language.pt")

content_ids = torch.load("content_ids.pt")
content_language = torch.load("content_language.pt")

# Languages that occur among the topics, most frequent first
language_count = Counter(topic_language).most_common()
language_list = [l for l, _ in language_count]

print("Languages:", language_list)

# language -> sum over all models of the (topics x content) similarity matrix
language2sim = dict()

for i in range(len(config.transformer)):

    # Load onto the configured device rather than a hard-coded "cuda"
    topic_features = torch.load(f"topic_features_{i}.pt", map_location=torch.device(config.device))
    content_features = torch.load(f"content_features_{i}.pt", map_location=torch.device(config.device))

    print(f"\nCalculate Similiarity for: {config.transformer[i]}")

    for language in language_list:

        language_content_index = content_language==language
        language_topic_index = topic_language==language

        language_content_features = content_features[language_content_index]
        language_topic_features = topic_features[language_topic_index]

        if len(language_topic_features) > 0 and len(language_content_features) > 0:

            if language_topic_features.dim() == 1:
                language_topic_features = language_topic_features.unsqueeze(0)

            if language_content_features.dim() == 1:
                language_content_features = language_content_features.unsqueeze(0)

            with torch.no_grad():
                # features were L2-normalized in predict(), so this is cosine similarity
                sim = language_topic_features @ language_content_features.T

            if language in language2sim:
                language2sim[language] += sim
            else:
                language2sim[language] = sim


del topic_features, content_features
gc.collect()
torch.cuda.empty_cache()


num_models = len(config.transformer)

topic_list = []
content_list = []

print(f"Calculate mean of similiarities of {num_models} models per language")

print(f"\nSelect content per language using dynamic threshold of {config.margin}:")

sim_matrix = None

for language in language_list:

    language_content_ids = content_ids[content_language==language]
    language_topic_ids = topic_ids[topic_language==language]

    if language not in language2sim:
        # No content exists in this language, so no similarity matrix was
        # built. Emit the topics with an empty prediction instead of failing
        # with a KeyError, so that every topic still has a submission row.
        for topic in language_topic_ids:
            topic_list.append(topic)
            content_list.append("")
        print(f"{language.ljust(3)} - no content available - selected: 0")
        continue

    # Mean of Similiarities
    sim_matrix = language2sim[language]
    sim_matrix /= num_models

    selection_length = []

    for i in range(len(sim_matrix)):

        topic = language_topic_ids[i]

        sim = sim_matrix[i]

        # Dynamic threshold: keep everything within `margin` (relative) of the best match
        th_tmp = sim.max() - config.margin * sim.max()

        # `sim` is already 1-D. Squeezing would turn a one-element mask into
        # a 0-d boolean, which indexes the id array with an extra axis and
        # breaks the set() below.
        p_select = sim >= th_tmp

        # sorted -> the id order in the submission is the same on every run
        c_choice = sorted(set(language_content_ids[p_select.cpu().numpy()].tolist()))

        topic_list.append(topic)
        content_list.append(" ".join(c_choice))
        selection_length.append(len(c_choice))


    selection_length = np.array(selection_length).mean()

    print(f"{language.ljust(3)} - ({sim_matrix.shape[0]}x{sim_matrix.shape[1]}) - selected: {selection_length:.0f}")

del sim_matrix, language2sim
gc.collect()

Languages: ['bg', 'en', 'pt']

Calculate Similiarity for: /kaggle/input/transformer-offline-no-weights/LaBSE

Calculate Similiarity for: /kaggle/input/transformer-offline-no-weights/mcontriever-msmarco

Calculate Similiarity for: /kaggle/input/transformer-offline-no-weights/paraphrase-multilingual-mpnet-base-v2

Calculate Similiarity for: /kaggle/input/transformer-offline-no-weights/stsb-xlm-r-multilingual

Calculate Similiarity for: /kaggle/input/transformer-offline-no-weights/xlm-r-100langs-bert-base-nli-stsb-mean-tokens
Calculate mean of similiarities of 5 models per language

Select content per language using dynamic threshold of 0.18:
bg  - (2x6050) - selected: 4
en  - (2x65939) - selected: 1
pt  - (1x10435) - selected: 4


0

In [17]:
# One row per topic; content_ids is a space-separated string of content ids
submission_columns = {
    "topic_id": topic_list,
    "content_ids": content_list,
}
df_submission = pd.DataFrame(submission_columns)

df_submission.to_csv("submission.csv", index=False)

In [18]:
# Show the first rows of the submission only when verbose output is enabled.
if config.verbose:
    display(df_submission.head(10))

In [19]:
def f2_score(gt, pd):
    """Compute F2, precision and recall of a prediction against the ground truth.

    Args:
        gt: iterable of ground-truth items (duplicates are ignored).
        pd: iterable of predicted items (duplicates are ignored). Note that
            this parameter name shadows the pandas alias inside the function.

    Returns:
        ``(f2, precision, recall)``. Precision is 0.0 for an empty
        prediction, recall is 0.0 for an empty ground truth, and F2 is 0.0
        when ``4 * precision + recall`` is zero.
    """
    truth = set(gt)
    predicted = set(pd)
    hits = len(truth.intersection(predicted))

    precision = hits / len(predicted) if len(predicted) != 0 else 0.0
    recall = hits / len(truth) if len(truth) != 0 else 0.0

    denominator = 4 * precision + recall
    if denominator == 0.0:
        return 0.0, precision, recall

    return (5 * precision * recall) / denominator, precision, recall

In [20]:
if config.output_test:

    df_correlations = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")

    # Ground truth: topic id -> list of its content ids
    gt_dict = {
        topic_tmp: content_tmp.split(" ")
        for topic_tmp, content_tmp in zip(df_correlations["topic_id"].values,
                                          df_correlations["content_ids"].values)
    }

    scores = []
    precision_list = []
    recall_list = []

    # Score each predicted topic against its ground truth
    for t, predicted in zip(topic_list, content_list):
        f, precision, recall = f2_score(gt_dict[t], predicted.split(" "))

        scores.append(f)
        precision_list.append(precision)
        recall_list.append(recall)

    # Averages over all evaluated topics
    f2 = np.array(scores).mean()
    precision = np.array(precision_list).mean()
    recall = np.array(recall_list).mean()

    print("-"*80)
    print("Eval Score: {:.5f} - Precision: {:.5f} - Recall: {:.3f}".format(f2, precision, recall))
    print("-"*80)
    
    