In [1]:
import os

# Threading-related environment variables are exported BEFORE torch is
# imported: the OpenMP runtime reads OMP_* when it is initialised, so values
# assigned after `import torch` may be ignored.
os.environ["TOKENIZERS_PARALLELISM"]="false"  # set before `transformers` is imported further down
os.environ["OMP_NUM_THREADS"]="2"
os.environ["OMP_SCHEDULE"]="STATIC"
os.environ["OMP_PROC_BIND"]="CLOSE"

import torch
import psutil

In [2]:
# psutil.cpu_count(logical=False) returns None when the number of physical
# cores cannot be determined; fall back to the logical count (and finally to 1)
# so that torch.set_num_threads always receives a positive integer.
real_core_count = psutil.cpu_count(logical=False) or os.cpu_count() or 1
print("Set Num-Threads to actually core count:", real_core_count)
torch.set_num_threads(real_core_count)

Set Num-Threads to actually core count: 2


In [3]:
import torch.nn.functional as F
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
import os
from dataclasses import dataclass
import numpy as np
import random
from tqdm import tqdm
import time
from datetime import timedelta
import pandas as pd
import gc
import re
import pickle
from collections import Counter

In [4]:
@dataclass
class Configuration:
    '''
    Settings for the inference / evaluation run of this notebook.

    Two encoders are ensembled; the suffix 1 / 2 on each name identifies
    which encoder a path belongs to.

    NOTE: only the annotated names below are dataclass fields. The
    content_* paths carry no type annotation, so they are plain class
    attributes and cannot be overridden through the constructor.
    '''
    # Encoder 1: tokenizer directory (passed to AutoTokenizer.from_pretrained)
    # and model checkpoint (passed to torch.jit.load).
    transformer1: str = "/kaggle/input/mpnet-tokenizer"  
    checkpoint1: str =  "/kaggle/input/destilled-jit-mpnet/mpnet_model_traced_fold_all.pth" 

    # Precomputed content ids / features / languages for encoder 1 (read with torch.load).
    content_ids1 =      "/kaggle/input/destilled-jit-mpnet/content_ids_fold_all.pt"
    content_features1 = "/kaggle/input/destilled-jit-mpnet/content_features_fold_all.pt"
    content_language1 = "/kaggle/input/destilled-jit-mpnet/content_language_fold_all.pt"

    # Encoder 2: same layout as encoder 1.
    transformer2: str = "/kaggle/input/labse-tokenizer"  
    checkpoint2: str =  "/kaggle/input/destilled-jit-labse/labse_model_traced_fold_all.pth" 

    # Precomputed content ids / features / languages for encoder 2 (read with torch.load).
    content_ids2 =      "/kaggle/input/destilled-jit-labse/content_ids_fold_all.pt"
    content_features2 = "/kaggle/input/destilled-jit-labse/content_features_fold_all.pt"
    content_language2 = "/kaggle/input/destilled-jit-labse/content_language_fold_all.pt"  

    # Predict 
    max_len: int = 96          # max len of tokenized topic and content
    batch_size: int = 8        # batch size (keep small for max performance/speed)
    margin: float = 0.16       # dynamic threshold margin: keep contents with sim >= max - margin * max

    # Testing
    verbose: bool = False      # show progress bar (also enables the shape / preview cells)
    speed_test: bool = False   # treat the last 1000 precomputed contents as unknown and re-encode them
    output_test: bool = False  # use 100 topics instead of sample submission and evaluate

In [5]:
config = Configuration() 

In [6]:
class EqualDatasetEval(Dataset):
    """Dataset over pre-tokenized samples with a padding-aware collate function.

    Each item of ``sorted_input`` is a tuple
    ``(token_id_tensor, name, language, length)`` as produced by
    ``sort_input``. Items are returned unchanged; batching and padding happen
    in ``smart_batching_collate``.
    """

    def __init__(self,
                 sorted_input,
                 pad_token_id,
                 max_len):
        """
        Args:
            sorted_input: sequence of ``(token_ids, name, language, length)`` tuples.
            pad_token_id: id written into padded positions of ``input_ids``.
            max_len: upper bound on the padded sequence length of a batch.
        """
        super().__init__()

        self.input = sorted_input
        self.pad_token_id = pad_token_id
        self.max_len = max_len

    def __getitem__(self, index):
        return self.input[index]

    def __len__(self):
        return len(self.input)

    def smart_batching_collate(self, batch):
        """Pad a batch only up to its longest sequence (capped at ``max_len``).

        Args:
            batch: list of ``(token_ids, name, language, length)`` tuples.

        Returns:
            ``(input_ids, mask, targets, language)`` where ``input_ids`` (long)
            and ``mask`` (float, 1 for real tokens and 0 for padding) have
            shape ``(len(batch), min(max(length), max_len))`` and ``targets`` /
            ``language`` are tuples in batch order.
        """
        sequences, targets, language, length = zip(*batch)

        bs = len(sequences)

        # Allocate at the maximum width once and cut afterwards, so no
        # reallocation is needed while filling the rows.
        mask = torch.zeros((bs, self.max_len), dtype=torch.float)
        input_ids = torch.full((bs, self.max_len), self.pad_token_id, dtype=torch.long)

        for row, (tokens, n_tokens) in enumerate(zip(sequences, length)):
            # Sequences longer than max_len are truncated; previously they
            # caused a shape-mismatch error on assignment.
            n_keep = min(n_tokens, self.max_len)
            mask[row, :n_keep] = 1
            input_ids[row, :n_keep] = tokens.reshape(-1)[:n_keep]

        # Cut to the longest sequence actually present in this batch.
        b_max_len = min(max(length), self.max_len)
        return input_ids[:, :b_max_len], mask[:, :b_max_len], targets, language




def sort_input(text, names, language, tokenizers, max_len):
    """Tokenize every text and return the samples sorted by token count.

    Args:
        text: list of input strings.
        names: ids belonging to ``text`` (same order).
        language: language codes belonging to ``text`` (same order).
        tokenizers: a single tokenizer exposing ``encode(...)`` that returns a
            tensor when called with ``return_tensors='pt'``.
        max_len: sequences are truncated to this many tokens.

    Returns:
        List of ``(token_ids, name, language, length)`` tuples sorted by
        ``length`` in descending order; ``token_ids`` is always a 1-D tensor.
        An empty input yields an empty list.
    """
    t0 = time.time()

    print('Tokenizing {:,} training samples...'.format(len(text)))

    # Report progress roughly ten times over the whole input.
    update_interval = len(text) // 10 + 1

    input_ids = []
    length = []
    for t in text:
        if ((len(input_ids) % update_interval) == 0):
            print('  Tokenized {:,} samples.'.format(len(input_ids)))

        input_id = tokenizers.encode(
            text=t,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding=False,
            return_tensors='pt'
        )

        # Flatten to 1-D. A bare squeeze() would collapse a single-token
        # encoding to a 0-d tensor, on which len() raises.
        input_id = input_id.reshape(-1)
        input_ids.append(input_id)
        length.append(len(input_id))

    print('DONE.')
    print('{:>10,} samples'.format(len(input_ids)))

    sorted_input = sorted(zip(input_ids, names, language, length), key=lambda x: x[-1], reverse=True)

    # Guard against empty input: there is no first / last sample to report.
    if sorted_input:
        print('Longest sample:', len(sorted_input[0][0]))
        print('Shortest sample:', len(sorted_input[-1][0]))

    t1 = time.time()
    print(f"Time: {t1-t0:.3f} sec")

    return sorted_input

In [7]:
def predict(config, model, dataloader):
    """Encode every batch of ``dataloader`` and return L2-normalized features.

    Args:
        config: object with a boolean ``verbose`` attribute (enables tqdm).
        model: callable ``model(input_ids, mask)`` returning one feature
            vector per sample; ``model.eval()`` is called first.
        dataloader: iterable of ``(input_ids, mask, ids, languages)`` batches.

    Returns:
        ``(features, ids, languages)``: the concatenated feature tensor plus
        numpy arrays of the ids and languages in iteration order.
    """
    model.eval()

    bar = tqdm(dataloader, total=len(dataloader)) if config.verbose else dataloader

    feature_chunks, collected_ids, collected_languages = [], [], []

    t0 = time.time()
    with torch.no_grad():
        for input_ids, attention_mask, batch_ids, batch_languages in bar:
            collected_ids.extend(batch_ids)
            collected_languages.extend(batch_languages)

            # Unit-length embeddings: a dot product is then a cosine similarity.
            embeddings = F.normalize(model(input_ids, attention_mask), dim=-1)
            feature_chunks.append(embeddings)

    if config.verbose:
        bar.close()

    features = torch.cat(feature_chunks, dim=0)
    ids_array = np.array(collected_ids)
    language_array = np.array(collected_languages)

    t1 = time.time()

    print(f"Time for feature extraction: {t1-t0:.3f} sec")

    return features, ids_array, language_array

In [8]:
def clean(x):
    """Return ``str(x)`` with surrounding whitespace removed.

    Strings of length 0 or 1 are returned as they are (a lone space is kept).
    """
    text = str(x)
    if len(text) <= 1:
        return text
    # strip() without arguments already removes tabs and newlines as well.
    return text.strip()
    
def clean_and_cut(x):
    """Strip ``str(x)``, drop URLs and keep at most 32 words / 256 characters.

    Strings of length 0 or 1 are returned as they are.
    """
    text = str(x)
    if len(text) <= 1:
        return text

    # NOTE(review): replace("", "") is a no-op as written; the first argument
    # was possibly a character lost during export - confirm.
    text = text.strip().replace("", "")

    # Remove anything that looks like a link.
    text = re.sub(r'http\S+', '', text)

    # Word limit first, then a hard character limit.
    first_words = text.split(" ")[:32]
    return " ".join(first_words)[:256]

In [9]:
# Topics to predict and the full topic table (missing titles / descriptions become "").
df_sample_submission = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv")

df_topics = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/topics.csv")
df_topics = df_topics.fillna({"title": "", "description": ""})

# topic id -> language code; topics missing from topics.csv are labelled "unk".
topic2language = dict(zip(df_topics["id"], df_topics["language"]))
df_sample_submission["language"] = df_sample_submission["topic_id"].map(
    lambda topic_id: topic2language.get(topic_id, "unk")
)

# Replace line breaks by single spaces (Windows line endings first).
for newline_pattern in (r'\r\n', r'\n'):
    df_topics = df_topics.replace(to_replace=newline_pattern, value=' ', regex=True)

for text_column in ("title", "description"):
    df_topics[text_column] = df_topics[text_column].map(clean)

In [10]:
class Topic:
    """Lightweight view on one row of the module-level ``df_topics`` frame.

    ``df_topics`` must be indexed by topic id and provide at least the
    columns ``parent`` and ``title``. Any other column can be read as an
    attribute (e.g. ``topic.description``).
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic`` or ``None`` for a root topic."""
        parent_id = df_topics.loc[self.id, "parent"]
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """All ancestors, ordered from the direct parent up to the root."""
        chain = []
        node = self.parent
        while node is not None:
            chain.append(node)
            node = node.parent
        return chain

    @property
    def siblings(self):
        """Other children of this topic's parent (empty for a root topic)."""
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join the titles along the path from this topic up to the root.

        The order is leaf first, root last (the titles are not reversed).

        Args:
            separator: string placed between titles.
            include_self: start with this topic's own title.
            include_root: keep the last (root-most) entry.
        """
        path = self.ancestors
        if include_self:
            path = [self] + path
        if not include_root:
            path = path[:-1]
        return separator.join([node.title for node in path])

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in df_topics[df_topics["parent"] == self.id].index]

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable; hash consistently
        # with equality so topics can be used in sets and as dict keys.
        return hash(self.id)

    def __getattr__(self, name):
        """Resolve unknown attributes as columns of this topic's row.

        Raises:
            AttributeError: if ``name`` is not a column, so that ``hasattr``,
                ``getattr(..., default)``, ``copy`` and ``pickle`` behave
                normally.
            KeyError: if the topic id itself is not present in ``df_topics``.
        """
        # `id` missing (object not initialised yet) or a special-method probe:
        # never touch the frame, otherwise `self.id` would recurse forever.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        row = df_topics.loc[self.id]
        try:
            return row[name]
        except KeyError as exc:
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}'"
            ) from exc

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"

In [11]:
# Choose the topics to predict: the first 100 topics of correlations.csv when
# evaluating locally, otherwise every topic of the sample submission.
if config.output_test:
    df_correlations = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")
    topic_test = df_correlations["topic_id"].values[:100]
    df_correlations["language"] = df_correlations["topic_id"].map(lambda x : topic2language.get(x, "unk"))
    language_t = df_correlations["language"].values[:100].tolist()
else:
    topic_test = df_sample_submission["topic_id"].values 
    language_t = df_sample_submission["language"].values.tolist()

# Topic looks rows up by id, so the frame has to be indexed by "id".
# Guarded so that re-running this cell does not fail once "id" is the index.
if df_topics.index.name != "id":
    df_topics = df_topics.set_index("id")

# topic id -> model input string: breadcrumb titles (leaf first) followed by
# the topic description, all joined with " # ".
topic2string = {}

if config.verbose:
    bar = tqdm(topic_test, total=len(topic_test))
else:
    bar = topic_test

for t in bar:
    to = Topic(t)
    string = "{} # {}".format(to.get_breadcrumbs(separator=" # ", include_self=True), to.description)
    topic2string[t] = string

In [12]:
# Topic ids and their input strings, kept in the same order.
ids_t = topic_test.tolist()
text_t = [topic2string[topic_id] for topic_id in ids_t]

In [13]:
# Content table indexed by content id; missing text fields become "".
df_content = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/content.csv")
df_content = df_content.fillna({"title": "", "description": "", "text": ""}).set_index("id")

In [14]:
# Precomputed content ids / languages / features of both encoders.
# NOTE: torch.load unpickles its input - only load files from a trusted source.
content_ids1 = np.array(torch.load(config.content_ids1))
content_language1 = np.array(torch.load(config.content_language1))
content_features1 = torch.load(config.content_features1)

content_ids2 = np.array(torch.load(config.content_ids2))
content_language2 = np.array(torch.load(config.content_language2))
content_features2 = torch.load(config.content_features2)

# Bring the encoder-2 arrays into the row order of encoder 1, so that row i of
# both feature matrices describes the same content.
content_id_to_index = dict(zip(content_ids2, np.arange(len(content_ids2))))  
reorder = np.array([content_id_to_index[content_id] for content_id in content_ids1])

content_ids2 = content_ids2[reorder]
content_language2 = content_language2[reorder]
content_features2 = content_features2[reorder]

# Sanity checks: ids and languages have to match position by position.
ids_aligned = (content_ids1 == content_ids2).sum() == len(content_ids1)
print(
    "Sucess: Same order of Content1 and Content2"
    if ids_aligned
    else "Error: Content1 and Conten2 have not the same order!!!"
)

languages_aligned = (content_language1 == content_language2).sum() == len(content_language1)
print(
    "Sucess: Same order of Content Language 1 and Content Language 2"
    if languages_aligned
    else "Error: Content Language 1 and Content Language 2 have not the same order!!!"
)

content_ids = content_ids1
content_language = content_language1

Sucess: Same order of Content1 and Content2
Sucess: Same order of Content Language 1 and Content Language 2


In [15]:
# In speed-test mode the last 1000 precomputed contents are treated as unknown,
# so they have to be encoded again.
known_content = set(content_ids[:-1000]) if config.speed_test else set(content_ids)

df_content["known"] = df_content.index.map(lambda x : x in known_content)

# Contents without precomputed features (copied, it is modified later) and
# contents with precomputed features.
is_known = df_content["known"]
unknown_content_df = df_content[is_known == False].copy()
known_content_df = df_content[is_known == True]

print("Lenght all content:    ", len(df_content))
print("Lenght known content:  ", len(known_content_df))
print("Lenght unknown content:", len(unknown_content_df))

known_content_ids =  set(known_content_df.index)

# Boolean mask over the precomputed arrays: True where the content is known.
selector = np.array([content_id in known_content_ids for content_id in content_ids])


Lenght all content:     154047
Lenght known content:   154047
Lenght unknown content: 0


In [16]:
# Restrict every precomputed array to the known contents (the "_k" suffix).
content_ids_k, content_language_k = content_ids[selector], content_language[selector]
content_features1_k, content_features2_k = content_features1[selector], content_features2[selector]

print("Length of known content features:")
print(f"Before selection: {len(content_ids)}")
print(f"After selection:  {len(content_ids_k)}")

Length of known content features:
Before selection: 154047
After selection:  154047


In [17]:
# New content: build the model input for contents without precomputed features.
for newline_pattern in (r'\r\n', r'\n'):
    unknown_content_df = unknown_content_df.replace(to_replace=newline_pattern, value=' ', regex=True)

for column, cleaner in (("title", clean), ("description", clean), ("text", clean_and_cut)):
    unknown_content_df[column] = unknown_content_df[column].map(cleaner)

# Keep at most the first 32 space-separated words of the text.
unknown_content_df["text_cut"] = unknown_content_df["text"].map(lambda x : " ".join(x.split(" ")[:32]))

# Input layout: "<title> # <description> # <text_cut>".
title_part = unknown_content_df["title"]
description_part = unknown_content_df["description"]
text_part = unknown_content_df["text_cut"]
unknown_content_df["input"] = title_part + " # " + description_part + " # " + text_part

text_c = unknown_content_df["input"].values.tolist()
ids_c = unknown_content_df.index.tolist()
language_c = unknown_content_df["language"].values.tolist()

print(f"Lenght of unknown content to process: {len(text_c)}")

Lenght of unknown content to process: 0


In [18]:
# One tokenizer per encoder.
tokenizer1 = AutoTokenizer.from_pretrained(config.transformer1)
tokenizer2 = AutoTokenizer.from_pretrained(config.transformer2)

# Topics are always tokenized, once per tokenizer.
topic_inputs = (text_t, ids_t, language_t)
sorted_topics1 = sort_input(*topic_inputs, tokenizer1, config.max_len)
sorted_topics2 = sort_input(*topic_inputs, tokenizer2, config.max_len)

# Contents only when there is something without precomputed features.
if len(text_c) > 0:
    content_inputs = (text_c, ids_c, language_c)
    sorted_content1 = sort_input(*content_inputs, tokenizer1, config.max_len)
    sorted_content2 = sort_input(*content_inputs, tokenizer2, config.max_len)

Tokenizing 5 training samples...
  Tokenized 0 samples.
  Tokenized 1 samples.
  Tokenized 2 samples.
  Tokenized 3 samples.
  Tokenized 4 samples.
DONE.
         5 samples
Longest sample: 96
Shortest sample: 25
Time: 0.010 sec
Tokenizing 5 training samples...
  Tokenized 0 samples.
  Tokenized 1 samples.
  Tokenized 2 samples.
  Tokenized 3 samples.
  Tokenized 4 samples.
DONE.
         5 samples
Longest sample: 96
Shortest sample: 20
Time: 0.004 sec


In [19]:
# Encoder 1: load the checkpoint with torch.jit.load, encode all topics and,
# if there are any, the contents without precomputed features.
print("\n{}[Model: {}]{}".format(20*"-", config.transformer1, 20*"-"))
model = torch.jit.load(config.checkpoint1)
model.eval()
              
val_dataset_topic = EqualDatasetEval(sorted_topics1,
                                     pad_token_id = tokenizer1.pad_token_id,
                                     max_len=config.max_len)


# shuffle=False keeps the length-sorted order produced by sort_input.
val_loader_topic = DataLoader(dataset=val_dataset_topic, 
                              batch_size=config.batch_size, 
                              shuffle=False,
                              collate_fn=val_dataset_topic.smart_batching_collate
                              )

topic_features1, topic_ids1, topic_language1 = predict(config, model, val_loader_topic)

    
# sorted_content1 only exists when unknown content was tokenized, hence the
# same len(text_c) > 0 condition as in the tokenization cell.
if len(text_c) > 0:

    val_dataset_content = EqualDatasetEval(sorted_content1,
                                           pad_token_id = tokenizer1.pad_token_id,
                                           max_len=config.max_len)


    val_loader_content = DataLoader(dataset=val_dataset_content, 
                                    batch_size=config.batch_size, 
                                    shuffle=False,   
                                    collate_fn=val_dataset_content.smart_batching_collate)

    content_features1_uk, content_ids1_uk, content_language1_uk = predict(config, model, val_loader_content)  

    # Known (precomputed) contents first, freshly encoded ones appended after them.
    content_ids1 = np.concatenate([content_ids_k, content_ids1_uk])
    content_language1 = np.concatenate([content_language_k, content_language1_uk])
    content_features1 = torch.cat([content_features1_k, content_features1_uk], dim=0)

# Drop the references to the model and the topic loader before the second
# model is loaded. gc.collect() is the last expression of the cell, so its
# return value is displayed as the cell output.
del model, val_dataset_topic, val_loader_topic
gc.collect()        


--------------------[Model: /kaggle/input/mpnet-tokenizer]--------------------
Time for feature extraction: 0.688 sec


119

In [20]:
# Debug output: shapes of the encoder-1 topic and content feature matrices.
if config.verbose:
    for features in (topic_features1, content_features1):
        print(features.shape)

In [21]:
# Encoder 2: same procedure as for encoder 1, followed by a re-alignment of
# the results to the encoder-1 order.
print("\n{}[Model: {}]{}".format(20*"-", config.transformer2, 20*"-"))
model = torch.jit.load(config.checkpoint2)
model.eval()
              
val_dataset_topic = EqualDatasetEval(sorted_topics2,
                                     pad_token_id = tokenizer2.pad_token_id,
                                     max_len=config.max_len)


# shuffle=False keeps the length-sorted order produced by sort_input.
val_loader_topic = DataLoader(dataset=val_dataset_topic, 
                              batch_size=config.batch_size, 
                              shuffle=False,
                              collate_fn=val_dataset_topic.smart_batching_collate
                              )

topic_features2, topic_ids2, topic_language2 = predict(config, model, val_loader_topic)


# The inputs were sorted by token count per tokenizer, so the two encoders can
# return the topics in different orders. If they differ, permute the encoder-2
# results into the encoder-1 order.
if (topic_ids1 == topic_ids2).sum() != len(topic_ids2):
    
    topic_id_to_index = dict(zip(topic_ids2, np.arange(len(topic_ids2))))  

    reorder = []

    for idx in topic_ids1:
        reorder.append(topic_id_to_index[idx])

    reorder = np.array(reorder)

    topic_ids2 = topic_ids2[reorder]
    topic_language2 = topic_language2[reorder]
    topic_features2 = topic_features2[reorder]
    

    
# sorted_content2 only exists when unknown content was tokenized.
if len(text_c) > 0:

    val_dataset_content = EqualDatasetEval(sorted_content2,
                                           pad_token_id = tokenizer2.pad_token_id,
                                           max_len=config.max_len)


    val_loader_content = DataLoader(dataset=val_dataset_content, 
                                    batch_size=config.batch_size, 
                                    shuffle=False,   
                                    collate_fn=val_dataset_content.smart_batching_collate)

    content_features2_uk, content_ids2_uk, content_language2_uk = predict(config, model, val_loader_content)  
    
    

    # Same re-alignment as for the topics, applied to the newly encoded contents.
    if (content_ids1_uk == content_ids2_uk).sum() != len(content_ids2_uk):
    
        content_id_to_index = dict(zip(content_ids2_uk, np.arange(len(content_ids2_uk))))  

        reorder = []

        for idx in content_ids1_uk:
            reorder.append(content_id_to_index[idx])

        reorder = np.array(reorder)

        content_ids2_uk = content_ids2_uk[reorder]
        content_language2_uk = content_language2_uk[reorder]
        content_features2_uk = content_features2_uk[reorder]
    
    
    # Known (precomputed) contents first, freshly encoded ones appended after them.
    content_ids2 = np.concatenate([content_ids_k, content_ids2_uk])
    content_language2 = np.concatenate([content_language_k, content_language2_uk])
    content_features2 = torch.cat([content_features2_k, content_features2_uk], dim=0)

# Drop the references to the model and the topic loader. gc.collect() is the
# last expression of the cell, so its return value is displayed as the output.
del model, val_dataset_topic, val_loader_topic
gc.collect()  


--------------------[Model: /kaggle/input/labse-tokenizer]--------------------
Time for feature extraction: 0.456 sec


0

In [22]:
# Debug output: shapes of the encoder-2 topic and content feature matrices.
if config.verbose:
    for features in (topic_features2, content_features2):
        print(features.shape)

In [23]:
# Both encoders must describe the same topic in the same row, because their
# similarity matrices are averaged later. The success message now names the
# topics (it previously said "Content1 and Content2").
if (topic_ids1 == topic_ids2).sum() == len(topic_ids1):
    print("Sucess: Same order of Topic1 and Topic2")
else:
    print("Error: Topic1 and Topic2 have not the same order!!!")
    
    
if (topic_language1 == topic_language2).sum() == len(topic_language1):
    print("Sucess: Same order of Topic Language 1 and Topic Language 2")
else:
    print("Error: Topic Language 1 and Topic Language 2 have not the same order!!!")
    
# From here on a single id / language array describes the topics of both encoders.
topic_ids = topic_ids1
topic_language = topic_language1

Sucess: Same order of Content1 and Content2
Sucess: Same order of Topic Language 1 and Topic Language 2


In [24]:
# Repeat the content alignment check: the arrays may have been extended with
# newly encoded contents in the inference cells.
content_ids_match = (content_ids1 == content_ids2).sum() == len(content_ids1)
print(
    "Sucess: Same order of Content1 and Content2"
    if content_ids_match
    else "Error: Content1 and Conten2 have not the same order!!!"
)

content_languages_match = (content_language1 == content_language2).sum() == len(content_language1)
print(
    "Sucess: Same order of Content Language 1 and Content Language 2"
    if content_languages_match
    else "Error: Content Language 1 and Content Language 2 have not the same order!!!"
)

# From here on a single id / language array describes the contents of both encoders.
content_ids = content_ids1
content_language = content_language1

Sucess: Same order of Content1 and Content2
Sucess: Same order of Content Language 1 and Content Language 2


In [25]:
# Languages of the topics to predict, most frequent first.
language_count = Counter(topic_language).most_common()
language_list = [entry[0] for entry in language_count]

print("Languages:", language_list)

Languages: ['en', 'bg', 'pt']


In [26]:
# Per language: average the cosine-similarity matrices of both encoders and,
# for every topic, keep all contents whose similarity lies within a relative
# margin of that topic's best match.
topic_list = []
content_list = []


print("Predict:")
for language in language_list:

    # Boolean masks selecting the topics / contents of this language
    topic_index_language = topic_language==language
    content_index_language = content_language==language

    # Ids
    topic_ids_language = topic_ids[topic_index_language]
    content_ids_language = content_ids[content_index_language]

    # NOTE(review): topics of a language without any content are skipped and
    # therefore get no row in the submission - confirm this is intended.
    if len(topic_ids_language) == 0 or len(content_ids_language) == 0:
        continue

    # Boolean-mask indexing keeps the feature matrices 2-D even when a single
    # row is selected, so no unsqueeze is required before the matmul.
    topic_features1_language = topic_features1[topic_index_language]
    topic_features2_language = topic_features2[topic_index_language]
    content_features1_language = content_features1[content_index_language]
    content_features2_language = content_features2[content_index_language]

    # Features are L2-normalized, so the products are cosine similarities.
    sim_matrix1 = topic_features1_language @ content_features1_language.T
    sim_matrix2 = topic_features2_language @ content_features2_language.T
    sim_matrix = (sim_matrix1 + sim_matrix2) / 2

    selection_lengths = []

    for i in range(len(sim_matrix)):

        topic = topic_ids_language[i]
        sim = sim_matrix[i]

        # Dynamic threshold relative to the best match. abs() keeps the
        # threshold at or below the maximum when the best similarity is
        # negative, so the top match is always selected.
        best = sim.max()
        th_tmp = best - config.margin * best.abs()

        # `sim` is 1-D. It must not be squeezed: with a single content the
        # mask would become 0-d, and a 0-d boolean index adds an axis instead
        # of selecting elements.
        p_select = sim >= th_tmp

        c_choice = content_ids_language[p_select.numpy()]

        topic_list.append(topic)
        content_list.append(" ".join(list(c_choice)))
        selection_lengths.append(len(c_choice))

    # Average number of contents selected per topic (reporting only).
    selection_length = np.array(selection_lengths).mean()

    print(f"{language.ljust(3)} - ({sim_matrix.shape[0]}x{sim_matrix.shape[1]}) - selected: {selection_length:.0f}")

Predict:
en  - (2x65939) - selected: 1
bg  - (2x6050) - selected: 4
pt  - (1x10435) - selected: 5


In [27]:
# One row per predicted topic; content ids are space-separated.
submission_columns = {"topic_id": topic_list, "content_ids": content_list}
df_submission = pd.DataFrame(submission_columns)

df_submission.to_csv("submission.csv", index=False)

In [28]:
# Preview the first rows of the submission; only shown when config.verbose is True.
if config.verbose:
    display(df_submission.head(10))

In [29]:
def f2_score(gt, pd):
    """Compute F2, precision and recall of a predicted id collection.

    Args:
        gt: iterable of ground-truth ids.
        pd: iterable of predicted ids (shadows the pandas alias inside this
            function only).

    Returns:
        ``(f2, precision, recall)``. A ratio with an empty denominator is
        reported as 0.0.
    """
    gt = set(gt)
    pd = set(pd)

    hits = len(gt.intersection(pd))

    precision = hits / len(pd) if len(pd) != 0 else 0.0
    recall = hits / len(gt) if len(gt) != 0 else 0.0

    # F-beta with beta = 2: recall is weighted higher than precision.
    denominator = 4 * precision + recall
    if denominator == 0.0:
        return 0.0, precision, recall

    f2 = (5 * precision * recall) / denominator
    return f2, precision, recall

In [30]:
# Local evaluation against correlations.csv; only runs in output_test mode.
if config.output_test:

    df_correlations = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")

    topics = df_correlations["topic_id"].values
    content = df_correlations["content_ids"].values

    # topic id -> list of ground-truth content ids
    gt_dict = {topic_id: content_string.split(" ") for topic_id, content_string in zip(topics, content)}

    scores, precision_list, recall_list = [], [], []

    for t, predicted_string in zip(topic_list, content_list):
        f, precision, recall = f2_score(gt_dict[t], predicted_string.split(" "))

        scores.append(f)
        precision_list.append(precision)
        recall_list.append(recall)

    # Averages over all predicted topics.
    f2 = np.array(scores).mean()
    precision = np.array(precision_list).mean()
    recall = np.array(recall_list).mean()

    separator_line = "-"*80
    print(separator_line)
    print("Eval Score: {:.5f} - Precision: {:.5f} - Recall: {:.3f}".format(f2, precision, recall))
    print(separator_line)
    