In [None]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=false
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=false


In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (``CFG.<name>``)."""

    # ---- Paths -------------------------------------------------------------
    # Directory the competition CSV files are read from.
    INPUT = '/kaggle/input/learning-equality-curriculum-recommendations'
    # Checkpoint directory passed to AutoTokenizer / AutoConfig / AutoModel.
    MODEL = '/kaggle/input/fb4-stsb-xlm-r-multilingual-ft001/001_stsb-xlm-r-multilingual/stsb-xlm-r-multilingual_fold0_epochs20'
    #MODEL = '/kaggle/input/huggingface-roberta-variants/archive/pytorch-xlm-roberta-base/pytorch-xlm-roberta-base'

    # ---- Retrieval ---------------------------------------------------------
    # Number of nearest content items retrieved per topic.
    TOP_N = 50
    # NOTE(review): not read by any code visible in this notebook; prepare_input
    # applies its own truncation length instead.
    MAX_LEN = 512

    # ---- Data loading ------------------------------------------------------
    batch_size = 32
    num_workers = 4

    # ---- Cross-validation / reproducibility --------------------------------
    n_folds = 5
    SEED = 42

In [None]:
def cv_split(train, n_folds, seed):
    """Assign every row of ``train`` to one shuffled K-fold validation fold.

    Args:
        train: DataFrame to split. It is modified in place: an integer ``fold``
            column is added (or overwritten).
        n_folds: number of folds handed to ``KFold``.
        seed: ``random_state`` for the shuffled split.

    Returns:
        The same ``train`` object, now carrying the ``fold`` column.
    """
    kfold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    # KFold.split yields *positional* row numbers. Collect them in a positional
    # array instead of writing through label-based `.loc`, which only worked
    # when the frame happened to have a default 0..n-1 index.
    fold_of_row = np.full(len(train), -1, dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train)):
        fold_of_row[val_index] = num
    train['fold'] = fold_of_row
    display(train.groupby('fold').size())
    return train

In [None]:
# define some helper functions and classes to aid with data traversal


class Topic:
    """Lightweight handle on one row of the module-level ``topics_df``.

    Only the topic id is stored on the instance. Any other attribute
    (``title``, ``has_content``, ...) is fetched from ``topics_df`` on access,
    and linked content is read from ``correlations_df``.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the ``parent`` cell is missing."""
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """List of ancestors ordered from the direct parent up to the root."""
        ancestors = []
        parent = self.parent
        while parent is not None:
            ancestors.append(parent)
            parent = parent.parent
        return ancestors

    @property
    def siblings(self):
        """Other children of this topic's parent (empty list for a root topic)."""
        parent = self.parent
        if not parent:
            return []
        return [topic for topic in parent.children if topic != self]

    @property
    def content(self):
        """``ContentItem`` objects linked to this topic in ``correlations_df``.

        When the topic has no correlations row, an empty tuple is returned if
        ``has_content`` is truthy and an empty list otherwise (kept as in the
        original implementation).
        """
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        else:
            return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join the titles from the root down to this topic with ``separator``.

        Args:
            separator: string placed between consecutive titles.
            include_self: whether this topic's own title is part of the trail.
            include_root: whether the top-most title is part of the trail.
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            # The list is ordered leaf -> root, so the root is the last entry.
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Render this topic, its descendants and their content as a nested Markdown list."""
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone disables hashing; hash on the same key that
        # equality uses so topics can be stored in sets and used as dict keys.
        return hash(self.id)

    def __getattr__(self, name):
        # __getattr__ only runs after normal lookup failed. Protocol probes
        # (copy/pickle/hasattr look for names such as ``__setstate__``) and a
        # not-yet-assigned ``id`` must fail with AttributeError; otherwise
        # ``self.id`` below would re-enter this method and recurse forever.
        if name == "id" or name.startswith("_"):
            raise AttributeError(name)
        return topics_df.loc[self.id][name]

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Lightweight handle on one row of the module-level ``content_df``.

    Only the content id is stored on the instance; other attributes
    (``title``, ``kind``, ...) are fetched from ``content_df`` on access.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` string in ``correlations_df`` mentions this id."""
        # regex=False: the id is a literal token, not a pattern.
        linked = correlations_df[correlations_df.content_ids.str.contains(self.id, regex=False)].index
        return [Topic(topic_id) for topic_id in topics_df.loc[linked].index]

    def __getattr__(self, name):
        # __getattr__ only runs after normal lookup failed. Protocol probes
        # (copy/pickle/hasattr look for names such as ``__setstate__``) and a
        # not-yet-assigned ``id`` must fail with AttributeError; otherwise
        # ``self.id`` below would re-enter this method and recurse forever.
        if name == "id" or name.startswith("_"):
            raise AttributeError(name)
        return content_df.loc[self.id][name]

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone disables hashing; hash on the same key that
        # equality uses so items can be stored in sets and used as dict keys.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        """Return one breadcrumb string per linked topic, each ending in this item's title.

        Args:
            separator: string placed between consecutive titles.
            include_root: forwarded to ``Topic.get_breadcrumbs``.
        """
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                # The topic trail is empty, so the item title stands alone.
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs

In [None]:
def get_content(id):
    """Return the default breadcrumb trail (root title down to the topic's own title) for a topic id."""
    return Topic(id).get_breadcrumbs()

In [None]:
# ---- Raw inputs ----------------------------------------------------------
content = pd.read_csv(f'{CFG.INPUT}/content.csv')
# Correlations file that carries a `fold` column (used for the fold filter below).
#full_correlations = pd.read_csv('/kaggle/input/fb4-all-minilm-l6-v2-tf006/006_all-MiniLM-L6-v2/Step1CorrelationsFold5.csv')
full_correlations = pd.read_csv('/kaggle/input/fb4-stsb-xlm-r-multilingual-ft001/001_stsb-xlm-r-multilingual/Step1CorrelationsFold5.csv')
topics = pd.read_csv(f'{CFG.INPUT}/topics.csv')
sub_df = pd.read_csv(f'{CFG.INPUT}/sample_submission.csv')


# Copies indexed by their first column; these are the frames the Topic /
# ContentItem helper classes look rows up in.
topics_df = pd.read_csv(f'{CFG.INPUT}/topics.csv', index_col=0).fillna({"title": "", "description": ""})
content_df = pd.read_csv(f'{CFG.INPUT}/content.csv', index_col=0).fillna("")
correlations_df = pd.read_csv(f'{CFG.INPUT}/correlations.csv', index_col=0)

# Replace missing text with explicit placeholders. The result is assigned back
# rather than calling fillna(inplace=True) on a column selection: that chained
# form is deprecated and does not modify the frame under pandas copy-on-write.
topics['title'] = topics['title'].fillna("Title does not exist")
content['title'] = content['title'].fillna("Title does not exist")

topics['description'] = topics['description'].fillna("Description does not exist")
content['description'] = content['description'].fillna("Description does not exist")

content['text'] = content['text'].fillna("Text does not exist")

# Breadcrumb trail of every topic (root title down to the topic's own title).
topics["context"] = topics["id"].apply(get_content)
#topics["parent"].fillna(9999,inplace=True)
#topics["parent_description"] = topics["parent"].apply(get_parent_description)

# Fold every text field into `title`, which is the only column that gets embedded.
content["title"] = content["title"] + "<|=t_sep=|>" + content["description"] + "<|=t_sep=|>" + content["text"]
topics["title"] = topics["title"] + "<|=t_sep=|>" + topics["description"] + "<|=t_sep=|>" + topics["context"]

#kfolds = cv_split(correlations, 10, 42)
#full_correlations = kfolds.copy()
# Only the fold-0 rows are used for the retrieval metrics computed later.
correlations = full_correlations[full_correlations.fold == 0]

# Topics listed in the sample submission (built before `topics` is sorted/trimmed below).
sub_topics = topics.merge(sub_df, how = 'inner', left_on = 'id', right_on = 'topic_id')

display(topics.head(3))
display(content.head(3))
display(sub_topics.head(3))

# Sort by title length to make inference faster
topics['length'] = topics['title'].apply(lambda x: len(x))
content['length'] = content['title'].apply(lambda x: len(x))
topics.sort_values('length', inplace = True)
content.sort_values('length', inplace = True)
# Drop cols
topics.drop(['description', 'channel', 'category', 'level', 'parent', 'has_content', 'length'], axis = 1, inplace = True)
content.drop(['description', 'kind', 'text', 'copyright_holder', 'license', 'length'], axis = 1, inplace = True)
# Reset index so that row position and index label coincide after the sort
topics.reset_index(drop = True, inplace = True)
content.reset_index(drop = True, inplace = True)
display(topics.head(3))
display(content.head(3))
print(' ')
print('-' * 50)
print(f"topics.shape: {topics.shape}")
print(f"content.shape: {content.shape}")
print(f"correlations.shape: {correlations.shape}")

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,context
0,t_00004da3a1b2,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...
1,t_000095e03056,Unit 3.3 Enlargements and Similarities<|=t_sep...,Description does not exist,b3f329,aligned,2,en,t_aa32fb6252dc,False,Ghana JHS Curriculum (in progress) >> Junior H...
2,t_00068291e9a4,Entradas e saídas de uma função<|=t_sep=|>Ente...,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Khan Academy (Português (Brasil)) >> Matemátic...


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,93...","Suma 48,029+233,930 mediante el algoritmo está...",video,Text does not exist,es,,
1,c_000087304a9e,Trovare i fattori di un numero<|=t_sep=|>Sal t...,Sal trova i fattori di 120.\n\n,video,Text does not exist,it,,
2,c_0000ad142ddb,Sumar curvas de demanda<|=t_sep=|>Cómo añadir ...,Cómo añadir curvas de demanda\n\n,video,Text does not exist,es,,


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,context,topic_id,content_ids
0,t_00004da3a1b2,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,Entradas e saídas de uma função<|=t_sep=|>Ente...,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Khan Academy (Português (Brasil)) >> Matemátic...,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,Transcripts<|=t_sep=|>Description does not exi...,Description does not exist,6e3ba4,source,3,en,t_4054df11a74e,True,MIT Blossoms >> Engineering >> Flow Charts: Lo...,t_00069b63a70a,c_11a1dc0bfb99


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content,context
0,t_00004da3a1b2,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...
1,t_000095e03056,Unit 3.3 Enlargements and Similarities<|=t_sep...,Description does not exist,b3f329,aligned,2,en,t_aa32fb6252dc,False,Ghana JHS Curriculum (in progress) >> Junior H...
2,t_00068291e9a4,Entradas e saídas de uma função<|=t_sep=|>Ente...,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Khan Academy (Português (Brasil)) >> Matemátic...


Unnamed: 0,id,title,language
0,c_10785744108b,单人防守<|=t_sep=|>-<|=t_sep=|>Text does not exist,zh
1,c_48573e626269,黑洞<|=t_sep=|>黑洞\n\n<|=t_sep=|>Text does not exist,zh
2,c_02b55ea1f1df,Petrini<|=t_sep=|>\n<|=t_sep=|>Text does not e...,fr


 
--------------------------------------------------
topics.shape: (76972, 10)
content.shape: (154047, 3)
correlations.shape: (12304, 3)


In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer from the CFG.MODEL directory.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL)
#tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# Attach it to CFG so functions that only receive `cfg` can reach it as cfg.tokenizer.
CFG.tokenizer = tokenizer

In [None]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg, max_length = 64):
    """Tokenize one string and convert every returned field to a ``torch.long`` tensor.

    Args:
        text: the string to encode.
        cfg: object exposing the tokenizer as ``cfg.tokenizer``.
        max_length: truncation length passed to the tokenizer. Defaults to 64,
            the value that used to be hard-coded here.

    Returns:
        The mapping returned by ``encode_plus`` with each value replaced by a
        ``torch.long`` tensor. No padding is applied here.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        max_length = max_length,
        truncation = True,
        return_tensors = None,
        add_special_tokens = True,
    )
    # Only existing keys are reassigned, so updating while iterating is safe.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

In [None]:
# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset over the ``title`` column of a frame; items are tokenized on access."""

    def __init__(self, df, cfg):
        # Keep only the raw strings; tokenization happens per item.
        self.texts = df['title'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.texts[item], self.cfg)

In [None]:
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean of ``last_hidden_state`` over dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the hidden-state shape so masked positions contribute zero.
        # assumes last_hidden_state is (batch, seq, hidden) and attention_mask is (batch, seq) -- TODO confirm
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # Clamp the divisor so a fully masked row does not divide by zero.
        kept = mask.sum(1).clamp(min=1e-9)
        masked_total = (last_hidden_state * mask).sum(1)
        return masked_total / kept

In [None]:
# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Encoder loaded from ``cfg.MODEL`` followed by mean pooling; returns one vector per input row."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.MODEL)
        self.model = AutoModel.from_pretrained(cfg.MODEL, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Run the encoder on ``inputs`` and pool its ``last_hidden_state`` with the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        return self.feature(inputs)

In [None]:
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` without gradients and stack the outputs.

    Each batch's tensors are moved to ``device`` before the forward pass; the
    outputs are brought back to the CPU and concatenated into one NumPy array.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for inputs in tqdm(loader):
            for key in list(inputs.keys()):
                inputs[key] = inputs[key].to(device)
            chunks.append(model(inputs).cpu().numpy())
    return np.concatenate(chunks)

In [None]:
# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true, y_pred):
    """Mean per-row share of ground-truth ids that also occur in the prediction, rounded to 5 places.

    Both arguments hold whitespace-separated id strings, paired row by row.
    """
    shares = []
    for truth_str, pred_str in zip(y_true, y_pred):
        truth = set(truth_str.split())
        found = truth & set(pred_str.split())
        shares.append(len(found) / len(truth))
    return round(np.mean(np.array(shares)), 5)

In [None]:
# =========================================================================================
# F2 Score 
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of whitespace-separated ids, rounded to 4 places.

    Per row, F2 = tp / (tp + 0.2 * fp + 0.8 * fn), where tp, fp and fn are the
    sizes of the intersection, the predicted-only ids and the truth-only ids.

    The unused precision and recall intermediates were removed: they divided
    by zero (RuntimeWarning) on rows with an empty prediction and had no
    effect on the result.
    """
    true_sets = y_true.apply(lambda x: set(x.split()))
    pred_sets = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(truth & pred) for truth, pred in pairs])
    fp = np.array([len(pred - truth) for truth, pred in pairs])
    fn = np.array([len(truth - pred) for truth, pred in pairs])
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)

In [None]:
# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg, subTrain=False):
    """Expand each topic's predicted content ids into one (topic, content) row per prediction.

    Args:
        topics: frame with ``id``, ``language``, ``title``, ``predictions`` and
            ``content_ids`` columns (the last two are space-separated id
            strings), plus ``fold`` when ``subTrain`` is false.
        content: frame indexed by content id with ``title`` and ``language`` columns.
        cfg: unused; kept so existing calls keep working.
        subTrain: when true, no ``fold`` column is read or written.

    Returns:
        DataFrame with columns ``topics_ids``, ``content_ids``, ``title1``,
        ``title2``, ``topic_language``, ``content_language``, ``target`` (1 when
        the predicted id is among the topic's ``content_ids``, else 0) and,
        unless ``subTrain``, ``fold``.
    """
    with_folds = not subTrain
    # One list per output column; insertion order fixes the column order.
    columns = {
        'topics_ids': [],
        'content_ids': [],
        'title1': [],
        'title2': [],
        'topic_language': [],
        'content_language': [],
        'target': [],
    }
    if with_folds:
        columns['fold'] = []
    # Iterate over each topic
    for k in tqdm(range(len(topics))):
        row = topics.iloc[k]
        topics_id = row['id']
        topics_language = row['language']
        topics_title = row['title']
        # A set makes the per-prediction membership test O(1) instead of a list scan.
        ground_truth = set(row['content_ids'].split(' '))
        fold = row['fold'] if with_folds else None
        for pred in row['predictions'].split(' '):
            columns['topics_ids'].append(topics_id)
            columns['content_ids'].append(pred)
            columns['title1'].append(topics_title)
            columns['title2'].append(content.loc[pred, 'title'])
            columns['topic_language'].append(topics_language)
            columns['content_language'].append(content.loc[pred, 'language'])
            # If pred is in ground truth, 1 else 0
            columns['target'].append(1 if pred in ground_truth else 0)
            if with_folds:
                columns['fold'].append(fold)
    return pd.DataFrame(columns)

In [None]:
def get_indices(df, indices, content_frame=None):
    """Turn rows of neighbor positions into space-separated content-id strings.

    Args:
        df: frame that receives the result; it is modified in place.
        indices: one sequence of integer row positions per row of ``df``.
        content_frame: frame whose ``id`` column the positions refer to.
            Defaults to the notebook-level ``content`` frame, which is what
            this function always used before the parameter existed.

    Returns:
        ``df`` with a ``predictions`` column added (or overwritten).
    """
    if content_frame is None:
        content_frame = content
    # The positions come from a model fitted on the rows in order, so resolve
    # them by position. The previous label-based `.loc` lookup was only correct
    # while the frame had a default 0..n-1 index.
    id_by_position = content_frame['id'].values
    predictions = [
        ' '.join(id_by_position[np.asarray(row, dtype = int)])
        for row in tqdm(indices)
    ]
    df['predictions'] = predictions
    return df

In [None]:
# =========================================================================================
# Get neighbors
# =========================================================================================
def get_neighbors(topics, content, sub_topics, cfg):
    """Embed topics and content, then attach the nearest content ids to each topic.

    Args:
        topics: frame with a ``title`` column; receives a ``predictions`` column.
        content: frame with ``title`` and ``id`` columns that is searched.
        sub_topics: second topics frame handled exactly like ``topics``.
        cfg: settings object providing ``tokenizer``, ``batch_size``,
            ``num_workers``, ``MODEL`` and ``TOP_N``.

    Returns:
        ``(topics, content, sub_topics)``; the two topic frames carry the new
        ``predictions`` column written by ``get_indices``.

    The embeddings are passed to scikit-learn's ``NearestNeighbors`` as NumPy
    arrays. The earlier ``cp.array`` conversion referenced a module that is
    never imported (NameError) and is not needed by the scikit-learn estimator.
    """
    def _embed(frame, model):
        # Build a non-shuffled loader (padding to the longest item of each
        # batch) and return the model outputs for every row of `frame`.
        loader = DataLoader(
            uns_dataset(frame, cfg),
            batch_size = cfg.batch_size,
            shuffle = False,
            collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
            num_workers = cfg.num_workers,
            pin_memory = True,
            drop_last = False
        )
        return get_embeddings(loader, model, device)

    # Create unsupervised model to extract embeddings
    model = uns_model(cfg)
    model.to(device)
    topics_preds = _embed(topics, model)
    content_preds = _embed(content, model)
    sub_topics_preds = _embed(sub_topics, model)
    # The model is no longer needed once the embeddings exist; release it
    # before fitting the neighbor search.
    del model
    torch.cuda.empty_cache()
    gc.collect()
    # KNN model
    print(' ')
    print('Training KNN model...')
    # Never ask for more neighbors than there are content rows.
    n_neighbors = min(cfg.TOP_N, len(content_preds))
    neighbors_model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    neighbors_model.fit(content_preds)
    indices = neighbors_model.kneighbors(topics_preds, return_distance = False)
    sub_indices = neighbors_model.kneighbors(sub_topics_preds, return_distance = False)
    # NOTE(review): get_indices resolves positions against the notebook-level
    # `content` frame, not this function's `content` argument; in this
    # notebook they are the same object.
    topics = get_indices(topics, indices)
    sub_topics = get_indices(sub_topics, sub_indices)

    # Release memory
    del topics_preds, content_preds, sub_topics_preds, neighbors_model, indices, sub_indices
    gc.collect()
    return topics, content, sub_topics

In [None]:
def submit_func(df):
    """Collapse (topic, content) rows into one submission row per topic and write ``submission.csv``.

    Returns the written frame with columns ``topic_id`` and ``content_ids``
    (the topic's unique content ids joined by single spaces).
    """
    # One row per topic holding its unique content ids in order of first appearance.
    per_topic = df.groupby(['topics_ids'])['content_ids'].unique().reset_index()
    per_topic['content_ids'] = per_topic['content_ids'].map(' '.join)
    per_topic.columns = ['topic_id', 'content_ids']
    # Topics present in `df` but absent from the grouped result get an empty prediction string.
    seen = pd.Series(df['topics_ids'].unique())
    leftover = seen[~seen.isin(per_topic['topic_id'])]
    blanks = pd.DataFrame({'topic_id': leftover.values, 'content_ids': ""})
    test_r = pd.concat([per_topic, blanks], axis = 0, ignore_index = True)
    test_r.to_csv('submission.csv', index = False)
    return test_r

In [None]:
# Run nearest neighbors
topics, content, sub_topics = get_neighbors(topics, content, sub_topics, CFG)
# Merge with target and compute max positive score
topics_cp = topics.copy()
topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
print(f'Our max positive score is {pos_score}')
f_score = f2_score(topics['content_ids'], topics['predictions'])
print(f'Our f2_score is {f_score}')
# We can delete correlations
del correlations
gc.collect()
# Set id as index for content (build_training_set looks content rows up by id)
content.set_index('id', inplace = True)
# Build training set
topics_full = topics_cp.merge(full_correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
# For every fold except 0, append the ground-truth ids that retrieval missed.
# dict.fromkeys de-duplicates while keeping first-seen order, so the saved
# training set is identical from run to run (a set() has hash-seed-dependent order).
topics_full['predictions'] = topics_full.apply(
    lambda x: ' '.join(dict.fromkeys(x.predictions.split(' ') + x.content_ids.split(' ')))
    if x.fold != 0 else x.predictions, axis = 1)
train = build_training_set(topics_full, content, CFG, subTrain=False)
# Candidate pairs for the sample-submission topics (input to submit_func).
sub_train = build_training_set(sub_topics, content, CFG, subTrain=True)
print(f'Our training set has {len(train)} rows')
# Save train set to disk to train on another notebook
train.to_csv('xlm-roberta-base_train.csv', index = False)
train.head()

  0%|          | 0/4814 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 
Training KNN model...


Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times



  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

NameError: name 'train' is not defined

In [None]:
#sub = submit_func(sub_train)
#display(sub.head(3))

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_e1e8557d7c61 c_291d470a72e3 c_d35077f2c3d3 c...
1,t_00068291e9a4,c_85e7c0954384 c_5e152a94376f c_035baf9425e0 c...
2,t_00069b63a70a,c_6e4c29c0a363 c_71d6bae3f656 c_6c442539a2ea c...


In [None]:
#sub

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_e1e8557d7c61 c_291d470a72e3 c_d35077f2c3d3 c...
1,t_00068291e9a4,c_85e7c0954384 c_5e152a94376f c_035baf9425e0 c...
2,t_00069b63a70a,c_6e4c29c0a363 c_71d6bae3f656 c_6c442539a2ea c...
3,t_0006d41a73a8,c_27ead125b25e c_5c7376307896 c_5e375cf14c47 c...
4,t_4054df11a74e,c_f2d184a98231 c_3695c5dc1df6 c_8577c06c226a c...
