
<h3> Some materials needed for this step:</h3>
    
* Finetuned model from Step 1

* Kfold dataset from Step 1
    
    
<h3> Note:</h3>
    
* Fold 0 is used for validation

* After generating the Top-K candidates for training in the next step, we need to use the correlations file to add more label-1 (positive) pairs to the training set: even though stage 1 reaches a very high max positive score for the Top-K, some topics may still have no label-1 candidate
    
**Reference**: https://www.kaggle.com/code/ragnar123/lecr-unsupervised-train-set-public 

In [1]:
# =========================================================================================
# Libraries
# =========================================================================================
import numpy as np
import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
#import cupy as cp
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer
from unidecode import unidecode
import re
from annoy import AnnoyIndex
%env TOKENIZERS_PARALLELISM=false
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# =========================================================================================
# Configurations
# =========================================================================================
class CFG1:
    """Configuration for the 18-epoch stage-1 checkpoint.

    Not part of ``CFG_list`` as defined in this cell, so it is not used by the
    runs below.
    NOTE(review): unlike ``CFG9`` this class defines no ``max_length``, while
    ``prepare_input`` reads ``cfg.max_length`` -- add one before using it.
    """
    # Number of DataLoader worker processes (passed as `num_workers`).
    num_workers = 24
    # Path handed to AutoTokenizer / AutoConfig / AutoModel `from_pretrained`.
    model = 'model/stage-1-paraphrase-multilingual-mpnet-base-v2-epochs18'
    # Evaluated when the class body runs, i.e. the tokenizer is loaded as soon
    # as this cell executes.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # DataLoader batch size used when extracting embeddings.
    batch_size = 128
    # Only read in this notebook to build an output file name (`train_top{top_n}_...`).
    top_n = 50
    # Not read by any code visible in this notebook.
    seed = 42
    # Not read by any code visible in this notebook; note CFG9 stores a plain
    # string here while this class stores a list.
    used_columns = ['title']
    
class CFG9:
    """Configuration for the 1-epoch tuned checkpoint; the only entry in ``CFG_list``."""
    # Number of DataLoader worker processes (passed as `num_workers`).
    num_workers = 24
    # Path handed to AutoTokenizer / AutoConfig / AutoModel `from_pretrained`.
    model = "model/paraphrase-multilingual-mpnet-base-v2-epochs-1-tuned/"
    # Evaluated when the class body runs, i.e. the tokenizer is loaded as soon
    # as this cell executes.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # DataLoader batch size used when extracting embeddings.
    batch_size = 128
    # Only read in this notebook to build an output file name (`train_top{top_n}_...`).
    top_n = 50
    # Not read by any code visible in this notebook.
    seed = 42
    # Not read by any code visible in this notebook (the dataset always reads 'title').
    used_columns = 'title'
    # Truncation length (in tokens) passed to the tokenizer in `prepare_input`.
    max_length = 50
    
# Configurations to evaluate. When more than one is listed,
# `max_recall_calculator` concatenates their embeddings along the feature axis.
CFG_list = [CFG9]
    
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load topics, content and the fold-0 correlations and build the encoder text.

    Reads ``data/topics.csv``, ``data/content.csv`` and
    ``data/kfold_correlations.csv`` relative to the working directory.

    Args:
        cfg: configuration object. Currently unused; kept so existing callers
            keep working.

    Returns:
        tuple ``(topics, content, correlations)``:
            * ``topics``: topics of fold 0 joined with their correlation row;
              ``title`` is replaced by
              ``"<title>. Language_<language>. Description: <description>"``.
            * ``content``: all content rows with ``title`` built the same way.
            * ``correlations``: the fold-0 rows of the correlations file.
        ``topics`` and ``content`` are sorted by the length of the built title
        and carry a fresh 0..n-1 index, so a row position can be used as an
        item id by the nearest-neighbour index.
    """
    topics = pd.read_csv('data/topics.csv')
    content = pd.read_csv('data/content.csv')
    correlations = pd.read_csv('data/kfold_correlations.csv')

    # Fold 0 is the validation fold: keep only its correlations and topics.
    correlations = correlations[correlations.fold == 0]
    topics = topics.merge(correlations, how = 'inner', left_on = 'id', right_on = 'topic_id')

    # Fill missing text. The result is assigned back instead of calling
    # `fillna(..., inplace=True)` on a column selection: that form is chained
    # assignment, is deprecated, and does not update the frame when pandas
    # Copy-on-Write is active.
    for frame in (topics, content):
        frame['title'] = frame['title'].fillna("no title")
        frame['description'] = frame['description'].fillna("no description")
    content['text'] = content['text'].fillna("")
    content['license'] = content['license'].fillna("")

    # Text that is embedded: title + language tag + description.
    topics['title'] = topics['title'] + '. Language_' + topics['language'] + ". Description: " + topics['description']
    content['title'] = content['title'] + '. Language_' + content['language'] + ". Description: " + content['description']

    # Sort by text length so each batch holds similarly sized rows (less
    # padding, faster inference).
    topics['length'] = topics['title'].astype(str).apply(len)
    content['length'] = content['title'].astype(str).apply(len)
    topics = topics.sort_values('length')
    content = content.sort_values('length')

    # Drop the columns that are no longer needed and renumber the rows.
    topics = topics.drop(
        ['description', 'channel', 'category', 'level', 'parent', 'has_content', 'length'],
        axis = 1,
    ).reset_index(drop = True)
    content = content.drop(
        ['description', 'kind', 'text', 'copyright_holder', 'license', 'length'],
        axis = 1,
    ).reset_index(drop = True)

    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, content, correlations

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one string and convert every returned field to a ``torch.long`` tensor.

    Args:
        text: the string to encode.
        cfg: object exposing ``tokenizer`` (with ``encode_plus``) and ``max_length``.

    Returns:
        The object returned by ``cfg.tokenizer.encode_plus`` with each value
        replaced by a 1-D ``torch.long`` tensor. No padding is applied here.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        truncation = True,
        max_length = cfg.max_length,
        return_tensors = None,
    )
    # Convert in place so the tokenizer's own container type is preserved.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype = torch.long)
    return encoded

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a frame one row at a time."""
    def __init__(self, df, cfg):
        # Only the raw title values are kept; tokenization happens lazily.
        self.texts = df['title'].values
        self.cfg = cfg
    def __len__(self):
        n_texts = len(self.texts)
        return n_texts
    def __getitem__(self, item):
        # Returns the un-padded tokenizer output for row `item`.
        return prepare_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, ignoring masked-out positions."""
    def __init__(self):
        super().__init__()
    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight each token vector.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # Clamp the token count so a fully masked row divides by 1e-9 rather than 0.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return token_sum / token_count

# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Transformer encoder followed by mean pooling; returns one vector per input row."""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Config and weights are both loaded from the path in `cfg.model`.
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        self.pool = MeanPooling()
    def feature(self, inputs):
        # `inputs` is unpacked as keyword arguments for the backbone and must
        # contain 'attention_mask', which is reused for pooling.
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])
    def forward(self, inputs):
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run `model` over every batch of `loader` and stack the outputs into one array.

    Args:
        loader: iterable of dict-like batches whose values are tensors.
        model: callable module; it is switched to eval mode and called with the batch.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        ``numpy.ndarray`` with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    chunks = []
    for batch in tqdm(loader):
        # Move the batch to the target device in place.
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        with torch.no_grad():
            output = model(batch)
        chunks.append(output.to('cpu').numpy())
    return np.concatenate(chunks)

# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true, y_pred):
    """Mean share of each topic's true content ids that appear in its predictions.

    Both arguments are pandas Series of whitespace-separated id strings,
    aligned row by row. The result is rounded to 5 decimals.
    """
    truth_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    per_topic = np.array([
        len(truth & pred) / len(truth)
        for truth, pred in zip(truth_sets, pred_sets)
    ])
    return round(np.mean(per_topic), 5)

# =========================================================================================
# F2 Score 
def f2_score(y_true, y_pred):
    """Mean per-topic F2 score between true and predicted content ids.

    Args:
        y_true: pandas Series of whitespace-separated true content ids.
        y_pred: pandas Series of whitespace-separated predicted content ids,
            aligned row by row with `y_true`.

    Returns:
        The mean of ``tp / (tp + 0.2 * fp + 0.8 * fn)`` over all rows, rounded
        to 4 decimals. A row whose true and predicted sets are both empty has
        a zero denominator and yields NaN.
    """
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    # Precision and recall are intentionally not computed: they were unused and
    # evaluated 0/0 whenever a topic had an empty prediction set.
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)
# ===========================================================================================


# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Expand every topic's candidate list into labelled (topic, content) pairs.

    Args:
        topics: DataFrame with columns ``id``, ``title``, ``predictions`` and
            ``content_ids`` (both space-separated id strings) and ``fold``.
        content: DataFrame indexed by content id with a ``title`` column; a
            predicted id missing from the index raises ``KeyError``.
        cfg: unused; kept for interface compatibility.

    Returns:
        DataFrame with one row per (topic, predicted content) pair and columns
        ``topics_ids``, ``content_ids``, ``title1``, ``title2``, ``target``
        (1 if the predicted id is in the topic's ``content_ids``, else 0) and
        ``fold``.
    """
    topics_ids = []
    content_ids = []
    title1 = []
    title2 = []
    targets = []
    folds = []
    content_titles = content['title']
    # Iterate the needed columns directly instead of building a Series per row.
    rows = zip(
        topics['id'],
        topics['title'],
        topics['predictions'],
        topics['content_ids'],
        topics['fold'],
    )
    for topics_id, topics_title, predictions, ground_truth, fold in tqdm(rows, total = len(topics)):
        # A set makes the membership test below O(1) per candidate.
        truth = set(ground_truth.split(' '))
        for pred in predictions.split(' '):
            content_title = content_titles.loc[pred]
            topics_ids.append(topics_id)
            content_ids.append(pred)
            title1.append(topics_title)
            title2.append(content_title)
            folds.append(fold)
            # If pred is in ground truth, 1 else 0
            targets.append(1 if pred in truth else 0)
    # Build training dataset
    train = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'title1': title1,
         'title2': title2,
         'target': targets,
         'fold' : folds}
    )
    # Release memory
    del topics_ids, content_ids, title1, title2, targets, folds
    gc.collect()
    return train
    
# =========================================================================================
# Helpers: CV split, text cleaning and embedding extraction (get_neighbors)
# =========================================================================================

def cv_split(train, n_folds, seed):
    """Add a ``fold`` column with a shuffled K-fold assignment to `train`.

    Args:
        train: DataFrame to split; it is modified in place.
        n_folds: number of folds.
        seed: random state passed to ``KFold``.

    Returns:
        The same DataFrame, with an integer ``fold`` column in ``0..n_folds-1``.
    """
    kfold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    # KFold yields row positions, not index labels, so the fold ids are
    # collected in a positional array rather than written through `.loc`
    # (which would mis-assign or fail on a non-default index).
    fold_ids = np.zeros(len(train), dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train)):
        fold_ids[val_index] = num
    train['fold'] = fold_ids
    return train

def white_spaces(x):
    """Collapse every run of consecutive spaces in `x` into a single space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', x)

def preprocess(df,columns):
    """Replace missing values with "" in each listed column of `df`.

    The frame is modified in place and also returned.
    """
    for name in columns:
        filled = df[name].fillna("")
        df[name] = filled
    return df

def get_neighbors(topics, content, cfg):
    """Embed the ``title`` column of `topics` and `content` with the model in `cfg`.

    Despite its name, this function performs no neighbour search; it only
    returns the two embedding matrices.

    Args:
        topics: DataFrame with a ``title`` column.
        content: DataFrame with a ``title`` column.
        cfg: configuration providing ``model``, ``tokenizer``, ``batch_size``,
            ``num_workers`` and ``max_length``.

    Returns:
        tuple ``(topics_embeds, content_embeds)`` of numpy arrays, one row per
        input row, in input order (the loaders do not shuffle).
    """
    def _make_loader(frame):
        # Batches are padded to the longest sequence in the batch by the collator.
        return DataLoader(
            uns_dataset(frame, cfg),
            batch_size = cfg.batch_size,
            shuffle = False,
            collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
            num_workers = cfg.num_workers,
            pin_memory = True,
            drop_last = False
        )

    topics_loader = _make_loader(topics)
    content_loader = _make_loader(content)
    # One encoder instance is shared by both passes.
    model = uns_model(cfg)
    model.to(device)
    topics_embeds = get_embeddings(topics_loader, model, device)
    content_embeds = get_embeddings(content_loader, model, device)
    return topics_embeds,content_embeds

env: TOKENIZERS_PARALLELISM=false


In [2]:
def max_recall_calculator(CFG_list,top_k=50):
    """Retrieve the `top_k` nearest contents per topic and score them on fold 0.

    For every configuration the topics and contents are embedded; with several
    configurations the embeddings are concatenated along the feature axis. An
    Annoy index (angular distance, 300 trees) is built over the content
    embeddings and queried once per topic.

    Args:
        CFG_list: non-empty sequence of configuration classes.
        top_k: number of neighbours requested per topic.

    Returns:
        dict with keys ``max_pos_score``, ``f2_score``, ``model`` (model path
        of the last configuration) and ``top_k``.

    Raises:
        ValueError: if `CFG_list` is empty.
    """
    if len(CFG_list) == 0:
        raise ValueError("CFG_list must contain at least one configuration")

    topics_embeds_all = []
    content_embeds_all = []
    for CFG in CFG_list:
        print(f'trying:{CFG.model}')
        topics, content, correlations = read_data(CFG)
        topics_embeds,content_embeds = get_neighbors(topics,content,CFG)
        topics_embeds_all.append(topics_embeds)
        content_embeds_all.append(content_embeds)
    if len(CFG_list) > 1:
        topics_embeds = np.concatenate(topics_embeds_all,axis=1)
        content_embeds = np.concatenate(content_embeds_all,axis=1)
    print("Embedding Shapes:", topics_embeds.shape, content_embeds.shape)

    # `read_data` returns frames with a 0..n-1 index in the same order as the
    # embedding rows, so a row position doubles as the Annoy item id.
    topic_id_by_pos = topics['id'].to_numpy()
    content_id_by_pos = content['id'].to_numpy()

    index = AnnoyIndex(topics_embeds.shape[1], 'angular')
    for idx in tqdm(range(content_embeds.shape[0])):
        index.add_item(idx, content_embeds[idx])
    print('Training Annoy Model...')
    index.build(300)
    print('Annoy Model, Done.')
    print(f'Finding Nearest {top_k} contents for every topic...')

    # Collect one prediction string per topic by positional lookup, instead of
    # filtering the whole content frame and building a DataFrame per topic.
    found_topic_ids = []
    predictions = []
    for topic_idx in tqdm(range(topics_embeds.shape[0])):
        neighbours = index.get_nns_by_vector(topics_embeds[topic_idx],n=top_k)
        if len(neighbours) == 0:
            # A topic without candidates is left out of the evaluation.
            continue
        found_topic_ids.append(topic_id_by_pos[topic_idx])
        predictions.append(' '.join(content_id_by_pos[neighbours]))

    candidates = pd.DataFrame({'topic_id': found_topic_ids, 'predictions': predictions})
    candidates = candidates.sort_values('topic_id').reset_index(drop=True)
    topics_test = candidates.merge(correlations, how = 'inner', left_on = ['topic_id'], right_on = ['topic_id'])
    pos_score = get_pos_score(topics_test['content_ids'], topics_test['predictions'])
    print('Validation df shape:',topics_test.shape)
    print(f'Model:{CFG.model}')
    print(f'Our max positive score is {pos_score}')

    f_score = f2_score(topics_test['content_ids'], topics_test['predictions'])
    print(f'Our f2_score is {f_score}')

    return {'max_pos_score':pos_score,'f2_score':f_score,'model':CFG.model,'top_k':top_k}

In [3]:
# Evaluate stage-1 retrieval for the configurations in CFG_list (top_k defaults to 50).
result = max_recall_calculator(CFG_list)

trying:model/paraphrase-multilingual-mpnet-base-v2-epochs-1-tuned/
 
--------------------------------------------------
topics.shape: (12304, 6)
content.shape: (154047, 3)
correlations.shape: (12304, 3)


  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/1204 [00:00<?, ?it/s]

Embedding Shapes: (12304, 768) (154047, 768)


  0%|          | 0/154047 [00:00<?, ?it/s]

Training Annoy Model...
Annoy Model, Done.
Finding Nearest 50 contents for every topic...


  0%|          | 0/12304 [00:00<?, ?it/s]

Validation df shape: (12304, 4)
Model:model/paraphrase-multilingual-mpnet-base-v2-epochs-1-tuned/
Our max positive score is 0.7026
Our f2_score is 0.1933


In [4]:
result

{'max_pos_score': 0.7026,
 'f2_score': 0.1933,
 'model': 'model/paraphrase-multilingual-mpnet-base-v2-epochs-1-tuned/',
 'top_k': 50}

In [2]:
# Step-by-step version of the first half of `max_recall_calculator`, kept as
# notebook globals so the following cells can inspect intermediate results.
topics_embeds_all = []
content_embeds_all = []
for _idx, CFG in enumerate(CFG_list):
    print(f'trying:{CFG.model}')
    # Reload the data for each configuration, then embed topics and contents.
    topics, content, correlations = read_data(CFG)
    topics_embeds,content_embeds = get_neighbors(topics,content,CFG)
    topics_embeds_all.append(topics_embeds)
    content_embeds_all.append(content_embeds)
# After the loop `topics`, `content`, `topics_embeds`, `content_embeds` and
# `CFG` refer to the last configuration.

trying:model/paraphrase-multilingual-mpnet-base-v2-epochs-1-tuned/
 
--------------------------------------------------
topics.shape: (12304, 6)
content.shape: (154047, 3)
correlations.shape: (12304, 3)


  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

In [3]:
#topics_embeds = np.concatenate(topics_embeds_all,axis=1)
#content_embeds = np.concatenate(content_embeds_all,axis=1)

In [4]:
topics_embeds.shape, content_embeds.shape

((12304, 768), (154047, 768))

In [5]:
# Prefix every column so topic and content fields stay distinguishable once
# they are combined in one frame. This modifies the frames in place and is not
# idempotent: re-running the cell prefixes the columns a second time.
topics.rename(columns=lambda x: "topic_" + x, inplace=True)
content.rename(columns=lambda x: "content_" + x, inplace=True)

In [6]:
content

Unnamed: 0,content_id,content_title,content_language
0,c_7fa2a802de7a,胃,zh
1,c_bc2e9694a1c8,连加,zh
2,c_3d904b65e551,故事,zh
3,c_f8c502b2f538,故事,zh
4,c_7997e81e3554,早产,zh
...,...,...,...
154042,c_eae464c625ea,TI-AIE: Perspective on leadership: building a ...,en
154043,c_e92281698de8,ملخص الوحدة الثانية ( المكانيكا ) لمادة الفيزي...,ar
154044,c_88f85461d72a,Actividad 1 - Valoramos la importancia de la a...,es
154045,c_ee7616e33ff1,11.3D: Harmful Effects Associated with Abnorma...,en


In [7]:
topics

Unnamed: 0,topic_id,topic_title,topic_language,topic_topic_id,topic_content_ids,topic_fold
0,t_59bf60f88801,R,en,t_59bf60f88801,c_0135861af4a7 c_0272e668d49f c_03a786f7332e c...,0
1,t_8f641744d392,ভর,bn,t_8f641744d392,c_3ca302a4367b c_8c93d5a01c2c,0
2,t_22507be124bb,减法,zh,t_22507be124bb,c_0bb529f0a613 c_2544c1e99b16 c_5648687107c0 c...,0
3,t_9db14a7a7e1c,介绍,zh,t_9db14a7a7e1c,c_18ce8aa3de15 c_360927b2e997 c_5c0958e3a452 c...,0
4,t_65bc6caaa58c,દળ,gu,t_65bc6caaa58c,c_db2d047b7e3d,0
...,...,...,...,...,...,...
12299,t_129dab82d181,6.6.2 Diseases and defects of the circulatory ...,en,t_129dab82d181,c_1ef4c0bf1934 c_32acdfa77885 c_4d1aff7ece2f c...,0
12300,t_c3c9fec7382a,Introduction aux pourcentages (représentation ...,fr,t_c3c9fec7382a,c_11dc49860c34 c_2883d482528b c_31d7556f1011 c...,0
12301,t_33003d8b9b82,Unidad II: Hacer una Unidad de Reclamaciones B...,es,t_33003d8b9b82,c_41ffc54df71d c_51e91c95794e c_5ac757ba390b c...,0
12302,t_5406f9c594d3,Unidad IV: Construyendo una Unidad de Argument...,es,t_5406f9c594d3,c_03a353ae3f02 c_f63caad465f7,0


In [8]:
%%time

# Build an approximate nearest-neighbour index over the content embeddings.
# `AnnoyIndex` is already imported in the first cell; this import is redundant.
from annoy import AnnoyIndex
# Vector size is taken from the topic embeddings; 'angular' is the metric name
# passed to Annoy. NOTE: `model` here is the Annoy index, not the encoder.
model = AnnoyIndex(topics_embeds.shape[1], 'angular')

# The Annoy item id is the row position of the content embedding.
for idx in tqdm(range(content_embeds.shape[0])):
    model.add_item(idx, content_embeds[idx])
    
# 300 is the number of trees.
model.build(300)

  0%|          | 0/154047 [00:00<?, ?it/s]

CPU times: user 14min 4s, sys: 2.32 s, total: 14min 6s
Wall time: 43.2 s


True

In [11]:
# Scratch check: candidates for the first topic, restricted to the topic's language.
dfs = []
content_idx = model.get_nns_by_vector(topics_embeds[0],n=300)
# `content` has a 0..n-1 index, so the Annoy item ids are row positions.
# Sorting keeps the rows in frame order; `.copy()` makes the column
# assignments below act on an independent frame rather than on a slice.
df_temp = content.iloc[sorted(content_idx)].copy()
df_temp['topic_id'] = topics.loc[0, 'topic_id']
df_temp['topic_title'] = topics.loc[0, 'topic_title']
df_temp['topic_language'] = topics.loc[0, 'topic_language']
df_temp['is_language'] = (df_temp['topic_language'] == df_temp['content_language'])
df_temp = df_temp[df_temp['is_language']]
dfs.append(df_temp)

In [14]:
df_temp[df_temp.is_language==True].shape

(37, 7)

In [12]:
df_temp

Unnamed: 0,content_id,content_title,content_language,topic_id,topic_title,topic_language,is_language
103,c_5e6e87821eb7,200,en,t_59bf60f88801,R,en,True
120,c_0b9e1b3404ef,250,en,t_59bf60f88801,R,en,True
139,c_91673594eb1c,Lis,pl,t_59bf60f88801,R,en,False
168,c_b30ed81d9dcf,200,en,t_59bf60f88801,R,en,True
181,c_c81dca1ac413,250,en,t_59bf60f88801,R,en,True
...,...,...,...,...,...,...,...
130525,c_0d4309433138,Visualização de um espaço coluna como um plano...,pt,t_59bf60f88801,R,en,False
135451,c_6665a72199cb,Definição de um plano em R3 com um ponto e vet...,pt,t_59bf60f88801,R,en,False
138185,c_3ab0282aff62,"Exemplo de sistema de nomeação R,S (Cahn-Ingol...",pt,t_59bf60f88801,R,en,False
138530,c_ca3a4e1f4476,Introdução aos resíduos e à regressão de mínim...,pt,t_59bf60f88801,R,en,False


In [44]:
content

Unnamed: 0,content_id,content_title,content_language
0,c_7fa2a802de7a,胃,zh
1,c_bc2e9694a1c8,连加,zh
2,c_3d904b65e551,故事,zh
3,c_f8c502b2f538,故事,zh
4,c_7997e81e3554,早产,zh
...,...,...,...
154042,c_eae464c625ea,TI-AIE: Perspective on leadership: building a ...,en
154043,c_e92281698de8,ملخص الوحدة الثانية ( المكانيكا ) لمادة الفيزي...,ar
154044,c_88f85461d72a,Actividad 1 - Valoramos la importancia de la a...,es
154045,c_ee7616e33ff1,11.3D: Harmful Effects Associated with Abnorma...,en


In [99]:
list(topics[topics.index.isin([233])].topic_title)[0]

'Cours'

In [100]:
list(topics[topics.index.isin([233])].topic_id)[0]

't_1b350336a4a6'

In [98]:
list(topics[topics.index.isin([233])].topic_title)[0]

'Cours'

In [103]:
# Scratch check: the 500 nearest contents of one topic, flagged by language match.
topic_idx = 234
content_idx = model.get_nns_by_vector(topics_embeds[topic_idx],n=500)
# `content` has a 0..n-1 index, so the Annoy item ids are row positions.
# Sorting keeps the rows in frame order; `.copy()` makes the column
# assignments below act on an independent frame rather than on a slice.
df_temp = content.iloc[sorted(content_idx)].copy()
df_temp['topic_id'] = topics.loc[topic_idx, 'topic_id']
df_temp['topic_title'] = topics.loc[topic_idx, 'topic_title']
df_temp['topic_language'] = topics.loc[topic_idx, 'topic_language']
df_temp['is_language'] = (df_temp['topic_language'] == df_temp['content_language'])

In [104]:
df_temp

Unnamed: 0,content_id,content_title,content_language,topic_id,topic_title,topic_language,is_language
858,c_3724f2d86d96,تاريخ,ar,t_8daa641fd2c7,ریاضی,ur,False
872,c_83892e36e51c,تقييم,ar,t_8daa641fd2c7,ریاضی,ur,False
921,c_98d95645d756,تاريخ,ar,t_8daa641fd2c7,ریاضی,ur,False
980,c_8810239f6ab3,تاريخ,ar,t_8daa641fd2c7,ریاضی,ur,False
1000,c_ddfb9f86b7e8,تاريخ,ar,t_8daa641fd2c7,ریاضی,ur,False
...,...,...,...,...,...,...,...
148456,c_f972b226746b,مبارة القمة 3 | العمليات على المجموعات الرياضي...,ar,t_8daa641fd2c7,ریاضی,ur,False
148993,c_6eef3fb8f3a2,إجراء عمليّة الضرب المتكرّر على الأعداد النسبي...,ar,t_8daa641fd2c7,ریاضی,ur,False
150193,c_4a070e4874b3,رسم ارتفاعات المثلث | اساسيات من ابتدائي في ال...,ar,t_8daa641fd2c7,ریاضی,ur,False
151963,c_6c1980f8b9d1,يعني ايه المجموعة الرياضية ؟ | اساسيات من ابتد...,ar,t_8daa641fd2c7,ریاضی,ur,False


In [124]:
# Collect the 5 nearest contents of every topic, one small frame per topic.
dfs = []
for topic_idx in tqdm(range(topics_embeds.shape[0])):
    content_idx = model.get_nns_by_vector(topics_embeds[topic_idx],n=5)
    if len(content_idx) == 0:
        # Skip only this topic; `break` would silently drop every later topic.
        continue
    # `content` has a 0..n-1 index, so the Annoy item ids are row positions.
    # Positional lookup avoids scanning the whole frame per topic, sorting
    # keeps the rows in frame order, and `.copy()` makes the assignments
    # below act on an independent frame rather than on a slice.
    df_temp = content.iloc[sorted(content_idx)].copy()
    df_temp['topic_id'] = topics.loc[topic_idx, 'topic_id']
    df_temp['topic_title'] = topics.loc[topic_idx, 'topic_title']
    df_temp['topic_language'] = topics.loc[topic_idx, 'topic_language']
    dfs.append(df_temp)

  0%|          | 0/12304 [00:00<?, ?it/s]

In [125]:
# Stack the per-topic candidate frames into one long frame with a fresh index.
candidates = pd.concat(dfs).reset_index(drop=True)

In [126]:
(candidates['content_language'] == candidates['topic_language']).value_counts()

True     56872
False     4648
dtype: int64

In [127]:
# Rebuild `candidates` (same statement as the earlier cell) and collapse it to
# one row per topic.
candidates = pd.concat(dfs).reset_index(drop=True)
# List of candidate content ids per topic ...
aa = candidates.groupby(['topic_id'])['content_id'].agg(list).reset_index()
# ... joined into the space-separated string format the scoring functions split on.
aa['predictions'] = aa.content_id.apply(lambda x: ' '.join(x))
aa.drop('content_id',axis=1,inplace=True)

In [128]:
# Attach the ground-truth `content_ids` and `fold` of each topic to its predictions.
topics_test = aa.merge(correlations, how = 'inner', left_on = ['topic_id'], right_on = ['topic_id'])

In [129]:
topics_test#.topic_id.nunique()

Unnamed: 0,topic_id,predictions,content_ids,fold
0,t_0008768bdee6,c_71d46951e007 c_563674a55312 c_fea8d8d7de7c c...,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4,0
1,t_000d1fb3f2f5,c_906610d0fd21 c_8f302958626f c_80a7c553ffe4 c...,c_07f1d0eec4b2 c_15a6fb858696 c_175e9db3fc44 c...,0
2,t_000feba42136,c_d59718802908 c_304ee4f59410 c_badfe9679991 c...,c_2bbc650030f4 c_304ee4f59410,0
3,t_0028ead4dc26,c_e01965f1924d c_7a838ef49896 c_01e269c23b79 c...,c_2a6869449f42 c_72d5e9f80f14 c_a4651c228714 c...,0
4,t_002d1624e059,c_a62828ad452f c_f7416b2b8869 c_0593111b69a1 c...,c_af05b751efdf,0
...,...,...,...,...
12299,t_ffc71a181765,c_7a48db32808c c_65f36b4a0c88 c_e91837e1b025 c...,c_d7be46af5e2b c_ea92fdd9899a,0
12300,t_ffda8a99f58c,c_b5fa48a39ab9 c_20e28a5bb042 c_42c5213542d7 c...,c_1879a229272d c_31fe591591c3 c_9244a22d75a9 c...,0
12301,t_ffdbcde36d56,c_c5ebc4115b11 c_dda03d5bb211 c_fb17ae04bc51 c...,c_40368ed38da9 c_47fd674b0fa0 c_567e58a3b973 c...,0
12302,t_fff05585df72,c_43d429da4229 c_bee5ff4cce06 c_88bc7ee86c8b c...,c_6f255c97f381 c_743e6319d5ae c_88bc7ee86c8b c...,0


In [130]:
# Share of true contents recovered by the candidates (upper bound for the next stage).
pos_score = get_pos_score(topics_test['content_ids'], topics_test['predictions'])
print(f'Our max positive score is {pos_score}')

# F2 of the raw candidate lists, without any re-ranking.
f_score = f2_score(topics_test['content_ids'], topics_test['predictions'])
print(f'Our f2_score is {f_score}')

# Scores pasted from another validation run, kept for comparison
# (NOTE(review): the settings that produced them are not recorded here):
#Our max positive score is 0.78403
#Our f2_score is 0.2239

Our max positive score is 0.35399
Our f2_score is 0.2895


In [None]:
# Pasted output of an earlier run (not code):
# Our max positive score is 0.42087
# Our f2_score is 0.1054

In [None]:
# Turn the per-topic prediction strings into one labelled row per
# (topic, candidate) pair. The id strings are split on a copy so that
# `topics_test` keeps its string columns and this cell can be re-run.
topics_lists = topics_test.assign(
    predictions = topics_test.predictions.apply(lambda x: x.split(' ')),
    content_ids = topics_test.content_ids.apply(lambda x: x.split(' ')),
)
gt = topics_lists[['topic_id','content_ids','fold']].explode('content_ids')
preds = topics_lists[['topic_id','predictions','fold']].explode('predictions')
# Left join: a candidate that is a true content of its topic finds a match;
# every other candidate gets a null `content_ids`.
candidates_df = preds.merge(gt[['topic_id','content_ids']],how='left',left_on=['topic_id','predictions'], right_on=['topic_id','content_ids'])
# target = 1.0 for matched (true) pairs, 0.0 otherwise.
candidates_df['target'] = candidates_df['content_ids'].notnull().astype(float)
candidates_df = candidates_df.drop('content_ids',axis=1)

In [79]:
# After exploding, each `predictions` value is a single content id.
candidates_df = candidates_df.rename(columns={'predictions':'content_id'})

In [81]:
# Attach the text of both sides of each pair; these merges use the prefixed
# column names created by the rename cell above.
candidates_df = candidates_df.merge(topics[['topic_id','topic_title']],on='topic_id')
candidates_df = candidates_df.merge(content[['content_id','content_title']],on='content_id')

In [82]:
candidates_df

Unnamed: 0,topic_id,content_id,fold,target,topic_title,content_title
0,t_00004da3a1b2,c_c1de9b7501b7,1,0.0,Откриването на резисторите,Капацитет
1,t_261fb7043ad1,c_c1de9b7501b7,3,0.0,Електричен ток и електрично напрежение,Капацитет
2,t_3a1f5ae9f991,c_c1de9b7501b7,0,1.0,Вериги с кондензатори,Капацитет
3,t_46415b46914b,c_c1de9b7501b7,4,1.0,Електростатична индукция и кондензатори,Капацитет
4,t_a76d0d45b2e9,c_c1de9b7501b7,2,0.0,Електричен ток: преговор,Капацитет
...,...,...,...,...,...,...
3075845,t_fff05585df72,c_743e6319d5ae,0,1.0,11: Systems of Equations and Inequalities,11.9: Solving Systems with Cramer's Rule
3075846,t_fff05585df72,c_d9bbe8422c6b,0,1.0,11: Systems of Equations and Inequalities,11.0: Prelude to Systems of Equations and Ineq...
3075847,t_fff9e5407d13,c_b43d07ea6eef,4,0.0,NA_U06 - El periódico,La noria
3075848,t_fff9e5407d13,c_d64037a72376,4,1.0,NA_U06 - El periódico,Introducción: El periódico


In [20]:
pd.read_csv('data/train_top50_fold0_cv_with_groundtruth_final_72044.csv')#.columns

Unnamed: 0,topics_ids,content_ids,title1,title2,target,fold
0,t_3d9ad9931021,c_8a2c8da77d0c,,Agenda,1,3
1,t_3d9ad9931021,c_3f51421a7c85,,ABCD,0,3
2,t_3d9ad9931021,c_db7818729577,,,0,3
3,t_3d9ad9931021,c_eb7d5e2e1744,,Simon,0,3
4,t_3d9ad9931021,c_60dd2fc8a271,,Ihab,0,3
...,...,...,...,...,...,...
3119822,t_70da08637930,c_70b185780f10,8.1.5 Use dot (.) and cross (x) diagrams to il...,Video No. 1: Covalent Bonding,0,2
3119823,t_70da08637930,c_40b1fea5ad01,8.1.5 Use dot (.) and cross (x) diagrams to il...,More on the dot structure for sulfur dioxide,0,2
3119824,t_70da08637930,c_a73aa42d1be9,8.1.5 Use dot (.) and cross (x) diagrams to il...,Covalent bond,0,2
3119825,t_70da08637930,c_dbce33468856,8.1.5 Use dot (.) and cross (x) diagrams to il...,Diamagnetism,0,2


In [84]:
# Rename to the column names used by the saved training files. An explicit
# mapping is used instead of assigning a positional list, which would silently
# mislabel columns if their order ever differed.
candidates_df = candidates_df.rename(columns={
    'topic_id': 'topics_ids',
    'content_id': 'content_ids',
    'topic_title': 'title1',
    'content_title': 'title2',
})

In [85]:
# Save the labelled candidate pairs in the column order of the earlier CSV training set.
candidates_df[['topics_ids', 'content_ids', 'title1', 'title2', 'target', 'fold']].to_parquet('data/candidates_50_train_7840.parquet')

ALL DATA
------------------------------------------------------------------
#### NO TUNE
----------------
TOP 50
/kaggle/input/sbert-models/paraphrase-multilingual-MiniLM-L12-v2
Our max positive score is 0.41649
Our f2_score is 0.1007

----------------
TOP 50
/kaggle/input/sentence-embedding-models/paraphrase-MiniLM-L12-v2
Our max positive score is 0.44421
Our f2_score is 0.1099

----------------
TOP 50
/kaggle/input/sentence-embedding-models/paraphrase-mpnet-base-v2
Our max positive score is 0.45422
Our f2_score is 0.1133

----------------
TOP 50
/kaggle/input/sbert-models/paraphrase-multilingual-mpnet-base-v2
Our max positive score is 0.42578
Our f2_score is 0.1033

---------------
TOP 50
/kaggle/input/paraphrasemultilingualmpnetbasev2/all-MiniLM-L6-v2
Our max positive score is 0.47988
Our f2_score is 0.1216

------------------------------------------------------------------
#### TUNED
----------------
TOP 50
'/kaggle/input/paraphrase-multilingual-mpnet-base-v2-tuned/paraphrase-multilingual-mpnet-base-v2-exp_fold0_epochs8'
Our max positive score is 0.68706
Our f2_score is 0.1902

----------------
TOP 50
'/kaggle/input/stage-1-tuned/paraphrase-multilingual-mpnet-base-v2-tuned' ##15 epoch
Our max positive score is 0.72044
Our f2_score is 0.201

---------------
TOP 50
'/kaggle/input/all-minilm-l6-v2-tuned/all-MiniLM-L6-v2_fold0_epochs20/all-MiniLM-L6-v2_fold0_epochs20'
Our max positive score is 0.62932
Our f2_score is 0.1713

---------------
TOP 50
'/kaggle/input/all-minilm-l6-v2-tuned/all-MiniLM-L6-v2_fold0_epochs8/all-MiniLM-L6-v2_fold0_epochs8'
Our max positive score is 0.59703
Our f2_score is 0.1607

In [6]:
# Build training set
full_correlations = pd.read_csv('/kaggle/input/all-minilm-l6-v2-tuned/kfold_correlations.csv')
topics_full = topics.merge(full_correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
topics_full['predictions'] = topics_full.apply(lambda x: ' '.join(list(set(x.predictions.split(' ') + x.content_ids.split(' ')))) \
                                               if x.fold != 0 else x.predictions, axis = 1)
train = build_training_set(topics_full, content, CFG)
print(f'Our training set has {len(train)} rows')
# Save train set to disk to train on another notebook
train.to_csv(f'train_top{CFG.top_n}_fold0_cv_with_groundtruth_final_72044.csv', index = False)
train.head()

  0%|          | 0/61517 [00:00<?, ?it/s]

Our training set has 3119827 rows


Unnamed: 0,topics_ids,content_ids,title1,title2,target,fold
0,t_3d9ad9931021,c_8a2c8da77d0c,,Agenda,1,3
1,t_3d9ad9931021,c_3f51421a7c85,,ABCD,0,3
2,t_3d9ad9931021,c_db7818729577,,,0,3
3,t_3d9ad9931021,c_eb7d5e2e1744,,Simon,0,3
4,t_3d9ad9931021,c_60dd2fc8a271,,Ihab,0,3
