In [25]:
# =========================================================================================
# Libraries
# =========================================================================================
import numpy as np
import os
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
#import cupy as cp
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer
from unidecode import unidecode
import re
from annoy import AnnoyIndex
%env TOKENIZERS_PARALLELISM=false
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a module-level global by get_neighbors().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# =========================================================================================
# Configurations
# =========================================================================================

    
#class CFG1:
#    num_workers = 24
#    model = "model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned/"
#    tokenizer = AutoTokenizer.from_pretrained(model)
#    batch_size = 128
#    top_n = 50
#    seed = 42
#    used_columns = 'title'
#    max_length = 50
#    mode = 'train'
#    
#class CFG2:
#    num_workers = 24
#    model = "model/all-MiniLM-L6-v2-epochs-1-tuned/"
#    tokenizer = AutoTokenizer.from_pretrained(model)
#    batch_size = 128
#    top_n = 50
#    seed = 42
#    used_columns = 'title'
#    max_length = 150
#    mode = 'val'

class CFG3:
    # --- encoder checkpoint and tokenization -------------------------------
    # Local directory of the fine-tuned sentence encoder; also loaded by uns_model.
    model = "model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned/"
    # Tokenizer is loaded once, at class-definition time.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Passed to the tokenizer as the truncation length in prepare_input().
    max_length = 64

    # --- DataLoader settings (see get_neighbors) ---------------------------
    batch_size = 64
    num_workers = 24

    # --- run settings ------------------------------------------------------
    # 'val' makes read_data() keep only the rows of fold 0.
    mode = 'val'
    # Used in the output file name of the training-set cell.
    top_n = 50
    # Not referenced by the code visible in this notebook.
    seed = 42
    used_columns = 'title'


# Configurations whose embeddings are computed (and concatenated when more than one).
CFG_list = [CFG3]
    
# =========================================================================================
# Data Loading
# =========================================================================================

def feature_engineering(topics,content):
    """Fill missing text fields and build the combined ``title`` used for embedding.

    Missing values are replaced with fixed placeholders, then ``title`` is
    rewritten on both frames as
    ``"<title>. Language_<language>. Description: <description>"``.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    topics : pandas.DataFrame
        Needs the columns ``title``, ``description`` and ``language``.
    content : pandas.DataFrame
        Needs the columns ``title``, ``description``, ``language``, ``text``,
        ``license`` and ``kind``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``topics`` and ``content`` objects that were passed in.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    topic_placeholders = {
        'title': "no title",
        'description': "no description",
    }
    content_placeholders = {
        'title': "no title",
        'description': "no description",
        'text': "no text",
        'license': "no license",
        'kind': "no kind",
    }

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form acts on a temporary Series, is deprecated, and leaves the
    # frame untouched when pandas copy-on-write is enabled.
    for col, placeholder in topic_placeholders.items():
        topics[col] = topics[col].fillna(placeholder)
    for col, placeholder in content_placeholders.items():
        content[col] = content[col].fillna(placeholder)

    # `language` is not filled: a missing language would make the combined title NaN.
    topics['title'] = topics['title'] + '. Language_' + topics['language'] + ". Description: " + topics['description']
    content['title'] = content['title'] + '. Language_' + content['language'] + ". Description: " + content['description']
    return topics,content

def read_data(cfg):
    """Load topics, content and fold assignments and prepare them for embedding.

    Reads ``data/topics.csv``, ``data/content.csv`` and
    ``data/kfold_correlations.csv``. When ``cfg.mode == 'val'`` only the
    correlation rows of fold 0 are kept. Topics are inner-joined with the
    correlations, the combined ``title`` text is built by
    ``feature_engineering``, and both frames are sorted by title length
    (shorter first) to make inference faster, stripped of the columns that are
    no longer needed and re-indexed from 0.

    Parameters
    ----------
    cfg : configuration class
        Only ``cfg.mode`` is read here.

    Returns
    -------
    tuple
        ``(topics, content, correlations)`` as pandas DataFrames.
    """
    topics = pd.read_csv('data/topics.csv')
    content = pd.read_csv('data/content.csv')
    correlations = pd.read_csv('data/kfold_correlations.csv')

    validation_only = cfg.mode == 'val'
    if validation_only:
        correlations = correlations[correlations.fold == 0]

    # Keep only topics that have a correlation row (and, in 'val' mode, are in fold 0).
    topics = topics.merge(correlations, how = 'inner', left_on = 'id', right_on = 'topic_id')
    topics, content = feature_engineering(topics, content)

    topic_cols_to_drop = ['description', 'channel', 'category', 'level', 'parent', 'has_content', 'length']
    content_cols_to_drop = ['description', 'kind', 'text', 'copyright_holder', 'license', 'length']

    # Sort by title length, then drop the helper column together with the unused ones.
    topics = (
        topics
        .assign(length = topics['title'].astype(str).map(len))
        .sort_values('length')
        .drop(topic_cols_to_drop, axis = 1)
        .reset_index(drop = True)
    )
    content = (
        content
        .assign(length = content['title'].astype(str).map(len))
        .sort_values('length')
        .drop(content_cols_to_drop, axis = 1)
        .reset_index(drop = True)
    )

    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, content, correlations

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one text and convert every returned field to a ``torch.long`` tensor.

    The text is encoded with ``cfg.tokenizer.encode_plus`` (special tokens
    added, truncated to ``cfg.max_length``, no padding). The mapping returned
    by the tokenizer is updated in place and returned.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        truncation=True,
        max_length = cfg.max_length,
        return_tensors = None,
    )
    # Snapshot the keys first so the mapping can be overwritten while looping.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype = torch.long)
    return encoded

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a DataFrame on demand."""

    def __init__(self, df, cfg):
        # Keep only the raw title values; tokenization happens per item.
        self.texts = df['title'].values
        self.cfg = cfg

    def __len__(self):
        """Number of titles in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized tensors for the title at position ``item``."""
        return prepare_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token vectors over dimension 1, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-mask-weighted mean of ``last_hidden_state``.

        The mask gets a trailing dimension, is expanded to the shape of
        ``last_hidden_state`` and cast to float. Masked positions contribute
        zero to the sum; the divisor is clamped to at least 1e-9 so a fully
        masked row yields zeros instead of a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_sum = (last_hidden_state * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return weighted_sum / token_counts

# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Pretrained transformer followed by mean pooling, used to embed texts."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Both config and weights are loaded from the checkpoint at cfg.model.
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Run the transformer on ``inputs`` and mean-pool its last hidden state."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled embedding for a batch of tokenized inputs."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Embed every batch from ``loader`` and return the stacked result.

    The model is put in eval mode, each batch is moved to ``device`` and run
    without gradient tracking, and the outputs are moved to the CPU and
    concatenated into a single NumPy array.
    """
    model.eval()
    chunks = []
    for batch in tqdm(loader):
        # Move the tensors in place so the batch object keeps its original type.
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        with torch.no_grad():
            features = model(batch)
        chunks.append(features.to('cpu').numpy())
    return np.concatenate(chunks)

# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true, y_pred):
    """Mean fraction of ground-truth ids that appear among the predicted ids.

    Both arguments are pandas Series of whitespace-separated id strings. For
    each row the score is ``|truth & predicted| / |truth|``; the mean over all
    rows is returned rounded to 5 decimals.
    """
    truth_sets = y_true.apply(lambda ids: set(ids.split()))
    predicted_sets = y_pred.apply(lambda ids: set(ids.split()))
    per_row = [
        len(truth.intersection(predicted)) / len(truth)
        for truth, predicted in zip(truth_sets, predicted_sets)
    ]
    return round(np.mean(np.array(per_row)), 5)

# =========================================================================================
# F2 Score 
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between ground-truth and predicted id sets.

    Parameters
    ----------
    y_true, y_pred : pandas.Series of str
        Whitespace-separated ids, one row per topic, aligned by position.

    Returns
    -------
    float
        Mean over rows of ``tp / (tp + 0.2 * fp + 0.8 * fn)``, rounded to 4
        decimals. This is the F-beta score with beta = 2, i.e.
        ``5*tp / (5*tp + fp + 4*fn)``.
    """
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    # Precision and recall are deliberately not computed: they were unused and
    # their 0/0 divisions raised RuntimeWarnings whenever a prediction set was empty.
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)
# ===========================================================================================


# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Expand every topic's candidate list into one labelled row per (topic, content) pair.

    ``topics`` must provide the columns ``id``, ``title``, ``predictions``,
    ``content_ids`` and ``fold``; ``predictions`` and ``content_ids`` are
    space-separated content ids. ``content`` is looked up by label
    (``content.loc[<content id>, 'title']``), so it has to be indexed by
    content id. ``cfg`` is accepted but not used.

    Returns a DataFrame with the columns ``topics_ids``, ``content_ids``,
    ``title1`` (topic title), ``title2`` (content title), ``target`` (1 when
    the candidate is in the topic's ground truth, else 0) and ``fold``.
    """
    columns = {
        'topics_ids': [],
        'content_ids': [],
        'title1': [],
        'title2': [],
        'target': [],
        'fold': [],
    }
    for position in tqdm(range(len(topics))):
        topic_row = topics.iloc[position]
        candidate_ids = topic_row['predictions'].split(' ')
        true_ids = topic_row['content_ids'].split(' ')
        for candidate in candidate_ids:
            candidate_title = content.loc[candidate, 'title']
            columns['topics_ids'].append(topic_row['id'])
            columns['content_ids'].append(candidate)
            columns['title1'].append(topic_row['title'])
            columns['title2'].append(candidate_title)
            columns['fold'].append(topic_row['fold'])
            # Label: 1 if the candidate is a true match for this topic, else 0.
            columns['target'].append(1 if candidate in true_ids else 0)
    train = pd.DataFrame(columns)
    # Drop the intermediate lists before returning to release memory.
    del columns
    gc.collect()
    return train
    
# =========================================================================================
# CV split, text-cleaning helpers and embedding extraction (get_neighbors)
# =========================================================================================

def cv_split(train, n_folds, seed):
    """Assign a shuffled K-fold number to every row of ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to split; a ``fold`` column is added (or overwritten) in place.
    n_folds : int
        Number of folds.
    seed : int
        Random state used for the shuffled split.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, with an integer ``fold`` column holding the
        fold in which each row is a validation row.
    """
    kfold = KFold(n_splits = n_folds, shuffle = True, random_state = seed)
    # KFold.split yields POSITIONAL indices. The previous `train.loc[val_index, ...]`
    # treated them as index labels, which assigns the wrong rows (or raises)
    # whenever the frame does not have a default 0..n-1 index.
    fold_of_row = np.empty(len(train), dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train)):
        fold_of_row[val_index] = num
    # Assigning an array is positional, so this is correct for any index.
    train['fold'] = fold_of_row
    return train

def white_spaces(x):
    """Collapse every run of space characters in ``x`` into a single space."""
    collapsed = re.sub(pattern=' +', repl=' ', string=x)
    return collapsed

def preprocess(df,columns):
    """Replace missing values with the empty string in each of ``columns``.

    ``df`` is modified in place and returned; columns not listed are untouched.
    """
    for name in columns:
        filled = df[name].fillna("")
        df[name] = filled
    return df

def get_neighbors(topics, content, cfg):
    """Embed the ``title`` text of all topics and all content items.

    Despite its name this function performs no neighbour search: it builds one
    DataLoader per frame, loads the encoder described by ``cfg`` onto the
    module-level ``device`` and returns the two embedding arrays.

    Returns
    -------
    tuple
        ``(topics_embeds, content_embeds)`` as returned by ``get_embeddings``,
        in the row order of the input frames (the loaders do not shuffle).
    """
    def _make_loader(frame):
        # Ordered, non-dropping loader that pads each batch to its longest sequence.
        return DataLoader(
            uns_dataset(frame, cfg),
            batch_size = cfg.batch_size,
            shuffle = False,
            collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
            num_workers = cfg.num_workers,
            pin_memory = True,
            drop_last = False,
        )

    topics_loader = _make_loader(topics)
    content_loader = _make_loader(content)

    # One encoder instance is shared by both passes.
    encoder = uns_model(cfg)
    encoder.to(device)

    topics_embeds = get_embeddings(topics_loader, encoder, device)
    content_embeds = get_embeddings(content_loader, encoder, device)
    return topics_embeds,content_embeds

env: TOKENIZERS_PARALLELISM=false
cuda


In [23]:
def max_recall_calculator(CFG_list,top_k=50,n_trees=300):
    """Evaluate nearest-neighbour retrieval of content for topics.

    For every configuration the data is loaded and embedded; with more than one
    configuration the embeddings are concatenated feature-wise. An Annoy index
    (angular distance) is built over the content embeddings, the ``top_k``
    nearest content items are retrieved for every topic, and the retrieved ids
    are scored against the ground truth.

    Parameters
    ----------
    CFG_list : sequence of configuration classes
        Must not be empty. Topics, content and correlations of the LAST
        configuration are used for ids and ground truth, so all configurations
        are expected to produce the same rows in the same order.
    top_k : int, default 50
        Number of neighbours retrieved per topic.
    n_trees : int, default 300
        Number of trees passed to ``AnnoyIndex.build``.

    Returns
    -------
    dict
        Keys ``max_pos_score``, ``f2_score``, ``model`` (of the last
        configuration) and ``top_k``.

    Raises
    ------
    ValueError
        If ``CFG_list`` is empty.
    """
    if len(CFG_list) == 0:
        # Previously this surfaced later as an UnboundLocalError.
        raise ValueError("CFG_list must contain at least one configuration")

    topics_embeds_all = []
    content_embeds_all = []
    for CFG in CFG_list:
        print(f'trying:{CFG.model}')
        topics, content, correlations = read_data(CFG)
        topics_embeds,content_embeds = get_neighbors(topics,content,CFG)
        topics_embeds_all.append(topics_embeds)
        content_embeds_all.append(content_embeds)
    if len(CFG_list) > 1:
        # Only concatenate when needed; a single model's arrays are used as they are.
        topics_embeds = np.concatenate(topics_embeds_all,axis=1)
        content_embeds = np.concatenate(content_embeds_all,axis=1)
    print("Embedding Shapes:", topics_embeds.shape, content_embeds.shape)

    index = AnnoyIndex(topics_embeds.shape[1], 'angular')
    for idx in tqdm(range(content_embeds.shape[0])):
        index.add_item(idx, content_embeds[idx])
    print('Training Annoy Model...')
    index.build(n_trees)
    print('Annoy Model, Done.')
    print(f'Finding Nearest {top_k} contents for every topic...')

    # Annoy item numbers are row positions of `content` (read_data resets the index),
    # so ids can be looked up positionally instead of scanning the whole frame
    # with `isin` once per topic.
    content_ids = content['id'].to_numpy()
    predictions = []
    for topic_idx in tqdm(range(topics_embeds.shape[0])):
        neighbor_rows = index.get_nns_by_vector(topics_embeds[topic_idx],n=top_k)
        predictions.append(' '.join(content_ids[neighbor_rows]))

    # One row per topic, ordered by topic id.
    candidates = (
        pd.DataFrame({'topic_id': topics['id'].to_numpy(), 'predictions': predictions})
        .sort_values('topic_id')
        .reset_index(drop=True)
    )
    topics_test = candidates.merge(correlations, how = 'inner', on = 'topic_id')

    pos_score = get_pos_score(topics_test['content_ids'], topics_test['predictions'])
    print('Validation df shape:',topics_test.shape)
    print(f'Model:{CFG.model}')
    print(f'Our max positive score is {pos_score}')

    f_score = f2_score(topics_test['content_ids'], topics_test['predictions'])
    print(f'Our f2_score is {f_score}')

    return {'max_pos_score':pos_score,'f2_score':f_score,'model':CFG.model,'top_k':top_k}

In [24]:
# Run the retrieval evaluation for the configurations in CFG_list (top_k defaults to 50).
result = max_recall_calculator(CFG_list)

trying:model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned/
 
--------------------------------------------------
topics.shape: (12304, 6)
content.shape: (154047, 3)
correlations.shape: (12304, 3)


  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

Embedding Shapes: (12304, 768) (154047, 768)


  0%|          | 0/154047 [00:00<?, ?it/s]

Training Annoy Model...
Annoy Model, Done.
Finding Nearest 50 contents for every topic...


  0%|          | 0/12304 [00:00<?, ?it/s]

Validation df shape: (12304, 4)
Model:model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned/
Our max positive score is 0.79927
Our f2_score is 0.2225


In [21]:
result

{'max_pos_score': 0.80108,
 'f2_score': 0.2233,
 'model': 'model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned/',
 'top_k': 50}

In [26]:
# Cell-level copy of the embedding step of max_recall_calculator, kept so the
# arrays stay available to the following cells.
# NOTE: after the loop `topics`, `content`, `correlations`, `topics_embeds`,
# `content_embeds` and `CFG` hold the values of the LAST configuration in CFG_list;
# later cells read these globals.
topics_embeds_all = []
content_embeds_all = []
for _idx, CFG in enumerate(CFG_list):
    print(f'trying:{CFG.model}')
    topics, content, correlations = read_data(CFG)
    topics_embeds,content_embeds = get_neighbors(topics,content,CFG)
    topics_embeds_all.append(topics_embeds)
    content_embeds_all.append(content_embeds)

trying:model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned/
 
--------------------------------------------------
topics.shape: (12304, 6)
content.shape: (154047, 3)
correlations.shape: (12304, 3)


  0%|          | 0/193 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

In [27]:
#topics_embeds = np.concatenate(topics_embeds_all,axis=1)
#content_embeds = np.concatenate(content_embeds_all,axis=1)

In [28]:
topics_embeds.shape, content_embeds.shape

((12304, 768), (154047, 768))

In [29]:
# Prefix every column so topic and content columns stay distinguishable once combined
# (e.g. `id` becomes `topic_id` / `content_id`).
# NOTE: not idempotent - the rename is in place, so re-running this cell adds the
# prefix a second time.
topics.rename(columns=lambda x: "topic_" + x, inplace=True)
content.rename(columns=lambda x: "content_" + x, inplace=True)

In [45]:
%%time

# Build an Annoy index (angular distance) over the content embeddings.
# The import repeats the one in the first cell. `model` here is the Annoy index,
# not the transformer encoder.
from annoy import AnnoyIndex
model = AnnoyIndex(topics_embeds.shape[1], 'angular')

# Item number == row position in `content_embeds`.
for idx in tqdm(range(content_embeds.shape[0])):
    model.add_item(idx, content_embeds[idx])
    
# 1000 trees here, whereas max_recall_calculator builds with 300.
# The return value of build() is displayed as the cell output.
model.build(1000)

  0%|          | 0/154047 [00:00<?, ?it/s]

CPU times: user 51min 7s, sys: 7.5 s, total: 51min 15s
Wall time: 2min 19s


True

In [46]:
# For every topic, fetch its 50 approximate nearest content rows from the Annoy
# index and tag them with the topic's id and title.
# Annoy item numbers are row positions of `content`, so rows are taken positionally
# instead of scanning the whole frame with `isin` once per topic. Positions are
# sorted so the rows come out in frame order, as the boolean-mask filter produced.
# An earlier experiment filtered the candidates by language at this point; it is disabled.
topic_ids = topics['topic_id'].to_numpy()
topic_titles = topics['topic_title'].to_numpy()

dfs = []
for topic_idx in tqdm(range(topics_embeds.shape[0])):
    content_idx = model.get_nns_by_vector(topics_embeds[topic_idx],n=50)
    if len(content_idx) == 0:
        # Skip a topic without neighbours. The previous `break` ended the loop
        # and silently dropped every remaining topic.
        continue
    # .assign() returns a new frame, so nothing is written onto a slice of `content`.
    df_temp = content.iloc[sorted(content_idx)].assign(
        topic_id=topic_ids[topic_idx],
        topic_title=topic_titles[topic_idx],
    )
    dfs.append(df_temp)

  0%|          | 0/12304 [00:00<?, ?it/s]

In [47]:
# Stack the per-topic candidate frames into one long frame with a fresh 0..n-1 index.
candidates = pd.concat(dfs).reset_index(drop=True)

In [48]:
# Collapse the candidates to one row per topic: the retrieved content ids are joined
# into a single space-separated `predictions` string, the format the scoring functions expect.
aa = candidates.groupby(['topic_id'])['content_id'].agg(list).reset_index()
aa['predictions'] = aa.content_id.apply(lambda x: ' '.join(x))
aa.drop('content_id',axis=1,inplace=True)
# Attach the ground truth (`content_ids`) and the fold of every topic.
topics_test = aa.merge(correlations, how = 'inner', left_on = ['topic_id'], right_on = ['topic_id'])

In [49]:
# Share of ground-truth ids that were retrieved, averaged over topics.
pos_score = get_pos_score(topics_test['content_ids'], topics_test['predictions'])
print(f'Our max positive score is {pos_score}')

# Mean per-topic F2 of the raw candidate lists.
f_score = f2_score(topics_test['content_ids'], topics_test['predictions'])
print(f'Our f2_score is {f_score}')

#Validation
#Our max positive score is 0.78403
#Our f2_score is 0.2239

Our max positive score is 0.80299
Our f2_score is 0.2237


In [None]:
Our max positive score is 0.80093
Our f2_score is 0.2232

In [13]:
topics_test

Unnamed: 0,topic_id,predictions,content_ids,fold
0,t_00004da3a1b2,c_0feaaa5dc39d c_82eaf550b23b c_0262b16c8ecc c...,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,1
1,t_00068291e9a4,c_14860bbee722 c_a72612bc23cb c_5b58b9ccaff5 c...,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,1
2,t_00069b63a70a,c_fbb55ec5bb93 c_186fc761585b c_c4b6db8b5c7d c...,c_11a1dc0bfb99,4
3,t_0006d41a73a8,c_6cd02c88f340 c_beddc68789b7 c_cde9544b589e c...,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...,2
4,t_0008768bdee6,c_6485703d86a9 c_9e801e3f512e c_e496ba917d34 c...,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4,0
...,...,...,...,...
61512,t_fff830472691,c_28742eece061 c_ec5a875e22ab c_9f5618066a2a c...,c_61fb63326e5d c_8f224e321c87,1
61513,t_fff9e5407d13,c_a86539ee631f c_18529667fd21 c_90cc64d55d46 c...,c_026db653a269 c_0fb048a6412c c_20de77522603 c...,4
61514,t_fffbe1d5d43c,c_f9512ad7d2c0 c_ef287a34941d c_2b3c03aeb915 c...,c_46f852a49c08 c_6659207b25d5,2
61515,t_fffe14f1be1e,c_f15928e0f771 c_47a8fdc3f590 c_883c53c22054 c...,c_cece166bad6a,2


In [15]:
# Turn the per-topic id strings into one labelled row per (topic, candidate) pair.
# `topics_test` is left untouched (its id columns stay strings), so this cell can be
# re-run; overwriting the columns with lists made a second run fail on `.split`.
gt = (
    topics_test[['topic_id','content_ids','fold']]
    .assign(content_ids=topics_test.content_ids.str.split(' '))
    .explode('content_ids')
)
preds = (
    topics_test[['topic_id','predictions','fold']]
    .assign(predictions=topics_test.predictions.str.split(' '))
    .explode('predictions')
)
# Left join: a candidate that is not in the topic's ground truth gets a null `content_ids`.
candidates_df = preds.merge(gt[['topic_id','content_ids']],how='left',left_on=['topic_id','predictions'], right_on=['topic_id','content_ids'])
# target = 1.0 when the candidate matched a ground-truth id, else 0.0.
candidates_df['target'] = candidates_df.content_ids.notnull().astype(float)
candidates_df = candidates_df.drop('content_ids',axis=1)

In [17]:
candidates_df.topic_id.nunique()

61517

In [18]:
# After exploding, each `predictions` value is a single content id; name the column accordingly.
candidates_df = candidates_df.rename(columns={'predictions':'content_id'})

In [19]:
# Attach the text of both sides of every pair. These are default (inner) merges, so a
# pair whose topic or content id is missing from `topics` / `content` is dropped.
# Requires the prefixed column names produced by the rename cell.
candidates_df = candidates_df.merge(topics[['topic_id','topic_title']],on='topic_id')
candidates_df = candidates_df.merge(content[['content_id','content_title']],on='content_id')

In [24]:
candidates_df.shape

(3075850, 6)

In [30]:
candidates_df

Unnamed: 0,topic_id,content_id,fold,target,topic_title,content_title
0,t_00004da3a1b2,c_0feaaa5dc39d,1,0.0,Откриването на резисторите. Language_bg. Descr...,Успоредно свързани резистори. Language_bg. Des...
1,t_09ad67f245fc,c_0feaaa5dc39d,1,0.0,Електрични заряди и електрично поле. Language_...,Успоредно свързани резистори. Language_bg. Des...
2,t_261fb7043ad1,c_0feaaa5dc39d,3,0.0,Електричен ток и електрично напрежение. Langua...,Успоредно свързани резистори. Language_bg. Des...
3,t_2b1b6dfd096b,c_0feaaa5dc39d,0,0.0,Електрично поле. Language_bg. Description: no ...,Успоредно свързани резистори. Language_bg. Des...
4,t_3a1f5ae9f991,c_0feaaa5dc39d,0,0.0,Вериги с кондензатори. Language_bg. Descriptio...,Успоредно свързани резистори. Language_bg. Des...
...,...,...,...,...,...,...
3075845,t_fea53cc2a5bb,c_c4c2b22ec356,2,1.0,Suma y Resta de Fracciones. Language_es. Descr...,Calcula Expresiones con Números Mixtos. Langua...
3075846,t_ff0a0977e1fc,c_b49da9b18f9e,3,1.0,يتعرف التمثيل البياني للدوال المثلثية (جـا س ،...,يتعرف التمثيل البياني للدوال المثلثية (جـا س ،...
3075847,t_fff9e5407d13,c_20de77522603,4,1.0,NA_U06 - El periódico. Language_es. Descriptio...,Resumen: El periódico. Language_es. Descriptio...
3075848,t_fff9e5407d13,c_d64037a72376,4,1.0,NA_U06 - El periódico. Language_es. Descriptio...,Introducción: El periódico. Language_es. Descr...


In [25]:
# Loads a previously generated training set, apparently only for inspection:
# `a` is not used by any other cell visible in this notebook.
a = pd.read_csv('data/train_top50_fold0_cv_with_groundtruth_final_72044.csv')#.fold.unique()#.columns

In [32]:
# Rename the columns BY POSITION to the names used by the training-set format.
# Assumes the current order is topic_id, content_id, fold, target, topic_title,
# content_title - verify before re-running, a different order would mislabel columns.
candidates_df.columns = ['topics_ids','content_ids','fold','target','title1','title2']

In [33]:
# Persist the labelled candidate pairs, with the columns reordered to
# topics_ids, content_ids, title1, title2, target, fold.
candidates_df[['topics_ids', 'content_ids', 'title1', 'title2', 'target', 'fold']].to_parquet('data/candidates_50_train_79927.parquet')

ALL DATA
------------------------------------------------------------------
#### NO TUNE
----------------
TOP 50
/kaggle/input/sbert-models/paraphrase-multilingual-MiniLM-L12-v2
Our max positive score is 0.41649
Our f2_score is 0.1007

----------------
TOP 50
/kaggle/input/sentence-embedding-models/paraphrase-MiniLM-L12-v2
Our max positive score is 0.44421
Our f2_score is 0.1099

----------------
TOP 50
/kaggle/input/sentence-embedding-models/paraphrase-mpnet-base-v2
Our max positive score is 0.45422
Our f2_score is 0.1133

----------------
TOP 50
/kaggle/input/sbert-models/paraphrase-multilingual-mpnet-base-v2
Our max positive score is 0.42578
Our f2_score is 0.1033

---------------
TOP 50
/kaggle/input/paraphrasemultilingualmpnetbasev2/all-MiniLM-L6-v2
Our max positive score is 0.47988
Our f2_score is 0.1216

------------------------------------------------------------------
#### TUNED
----------------
TOP 50
'/kaggle/input/paraphrase-multilingual-mpnet-base-v2-tuned/paraphrase-multilingual-mpnet-base-v2-exp_fold0_epochs8'
Our max positive score is 0.68706
Our f2_score is 0.1902

----------------
TOP 50
'/kaggle/input/stage-1-tuned/paraphrase-multilingual-mpnet-base-v2-tuned' ##15 epoch
Our max positive score is 0.72044
Our f2_score is 0.201

---------------
TOP 50
'/kaggle/input/all-minilm-l6-v2-tuned/all-MiniLM-L6-v2_fold0_epochs20/all-MiniLM-L6-v2_fold0_epochs20'
Our max positive score is 0.62932
Our f2_score is 0.1713

---------------
TOP 50
'/kaggle/input/all-minilm-l6-v2-tuned/all-MiniLM-L6-v2_fold0_epochs8/all-MiniLM-L6-v2_fold0_epochs8'
Our max positive score is 0.59703
Our f2_score is 0.1607

In [6]:
# Build training set
# NOTE(review): this cell carries an earlier execution count than the cells above and
# depends on kernel state they do not produce:
#   * `topics` must still have its un-prefixed `id` column and a `predictions` column
#     (space-separated candidate ids) - confirm where these come from;
#   * `content` must be indexed by content id, because build_training_set looks titles
#     up with `content.loc[<id>, 'title']`;
#   * `CFG` is whatever configuration was bound last.
# The correlations path is an absolute Kaggle input path.
full_correlations = pd.read_csv('/kaggle/input/all-minilm-l6-v2-tuned/kfold_correlations.csv')
topics_full = topics.merge(full_correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
# For every fold except 0 the ground-truth ids are merged into the candidate list, so
# each true pair appears in the training rows; fold 0 keeps the raw retrieved candidates.
# The ids pass through a set, so their order within the string is arbitrary.
topics_full['predictions'] = topics_full.apply(lambda x: ' '.join(list(set(x.predictions.split(' ') + x.content_ids.split(' ')))) \
                                               if x.fold != 0 else x.predictions, axis = 1)
train = build_training_set(topics_full, content, CFG)
print(f'Our training set has {len(train)} rows')
# Save train set to disk to train on another notebook
train.to_csv(f'train_top{CFG.top_n}_fold0_cv_with_groundtruth_final_72044.csv', index = False)
train.head()

  0%|          | 0/61517 [00:00<?, ?it/s]

Our training set has 3119827 rows


Unnamed: 0,topics_ids,content_ids,title1,title2,target,fold
0,t_3d9ad9931021,c_8a2c8da77d0c,,Agenda,1,3
1,t_3d9ad9931021,c_3f51421a7c85,,ABCD,0,3
2,t_3d9ad9931021,c_db7818729577,,,0,3
3,t_3d9ad9931021,c_eb7d5e2e1744,,Simon,0,3
4,t_3d9ad9931021,c_60dd2fc8a271,,Ihab,0,3
