In [1]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
# IPython line magic: export TOKENIZERS_PARALLELISM=true for this kernel session.
# NOTE(review): the DataLoaders in this notebook use worker processes (num_workers=4 in the
# CFG classes) -- confirm that enabling tokenizer parallelism alongside them is intended.
%env TOKENIZERS_PARALLELISM=true
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [2]:
# ====================================================
# CFG
# ====================================================
class CFG1:
    """Settings bundle for the stsb-xlm-r pipeline (EXP005 artefacts)."""

    # --- artefact locations -------------------------------------------------
    # Directory holding the fine-tuned checkpoint(s) and the pickled model config.
    sup_path = "/kaggle/input/fb4-exp005-stsb-xlm-r/EXP005/"
    sup_config_path = '/kaggle/input/fb4-exp005-stsb-xlm-r/EXP005/config.pth'
    # Encoder used for retrieval embeddings and tokenizer source for the re-ranker
    # (both point at the same directory in this config).
    uns_model = "/kaggle/input/fb4-stsb-xlm-r-multilingual-ft001/001_stsb-xlm-r-multilingual/stsb-xlm-r-multilingual_fold0_epochs20"
    sup_model = "/kaggle/input/fb4-stsb-xlm-r-multilingual-ft001/001_stsb-xlm-r-multilingual/stsb-xlm-r-multilingual_fold0_epochs20"
    # Used to build the checkpoint file name in get_test_prediction.
    # NOTE(review): spelling ("multilingural") presumably matches the file on disk -- confirm before changing.
    sup_model_name = "stsb-xlm-r-multilingural"

    # --- data loading -------------------------------------------------------
    num_workers = 4
    batch_size = 32
    max_len = 512  # max_length passed to the supervised tokenizer

    # --- model --------------------------------------------------------------
    gradient_checkpointing = False
    target_size = 1  # output width of the linear head in custom_model
    target_cols = 'target'

    # --- run control --------------------------------------------------------
    seed = 42  # also part of the checkpoint file name
    n_fold = 5
    trn_fold = [0]  # folds whose checkpoints are loaded and averaged
    top_n = 100  # neighbours retrieved per topic by the kNN stage
    threshold = 0.036000000000000004  # score cut-off read by get_predictions
    
class CFG2:
    """Settings bundle for the multilingual-mpnet pipeline (EXP006 artefacts)."""

    # --- artefact locations -------------------------------------------------
    # Directory holding the fine-tuned checkpoint(s) and the pickled model config.
    sup_path = "/kaggle/input/fb4-exp006-mpnet-base-v2/EXP006/"
    sup_config_path = '/kaggle/input/fb4-exp006-mpnet-base-v2/EXP006/config.pth'
    # Retrieval encoder (stsb-xlm-r directory) and tokenizer source for the re-ranker
    # (mpnet directory) differ in this config.
    uns_model = "/kaggle/input/fb4-stsb-xlm-r-multilingual-ft001/001_stsb-xlm-r-multilingual/stsb-xlm-r-multilingual_fold0_epochs20"
    sup_model = "/kaggle/input/fb4-multilingual-mpnet-base-v2-ft003/003_paraphrase-multilingual-mpnet-base-v2/paraphrase-multilingual-mpnet-base-v2_fold0_epochs20"
    # Used to build the checkpoint file name in get_test_prediction.
    sup_model_name = "multilingual-mpnet-base-v2"

    # --- data loading -------------------------------------------------------
    num_workers = 4
    batch_size = 16
    max_len = 512  # max_length passed to the supervised tokenizer

    # --- model --------------------------------------------------------------
    gradient_checkpointing = False
    target_size = 1  # output width of the linear head in custom_model
    target_cols = 'target'

    # --- run control --------------------------------------------------------
    seed = 42  # also part of the checkpoint file name
    n_fold = 5
    trn_fold = [0]  # folds whose checkpoints are loaded and averaged
    top_n = 30  # neighbours retrieved per topic by the kNN stage
    threshold = 0.030000000000000002  # score cut-off applied to the sigmoid outputs

In [3]:
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of space-separated ids.

    Each row of ``y_true`` / ``y_pred`` is split on whitespace and compared as a
    set. The per-row score is ``tp / (tp + 0.2 * fp + 0.8 * fn)``.

    Rows where both the truth and the prediction are empty have a zero
    denominator; they score 0.0 instead of producing NaN (which previously
    turned the whole mean into NaN).

    Args:
        y_true: pandas Series of strings with the ground-truth ids.
        y_pred: pandas Series of strings with the predicted ids, aligned by position.

    Returns:
        The mean score rounded to 4 decimals.
    """
    true_sets = y_true.apply(lambda x: set(x.split()))
    pred_sets = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(t & p) for t, p in pairs], dtype=float)
    fp = np.array([len(p - t) for t, p in pairs], dtype=float)
    fn = np.array([len(t - p) for t, p in pairs], dtype=float)
    denom = tp + 0.2 * fp + 0.8 * fn
    # Divide by a placeholder of 1 where the denominator is zero, then mask those rows to 0.
    has_ids = denom > 0
    safe_denom = np.where(has_ids, denom, 1.0)
    f2 = np.where(has_ids, tp / safe_denom, 0.0)
    return round(f2.mean(), 4)

def get_predictions(x_val, val_predictions, correlations, cfg):
    """Collapse pair-level scores into one space-joined prediction string per topic.

    Side effect: writes a 0/1 ``predictions`` column into ``x_val``.

    Args:
        x_val: DataFrame with ``topics_ids`` and ``content_ids`` columns, one row per pair.
        val_predictions: array of scores aligned with the rows of ``x_val``.
        correlations: unused (the merge that consumed it is commented out).
        cfg: object exposing ``threshold``; pairs scoring strictly above it are kept.

    Returns:
        DataFrame with columns ``topic_id`` and ``predictions``; topics without any
        kept pair are appended with an empty string.
    """
    keep_flags = np.where(val_predictions > cfg.threshold, 1, 0)
    x_val['predictions'] = keep_flags

    kept_pairs = x_val[x_val['predictions'] == 1]
    matched = (
        kept_pairs.groupby(['topics_ids'])['content_ids']
        .unique()
        .reset_index()
    )
    matched['content_ids'] = matched['content_ids'].apply(' '.join)
    matched.columns = ['topic_id', 'predictions']

    # Topics that lost every candidate still need a row in the result.
    every_topic = pd.Series(x_val['topics_ids'].unique())
    missing_topics = every_topic[~every_topic.isin(matched['topic_id'])]
    unmatched = pd.DataFrame({'topic_id': missing_topics.values, 'predictions': ""})

    return pd.concat([matched, unmatched], axis=0, ignore_index=True)

def get_best_threshold(x_val, val_predictions, correlations):
    """Grid-search the score cut-off that maximises ``f2_score``.

    Cut-offs from 0.001 up to (but excluding) 0.1 in steps of 0.001 are tried.
    Side effect: the ``predictions`` column of ``x_val`` is overwritten on every step.

    Args:
        x_val: DataFrame with ``topics_ids`` and ``content_ids`` columns, one row per pair.
        val_predictions: array of scores aligned with the rows of ``x_val``.
        correlations: DataFrame with ``topic_id`` and ground-truth ``content_ids`` columns.

    Returns:
        ``(best_score, best_threshold)``; the threshold stays ``None`` when no
        cut-off scores above 0.
    """
    best_score, best_threshold = 0, None
    for thres in np.arange(0.001, 0.1, 0.001):
        x_val['predictions'] = np.where(val_predictions > thres, 1, 0)

        kept_pairs = x_val[x_val['predictions'] == 1]
        matched = (
            kept_pairs.groupby(['topics_ids'])['content_ids']
            .unique()
            .reset_index()
        )
        matched['content_ids'] = matched['content_ids'].apply(' '.join)
        matched.columns = ['topic_id', 'predictions']

        # Topics with no pair above the cut-off get an empty prediction string.
        every_topic = pd.Series(x_val['topics_ids'].unique())
        missing_topics = every_topic[~every_topic.isin(matched['topic_id'])]
        unmatched = pd.DataFrame({'topic_id': missing_topics.values, 'predictions': ""})

        scored = pd.concat([matched, unmatched], axis=0, ignore_index=True)
        scored = scored.merge(correlations, how='left', on='topic_id')
        score = f2_score(scored['content_ids'], scored['predictions'])
        if score > best_score:
            best_score, best_threshold = score, thres
    return best_score, best_threshold

In [4]:
#internet on
#!pip install pickle5

#import pickle5
#correlations = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv')

#oof1_path = "/kaggle/input/fb4-exp006-mpnet-base-v2/EXP006/oof_df.pkl"
#oof2_path = "/kaggle/input/fb4-exp005-stsb-xlm-r/EXP005/oof_df.pkl"

#def get_oof(path):
#    with open(path, "rb") as f:
#         oof = pickle5.load(f)
#    return oof
#oof1 = get_oof(oof1_path)
#oof2 = get_oof(oof2_path)

In [5]:
#oof_df = pd.concat([oof1,oof2])
#best_score, best_threshold = get_best_threshold(oof_df, oof_df.pred.values, correlations)
#print(f'Our CV score is {best_score} using a threshold of {best_threshold}')

#Our CV score is 0.6206 using a threshold of 0.061
# Result recorded from the (now commented-out) threshold search above.
# NOTE(review): nothing in the visible cells reads this variable -- the submission cell
# uses CFG2.threshold instead; confirm whether this value is still needed.
threshold = 0.061

In [6]:
# ====================================================
# tokenizer
# ====================================================
# Attach tokenizers to the config classes as class attributes so the dataset helpers can
# reach them through `cfg.sup_tokenizer` / `cfg.uns_tokenizer`.
# `sup_*` is loaded from the config's `sup_model` path, `uns_*` from its `uns_model` path.
CFG1.sup_tokenizer = AutoTokenizer.from_pretrained(CFG1.sup_model)
CFG1.uns_tokenizer = AutoTokenizer.from_pretrained(CFG1.uns_model)
CFG2.sup_tokenizer = AutoTokenizer.from_pretrained(CFG2.sup_model)
CFG2.uns_tokenizer = AutoTokenizer.from_pretrained(CFG2.uns_model)

In [7]:
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed every random number generator used in the notebook.

    Seeds Python's ``random``, NumPy and torch (CPU and CUDA), exports
    ``PYTHONHASHSEED`` and configures cuDNN for repeatable kernel selection.

    Args:
        cfg: object exposing an integer ``seed`` attribute.
    """
    seed = cfg.seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different algorithms from run to run; turn it off
    # so that the `deterministic` flag above is not undermined.
    torch.backends.cudnn.benchmark = False
    
# Seed once at start-up; only the `seed` attribute of the config is read.
seed_everything(CFG1)
    
    
def get_logger(filename='infrence'):
    """Return this module's logger, writing bare messages to the console and to a file.

    Safe to call repeatedly (e.g. when the cell is re-run): handlers left over from a
    previous call are removed and closed first, so each message is emitted once per
    destination instead of once per call.

    Args:
        filename: log file path without extension; ``.log`` is appended.
            (The default keeps its historical spelling so existing log paths are unchanged.)

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object every time; drop stale handlers before adding new ones.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Module-wide logger; with the default argument the file handler writes to "infrence.log".
LOGGER = get_logger()

In [8]:
# define some helper functions and classes to aid with data traversal


class Topic:
    """Lightweight handle on one row of the module-level ``topics_df`` table.

    Only the id is stored; every other attribute (``title``, ``has_content``, ...) is
    looked up in ``topics_df`` on access, so ``topics_df`` must be indexed by topic id.
    The ``content`` property additionally reads a module-level ``correlations_df``.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the ``parent`` column is missing (NaN)."""
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        else:
            return Topic(parent_id)

    @property
    def ancestors(self):
        """Ancestors ordered from the direct parent up to the root."""
        ancestors = []
        parent = self.parent
        while parent is not None:
            ancestors.append(parent)
            parent = parent.parent
        return ancestors

    @property
    def siblings(self):
        """Other children of this topic's parent (empty for a root topic)."""
        if not self.parent:
            return []
        else:
            return [topic for topic in self.parent.children if topic != self]

    @property
    def content(self):
        """``ContentItem`` objects listed for this topic in ``correlations_df``."""
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        else:
            return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join the titles from the root down to this topic with ``separator``.

        Args:
            separator: string placed between consecutive titles.
            include_self: include this topic's own title as the last element.
            include_root: include the top-most ancestor as the first element.
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Render this topic, its descendants and their content as a nested markdown list."""
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable; hash on the same key as __eq__.
        return hash(self.id)

    def __getattr__(self, name):
        """Fall back to the ``topics_df`` column called ``name`` for unknown attributes.

        Raises:
            AttributeError: if ``name`` is not a column (so ``hasattr`` works), or for
                ``id`` / dunder names, which must never be resolved through the table.
            KeyError: if this topic's id is not present in ``topics_df``.
        """
        # `id` only lands here when it was never set; resolving it via self.id would
        # recurse forever. Dunder probes (pickle, copy, ...) are not table columns either.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        row = topics_df.loc[self.id]
        try:
            return row[name]
        except KeyError:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Lightweight handle on one row of the module-level ``content_df`` table.

    Only the id is stored; other attributes (``title``, ``kind``, ...) are looked up in
    ``content_df`` on access, so ``content_df`` must be indexed by content id.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` string in ``correlations_df`` contains this id.

        NOTE(review): the match is a substring/regex match on the raw string -- confirm
        that one id can never occur inside another.
        """
        return [Topic(topic_id) for topic_id in topics_df.loc[correlations_df[correlations_df.content_ids.str.contains(self.id)].index].index]

    def __getattr__(self, name):
        """Fall back to the ``content_df`` column called ``name`` for unknown attributes.

        Raises:
            AttributeError: if ``name`` is not a column (so ``hasattr`` works), or for
                ``id`` / dunder names, which must never be resolved through the table.
            KeyError: if this item's id is not present in ``content_df``.
        """
        # `id` only lands here when it was never set; resolving it via self.id would
        # recurse forever. Dunder probes (pickle, copy, ...) are not table columns either.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        row = content_df.loc[self.id]
        try:
            return row[name]
        except KeyError:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable; hash on the same key as __eq__.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        """Return one breadcrumb string per topic this item belongs to.

        Each string is the topic's breadcrumb followed by this item's title; when the
        topic breadcrumb is empty the title stands alone.
        """
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs
    
def get_content(id):
    """Return the breadcrumb string (root title down to the topic's own title) for a topic id."""
    return Topic(id).get_breadcrumbs()

# Module-level lookup tables indexed by id (first CSV column). The Topic and ContentItem
# helpers above read them as globals, so these names must stay bound to these frames.
# Missing topic titles/descriptions and every missing content field become empty strings.
topics_df = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/topics.csv', index_col=0).fillna({"title": "", "description": ""})
content_df = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/content.csv', index_col=0).fillna("")

In [9]:
# =========================================================================================
# Data Loading
# =========================================================================================
def get_datas():
    """Load the competition CSVs and build the text fields used for inference.

    Only topics listed in ``sample_submission.csv`` are kept. Missing titles,
    descriptions and texts are replaced by placeholder sentences, then:

    * content ``title`` becomes ``title<|=t_sep=|>description<|=t_sep=|>text``
    * topic ``title`` becomes ``title<|=t_sep=|>description`` immediately followed by
      the topic's breadcrumb string.
      NOTE(review): there is no separator between description and breadcrumb --
      presumably this mirrors the training-time format; confirm before changing.

    Both frames are sorted by the length of that text and re-indexed from 0.

    Returns:
        ``(topic_df, content)`` -- topics with columns ``id``, ``title``, ``context``
        and content with columns ``id``, ``title``.
    """
    topics = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/topics.csv')
    content = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/content.csv')
    sample_submission = pd.read_csv('/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv')
    # Merge topics with sample submission to only infer test topics
    topic_df = topics.merge(sample_submission, how = 'inner', left_on = 'id', right_on = 'topic_id')

    # Fill missing text fields. Plain assignment is used instead of
    # `df[col].fillna(..., inplace=True)`: that chained form is deprecated and has no
    # effect once pandas Copy-on-Write is active.
    topic_df['title'] = topic_df['title'].fillna("Title does not exist")
    content['title'] = content['title'].fillna("Title does not exist")

    topic_df['description'] = topic_df['description'].fillna("Description does not exist")
    content['description'] = content['description'].fillna("Description does not exist")

    content['text'] = content['text'].fillna("Text does not exist")

    # Breadcrumb of every test topic, resolved through the global topics table.
    topic_df["context"] = topic_df["id"].apply(get_content)

    content["title"] = content["title"] + "<|=t_sep=|>" + content["description"] + "<|=t_sep=|>" + content["text"]
    topic_df["title"] = topic_df["title"] + "<|=t_sep=|>" + topic_df["description"] + topic_df["context"]

    # Sort by title length to make inference faster
    topic_df['length'] = topic_df['title'].apply(lambda x: len(x))
    content['length'] = content['title'].apply(lambda x: len(x))
    topic_df = topic_df.sort_values('length')
    content = content.sort_values('length')
    # Drop cols
    topic_df = topic_df.drop(['description', 'channel', 'category', 'level', 'language', 'parent', 'has_content', 'length', 'topic_id', 'content_ids'], axis = 1)
    content = content.drop(['description', 'kind', 'language', 'text', 'copyright_holder', 'license', 'length'], axis = 1)
    # Reset index
    topic_df = topic_df.reset_index(drop = True)
    content = content.reset_index(drop = True)
    return topic_df, content

In [10]:
# =========================================================================================
# Unsupervised dataset
# =========================================================================================
def prepare_uns_input(text, cfg):
    """Tokenize ``text`` for the retrieval encoder.

    The text is truncated to 128 tokens and left unpadded; every field returned by
    ``cfg.uns_tokenizer`` is converted in place to a ``torch.long`` tensor.

    Args:
        text: raw string to encode.
        cfg: config object exposing ``uns_tokenizer``.

    Returns:
        The tokenizer's output mapping, with tensor values.
    """
    encode_options = dict(
        add_special_tokens=True,
        truncation=True,
        max_length=128,
        return_tensors=None,
    )
    encoded = cfg.uns_tokenizer.encode_plus(text, **encode_options)
    # Replace the values of the returned mapping itself so its type is preserved.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded

class uns_dataset(Dataset):
    """Dataset over the ``title`` column of a frame, tokenized lazily for the retrieval encoder."""

    def __init__(self, df, cfg):
        # Keep the raw strings; tokenization happens per item in __getitem__.
        self.texts = df['title'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_uns_input(self.texts[item], self.cfg)

In [11]:
# =========================================================================================
# Supervised dataset
# =========================================================================================
def prepare_sup_input(text, cfg):
    """Tokenize ``text`` for the supervised re-ranker.

    The text is truncated and padded to exactly ``cfg.max_len`` tokens; every field
    returned by ``cfg.sup_tokenizer`` is converted in place to a ``torch.long`` tensor.

    ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True`` flag
    and produces the same output.

    NOTE(review): because every sample is already padded to ``cfg.max_len``, the
    ``padding='longest'`` collator used downstream has nothing left to trim.

    Args:
        text: raw string to encode.
        cfg: config object exposing ``sup_tokenizer`` and ``max_len``.

    Returns:
        The tokenizer's output mapping, with tensor values.
    """
    inputs = cfg.sup_tokenizer.encode_plus(
        text, 
        return_tensors = None, 
        add_special_tokens = True, 
        max_length = cfg.max_len,
        padding = 'max_length',
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

class sup_dataset(Dataset):
    """Dataset over the ``text`` column of a frame, tokenized lazily for the re-ranker."""

    def __init__(self, df, cfg):
        # Keep the raw strings; tokenization happens per item in __getitem__.
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_sup_input(self.texts[item], self.cfg)

In [12]:
# ====================================================
# Model
# ====================================================

class MeanPooling(nn.Module):
    """Average token vectors along dim 1, counting only positions where the mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight each token vector.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = (last_hidden_state * mask).sum(1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        counts = mask.sum(1).clamp(min=1e-9)
        return summed / counts
    
class MaxPooling(nn.Module):
    """Element-wise maximum over dim 1, with masked-out positions forced to -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # masked_fill returns a new tensor, so the caller's hidden states stay untouched.
        filled = last_hidden_state.masked_fill(mask == 0, -1e4)
        return filled.max(dim=1).values
    
class uns_model(nn.Module):
    """Encoder loaded from ``cfg.uns_model`` that returns masked mean-pooled embeddings."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.uns_model)
        self.model = AutoModel.from_pretrained(cfg.uns_model, config=self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Run the encoder on a batch mapping and pool its last hidden state with the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        # The embedding is the model output; there is no head on top.
        return self.feature(inputs)
    

class custom_model(nn.Module):
    """Transformer backbone, masked mean pooling and a linear head of width ``cfg.target_size``.

    Args:
        cfg: config object. ``gradient_checkpointing`` and ``target_size`` are always
            read. The backbone path is read from ``cfg.model`` when present, otherwise
            from ``cfg.sup_model`` (the name used by the config classes in this
            notebook); it is needed only when ``config_path`` is None or
            ``pretrained`` is True.
        config_path: path of a config object saved with ``torch.save``. When None, the
            config is loaded from the backbone path and all dropout settings are zeroed.
        pretrained: load backbone weights from the backbone path instead of building
            an uninitialised backbone from the config.

    Raises:
        AttributeError: if a backbone path is required but ``cfg`` defines neither
            ``model`` nor ``sup_model``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Previously only `cfg.model` was consulted, which the notebook's configs do not define.
        backbone = getattr(cfg, 'model', None) or getattr(cfg, 'sup_model', None)
        if backbone is None and (config_path is None or pretrained):
            raise AttributeError("cfg must define `model` or `sup_model` to locate the backbone")
        if config_path is None:
            self.config = AutoConfig.from_pretrained(backbone, output_hidden_states = True)
            self.config.hidden_dropout = 0.0
            self.config.hidden_dropout_prob = 0.0
            self.config.attention_dropout = 0.0
            self.config.attention_probs_dropout_prob = 0.0
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects -- only load config
            # files that come from a trusted source.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(backbone, config = self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module``: normal(0, ``config.initializer_range``) weights for Linear and
        Embedding (zero bias / zero padding row), unit weight and zero bias for LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
    def feature(self, inputs):
        """Run the backbone on a batch mapping and mean-pool its last hidden state with the attention mask."""
        outputs = self.model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        feature = self.pool(last_hidden_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return the raw (pre-sigmoid) head output for a batch mapping."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [13]:
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Args:
        loader: iterable of batch mappings whose values are tensors.
        model: module called as ``model(batch)``; switched to eval mode here.
        device: device each batch tensor is moved to (the model is not moved).

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    chunks = []
    for inputs in tqdm(loader):
        # Move the tensors in place, keeping the batch container itself.
        for field in list(inputs.keys()):
            inputs[field] = inputs[field].to(device)
        with torch.no_grad():
            batch_out = model(inputs)
        chunks.append(batch_out.cpu().numpy())
    return np.concatenate(chunks)

In [14]:
# =========================================================================================
# Mean fraction of each row's true ids that also appear among its predicted ids
# =========================================================================================
def get_pos_socre(y_true, y_pred):
    """Mean share of each row's true ids that are present in its predicted ids.

    Both arguments are Series of space-separated id strings compared as sets.
    A row whose true string is empty raises ``ZeroDivisionError``.

    Returns:
        The mean share rounded to 5 decimals.
    """
    truth_sets = y_true.apply(lambda ids: set(ids.split()))
    guess_sets = y_pred.apply(lambda ids: set(ids.split()))
    hit_shares = []
    for truth, guess in zip(truth_sets, guess_sets):
        hit_shares.append(len(truth & guess) / len(truth))
    return round(np.mean(np.array(hit_shares)), 5)

In [15]:
# =========================================================================================
# Build our inference set
# =========================================================================================
def build_inference_set(topics, content, cfg):
    """Expand each topic's candidate list into one (topic, content) row per candidate.

    Args:
        topics: DataFrame with ``id``, ``title`` and ``predictions`` columns, where
            ``predictions`` holds candidate content ids separated by single spaces.
        content: DataFrame indexed by content id with a ``title`` column.
        cfg: unused.

    Returns:
        DataFrame with columns ``topics_ids``, ``content_ids``, ``title1`` (topic text)
        and ``title2`` (content text).
    """
    topics_ids, content_ids = [], []
    title1, title2 = [], []
    topic_rows = zip(topics['id'], topics['title'], topics['predictions'])
    for topic_id, topic_title, candidate_str in tqdm(topic_rows, total=len(topics)):
        candidates = candidate_str.split(' ')
        # One output row per candidate; the topic fields repeat for each of them.
        topics_ids.extend([topic_id] * len(candidates))
        content_ids.extend(candidates)
        title1.extend([topic_title] * len(candidates))
        title2.extend(content.loc[candidate, 'title'] for candidate in candidates)
    test = pd.DataFrame(
        {
            'topics_ids': topics_ids,
            'content_ids': content_ids,
            'title1': title1,
            'title2': title2,
        }
    )
    # Release the intermediate lists before returning.
    del topics_ids, content_ids, title1, title2
    gc.collect()
    return test

In [16]:
# =========================================================================================
# Inference function loop
# =========================================================================================
def inference_fn(test_loader, model, device):
    """Score every batch of ``test_loader`` and return the sigmoid outputs as a flat array.

    Args:
        test_loader: sized iterable of batch mappings whose values are tensors.
        model: module called as ``model(batch)``; moved to ``device`` and set to eval mode.
        device: device for the model and the batch tensors.

    Returns:
        1-D NumPy array with one sigmoid score per output element, in loader order.
    """
    model.to(device)
    model.eval()
    batch_scores = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        # Move the tensors in place, keeping the batch container itself.
        for field in list(inputs.keys()):
            inputs[field] = inputs[field].to(device)
        with torch.no_grad():
            logits = model(inputs)
        scores = torch.sigmoid(logits).squeeze().cpu().numpy()
        # reshape(-1) keeps a single-sample batch (0-d after squeeze) concatenable.
        batch_scores.append(scores.reshape(-1))
    return np.concatenate(batch_scores)

In [17]:
# =========================================================================================
# Get neighbors & Prediction
# =========================================================================================

def get_knn_prediction(topics,content, CFG):
    """Retrieve candidate content for every topic and build the re-ranker input frame.

    Steps: embed the ``title`` column of both frames with ``uns_model``, fit a cosine
    nearest-neighbour index on the content embeddings, take up to ``CFG.top_n``
    neighbours per topic, then expand them into (topic, content) pairs.

    Side effect: writes a ``predictions`` column (space-joined candidate ids) into
    ``topics``. ``content`` is left untouched, so the function can be called again
    with the same frames (e.g. once per config).

    Args:
        topics: DataFrame with ``id`` and ``title`` columns.
        content: DataFrame with ``id`` and ``title`` columns.
        CFG: config exposing ``uns_tokenizer``, ``uns_model``, ``batch_size``,
            ``num_workers`` and ``top_n``.

    Returns:
        DataFrame with columns ``topics_ids``, ``content_ids`` and ``text``
        (``topic text + '[SEP]' + content text``), sorted by text length.
    """
    collate = DataCollatorWithPadding(tokenizer = CFG.uns_tokenizer, padding = 'longest')

    def _make_loader(frame):
        # Ordered, un-shuffled loader so embeddings line up with the frame's rows.
        return DataLoader(
            uns_dataset(frame, CFG),
            batch_size = CFG.batch_size,
            shuffle = False,
            collate_fn = collate,
            num_workers = CFG.num_workers,
            pin_memory = True,
            drop_last = False
        )

    topics_loader = _make_loader(topics)
    content_loader = _make_loader(content)

    # Create unsupervised model to extract embeddings
    model = uns_model(CFG)
    model.to(device)
    topics_preds = get_embeddings(topics_loader, model, device)
    content_preds = get_embeddings(content_loader, model, device)
    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    # Release memory (the encoder is no longer needed once the embeddings exist)
    del topics_loader, content_loader, topics_preds, content_preds, model
    gc.collect()
    torch.cuda.empty_cache()

    # KNN model
    print(' ')
    print('Training KNN model...')
    # Never ask for more neighbours than there are content rows.
    n_neighbors = min(CFG.top_n, len(content))
    neighbors_model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)
    # Neighbour indices are row positions in the embedding matrix, so map them to ids
    # positionally rather than through the frame's index labels.
    neighbor_positions = indices.get()
    content_id_values = content['id'].to_numpy()
    topics['predictions'] = [' '.join(content_id_values[row]) for row in neighbor_positions]
    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, indices, neighbor_positions
    gc.collect()
    torch.cuda.empty_cache()

    # Id-indexed view for title look-ups; a copy, so the caller's frame keeps its `id` column.
    content_by_id = content.set_index('id')

    # Build inference set
    test = build_inference_set(topics, content_by_id, CFG)

    test['title1'] = test['title1'].fillna("Title does not exist")
    test['title2'] = test['title2'].fillna("Title does not exist")
    # Create feature column
    test['text'] = test['title1'] + '[SEP]' + test['title2']
    # Drop titles
    test = test.drop(['title1', 'title2'], axis = 1)
    # Sort so inference is faster
    test['length'] = test['text'].apply(lambda x: len(x))
    test = test.sort_values('length')
    test = test.drop(['length'], axis = 1)
    test = test.reset_index(drop = True)
    gc.collect()
    torch.cuda.empty_cache()

    return test


def get_test_prediction(test, CFG):
    """Score every (topic, content) pair with the fine-tuned re-ranker.

    For each fold in ``CFG.trn_fold`` the checkpoint
    ``{sup_path}{sup_model_name}_fold{fold}_{seed}.pth`` is loaded on CPU, run over
    the ``text`` column, and the per-fold scores are averaged.

    Args:
        test: DataFrame with a ``text`` column.
        CFG: config exposing ``sup_tokenizer``, ``sup_path``, ``sup_config_path``,
            ``sup_model_name``, ``seed``, ``trn_fold``, ``batch_size``, ``num_workers``.

    Returns:
        The same ``test`` frame with the averaged scores added as column ``pred``.
    """
    loader = DataLoader(
        sup_dataset(test, CFG),
        batch_size=CFG.batch_size,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer=CFG.sup_tokenizer, padding='longest'),
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    fold_scores = []
    for fold in CFG.trn_fold:
        checkpoint_file = f"{CFG.sup_model_name.replace('/', '-')}_fold{fold}_{CFG.seed}.pth"
        model = custom_model(CFG, config_path=CFG.sup_config_path, pretrained=False)
        state = torch.load(CFG.sup_path + checkpoint_file, map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        fold_scores.append(inference_fn(loader, model, device))
        # Free this fold's model before the next one is built.
        del model, state
        gc.collect()
        torch.cuda.empty_cache()

    test["pred"] = np.mean(fold_scores, axis=0)
    return test

In [18]:
# Keep these names distinct from the module-level `topics_df` / `content_df` lookup
# tables: the Topic / ContentItem helpers (and therefore get_datas, via get_content) read
# those globals, and rebinding them to the processed frames breaks any later call.
test_topics, test_content = get_datas()
test_knn = get_knn_prediction(test_topics, test_content, CFG2)
test = get_test_prediction(test_knn, CFG2)
#topics_df1, content_df1 = get_datas()
#test_knn1 = get_knn_prediction(topics_df,content_df, CFG1)
#te_df1 = get_test_prediction(test_knn, CFG1)
#test = pd.concat([te_df1,te_df2])

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9628 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multipro

 
Training KNN model...


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
# Use threshold
# Flag the pairs whose score is strictly above the cut-off.
test['predictions'] = np.where(test["pred"] > CFG2.threshold, 1, 0)
# One row per topic with its kept content ids joined by spaces.
test1 = (
    test[test['predictions'] == 1]
    .groupby(['topics_ids'])['content_ids']
    .unique()
    .reset_index()
)
test1['content_ids'] = test1['content_ids'].apply(' '.join)
test1.columns = ['topic_id', 'content_ids']
# Topics that kept no pair still get a row, with an empty prediction string.
scored_topic_ids = pd.Series(test['topics_ids'].unique())
test0 = pd.DataFrame({
    'topic_id': scored_topic_ids[~scored_topic_ids.isin(test1['topic_id'])].values,
    'content_ids': "",
})
test_r = pd.concat([test1, test0], axis=0, ignore_index=True)
test_r.to_csv('submission.csv', index=False)
test_r.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_a962934f0afd c_1108dd0c7a5d c_376c5a8eb028 c...
1,t_00068291e9a4,c_f6772f4ae97f c_ebb7fdf10a7e c_639ea2ef9c95 c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_5c7376307896 c_b972646631cb c_d7a0d7eaf799 c...
4,t_4054df11a74e,
