In [36]:
!pip install cupy

[0m

In [37]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from transformers import AutoTokenizer, AutoModel
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors

from tqdm.auto import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
%env TOKENIZERS_PARALLELISM=true

import os
for dirname, _, filenames in os.walk('/kaggle/input/learning-equality-curriculum-recommendations'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

env: TOKENIZERS_PARALLELISM=true
/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv
/kaggle/input/learning-equality-curriculum-recommendations/topics.csv
/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv
/kaggle/input/learning-equality-curriculum-recommendations/content.csv


In [38]:
# imports
import transformers
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
from fuzzywuzzy import fuzz, process
import math

In [39]:
class CFG:
    INPUT = '/kaggle/input/learning-equality-curriculum-recommendations'
    MODEL = Path('/kaggle/input/sentencetransformers/model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/')
    TOKENIZER = Path('/kaggle/input/sentencetransformers/tokenizer/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/')
    MAX_LEN = 384
    SELECT_TOP_N = 10

In [40]:
content_df = pd.read_csv(f'{CFG.INPUT}/content.csv')
correlations_df = pd.read_csv(f'{CFG.INPUT}/correlations.csv')
topics_df = pd.read_csv(f'{CFG.INPUT}/topics.csv')
sub_df = pd.read_csv(f'{CFG.INPUT}/sample_submission.csv')

In [41]:
model = transformers.AutoModel.from_pretrained(CFG.MODEL)
model.eval()
model.to(device)


tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.TOKENIZER, use_fast=True)

In [42]:
# content_df = content_df.sample(n = 1000)

In [43]:
vecs = []
for _, row in tqdm(content_df.iterrows(), total=len(content_df)):
    title = row['title']
    if type(title) is float:
        title = row['description']
    if type(title) is float:
        title = row['text']
    
    tok = tokenizer(title)
    for k, v in tok.items():
        tok[k] = torch.tensor(v[:CFG.MAX_LEN]).to(device).unsqueeze(0)
    with torch.no_grad():
        output = model(**tok)
    vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
    vecs.append(vec)
    
vecs1 = torch.stack(vecs)

  0%|          | 0/154047 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (13065 > 512). Running this sequence through the model will result in indexing errors


In [44]:
sub_topic_ids = sub_df['topic_id'].tolist()
_topics_df = topics_df.query(f'id in {sub_topic_ids}')

In [45]:
vecs = []
for _, row in tqdm(_topics_df.iterrows(), total=len(_topics_df)):
    title = row['title']
    if type(title) is float:
        title = row['description']
    if type(title) is float:
        title = "This content contains no text."
    
    tok = tokenizer(title)
    for k, v in tok.items():
        tok[k] = torch.tensor(v[:CFG.MAX_LEN]).to(device).unsqueeze(0)
    with torch.no_grad():
        output = model(**tok)
    vec = output.last_hidden_state.squeeze(0).mean(0).cpu()
    vecs.append(vec)
    
vecs2 = torch.stack(vecs)

  0%|          | 0/5 [00:00<?, ?it/s]

In [46]:
vecs1 = cp.asarray(vecs1)
vecs2 = cp.asarray(vecs2)

In [47]:
all_content_ids = content_df.id.to_numpy()
all_content_titles = content_df.title.to_numpy()
all_content_language = content_df.language.to_numpy()

all_test_ids = list(_topics_df.id)
all_test_title = list(_topics_df.title.to_numpy())
all_test_language = list(_topics_df.language.to_numpy())

In [58]:
model_knn = NearestNeighbors(metric='cosine', n_neighbors=1000)
model_knn.fit(vecs1)

predicts = []
for i, test_id in tqdm(enumerate(all_test_ids), total=len(all_test_ids)):
    v2 = vecs2[i].reshape(1,-1)
    topic_language = all_test_language[i]
    distance, indices = model_knn.kneighbors(v2, n_neighbors = 1000)
    distance = distance[0]
    indices = indices[0]
#     print(distance)
#     print(indices)
    topNPre = []
    for arrIdx in range(10):
        x = indices[arrIdx].get()
        if all_content_language[x] ==  topic_language and distance[arrIdx].get() < 0.25:
            topNPre.append(x)
            
    if len(topNPre) == 0:
        p = " ".join([all_content_ids[s] for s in indices[:1].get()])
    else:
        p = " ".join([all_content_ids[s] for s in topNPre[:10]])
    predicts.append(p)      

  0%|          | 0/5 [00:00<?, ?it/s]

[0.25220382 0.25984955 0.2618181  0.28578812 0.28662407 0.29010588
 0.292839   0.2941349  0.29450464 0.29585463 0.29640096 0.29825807
 0.29879862 0.300429   0.3004676  0.30297554 0.3044557  0.3046921
 0.30575782 0.30618334 0.30728912 0.30776834 0.3099181  0.31012803
 0.31258357 0.31308913 0.31550127 0.31553066 0.31596076 0.3162136
 0.31646425 0.31694287 0.31719506 0.31729233 0.31729996 0.31746686
 0.31820983 0.3189447  0.31897306 0.3198529  0.32010484 0.32019532
 0.32022572 0.3216411  0.32169217 0.32169217 0.32172012 0.32297105
 0.32348573 0.32364106 0.32383204 0.32391912 0.32484782 0.32508463
 0.32519966 0.32549107 0.32616466 0.32623667 0.32625937 0.32669038
 0.32675296 0.32692242 0.32755935 0.32755935 0.32766587 0.32820946
 0.3283335  0.32845575 0.32853472 0.3287313  0.32894236 0.32940012
 0.32956994 0.32956994 0.3298086  0.33022022 0.33063012 0.33063012
 0.3313632  0.3319915  0.33201838 0.33307153 0.33307153 0.3337329
 0.33393192 0.33393544 0.33393973 0.33405292 0.33414608 0.3341947

In [59]:
sub_df['content_ids'] = predicts
sub_df.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_df1a5b94e8e6
1,t_00068291e9a4,c_e88be716634d c_89ce9367be10
2,t_00069b63a70a,c_89dee4e1d740 c_430ddb6fbfa4 c_07e16f3aabbf
3,t_0006d41a73a8,c_fa21b549f383
4,t_4054df11a74e,c_3695c5dc1df6 c_80dfc28d7bf2 c_003a969a8dfc c...
