In [1]:
pip install datasets transformers[sentencepiece]

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np 
import pandas as pd 
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from tqdm.auto import tqdm
from datasets import Dataset
import tensorflow as tf
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

In [3]:
content_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/content.csv")
correlations_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")
topics_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/topics.csv")
submission_df = pd.read_csv("/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv")

In [4]:
model_path_or_name = '../input/transformers/bert-base-uncased'

# instantiate model & tokenizer
model     = AutoModel.from_pretrained(model_path_or_name)
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name)

# make sure to import the hugging face models using the add data button in your notebook. If you don't and you put your
# notebook into offline mode this cell will not be able to load a model. 

Some weights of the model checkpoint at ../input/transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
tokenizer.is_fast

True

In [6]:
content_df_v2 = content_df.drop(columns = ["kind", 'id', 'language', 'copyright_holder', 'license'])

In [7]:
content_df_v2.head()

Unnamed: 0,title,description,text
0,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",
1,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,
2,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,
3,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,\nNado de aproximação\nSaber nadar nas ondas ...
4,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,Estándares Comunes del Estado de Nueva York\n\...


In [8]:
for _, row in tqdm(content_df_v2.iterrows(), total=len(content_df_v2)):
# goes through each row in the df and sets the varible 'title' = the value for that row. If value is missing 
# it assigns the value to be = the description or text column values.  
    title = row['title']
    if type(title) is float:
        title = row['description']
        row['title'] = title
    if type(title) is float:
        title = row['text']
        row['title'] = title

  0%|          | 0/154047 [00:00<?, ?it/s]

In [9]:
content_df_v2 = content_df_v2.drop(columns = ['description', 'text'])
content_df_v2.head()

Unnamed: 0,title
0,"Sumar números de varios dígitos: 48,029+233,930"
1,Trovare i fattori di un numero
2,Sumar curvas de demanda
3,Nado de aproximação
4,geometry-m3-topic-a-overview.pdf


In [10]:
dataset = Dataset.from_pandas(content_df_v2)
# we need to convert the df to a dataset so we can feed the data into the tokenizer

In [11]:
dataset

Dataset({
    features: ['title'],
    num_rows: 154047
})

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["title"], max_length = 512,
        pad_to_max_length = True,
        truncation = True)

# Helper function to call the tokenizer on our dataset

In [13]:
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, num_proc=4, remove_columns=["title"]
)

# call the tokenizer on the content data

     

#0:   0%|          | 0/39 [00:00<?, ?ba/s]



 

#1:   0%|          | 0/39 [00:00<?, ?ba/s]



 

#2:   0%|          | 0/39 [00:00<?, ?ba/s]



 

#3:   0%|          | 0/39 [00:00<?, ?ba/s]



In [14]:
vec = []
for k in tokenized_datasets['input_ids']:
    art = k 
    vec.append(art)
    
# stores the tokens in a list

In [15]:
vec = tf.stack(vec)

# reshape the list into a tf tensor so we can process it later. 

2023-01-16 23:19:30.111560: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [16]:
vec

<tf.Tensor: shape=(154047, 512), dtype=int32, numpy=
array([[  101,  7680,  2906, ...,     0,     0,     0],
       [  101, 19817,  7103, ...,     0,     0,     0],
       [  101,  7680,  2906, ...,     0,     0,     0],
       ...,
       [  101, 17174,  8566, ...,     0,     0,     0],
       [  101,  7842,  1997, ...,     0,     0,     0],
       [  101,  1996, 14855, ...,     0,     0,     0]], dtype=int32)>

In [17]:
sub_topic_ids = submission_df['topic_id'].tolist()
_topics_df = topics_df.query(f'id in {sub_topic_ids}')
# code that allows us to only select topics in the submissions_df, we will only be comparing the topics in the 
# submissions_df to the content_df. 

In [18]:
_topics_df

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
19136,t_4054df11a74e,Flow Charts: Logical Thinking?,This lesson is focused on flow charts. It supp...,6e3ba4,source,2,en,t_acbbd893e6af,True


In [19]:
topics_df_v2 = _topics_df.drop(columns = ['id', 'channel', 'category', 'level', 'language', 'parent', 'has_content'])

In [20]:
topics_df_v2

Unnamed: 0,title,description
0,Откриването на резисторите,"Изследване на материали, които предизвикват на..."
2,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.
3,Transcripts,
4,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...
19136,Flow Charts: Logical Thinking?,This lesson is focused on flow charts. It supp...


In [21]:
for _, row in tqdm(topics_df_v2.iterrows(), total=len(topics_df_v2)):
# goes through each row in the df and sets the varible 'title' = the value for that row. If value is missing 
# it assigns the value to be = the description or text values.  
    title = row['title']
    if type(title) is float:
        title = row['description']
        row['title'] = title
    if type(title) is float:
        title = 'has no title or description'
        row['title'] = title

  0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
topics_df_v3 = topics_df_v2.drop(columns = ['description'])

In [23]:
topics_df_v3

Unnamed: 0,title
0,Откриването на резисторите
2,Entradas e saídas de uma função
3,Transcripts
4,Графики на експоненциални функции (Алгебра 2 н...
19136,Flow Charts: Logical Thinking?


In [24]:
dataset = Dataset.from_pandas(topics_df_v3)

In [25]:
dataset

Dataset({
    features: ['title', '__index_level_0__'],
    num_rows: 5
})

In [26]:
tokenized_topic_datasets = dataset.map(
    tokenize_function, batched=True, num_proc=4, remove_columns=["title"]
)

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]



#3:   0%|          | 0/1 [00:00<?, ?ba/s]



In [27]:
tokenized_topic_datasets

Dataset({
    features: ['__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5
})

In [28]:
vec2 = []
for k in tokenized_topic_datasets['input_ids']:
    art = k 
    vec2.append(art)

In [29]:
vec2 = tf.stack(vec2)

In [30]:
vec2

<tf.Tensor: shape=(5, 512), dtype=int32, numpy=
array([[  101,  1193, 22919, ...,     0,     0,     0],
       [  101,  4372,  6494, ...,     0,     0,     0],
       [  101, 24051,  2015, ...,     0,     0,     0],
       [  101,  1183, 16856, ...,     0,     0,     0],
       [  101,  4834,  6093, ...,     0,     0,     0]], dtype=int32)>

In [31]:
vec_float = tf.cast(vec, dtype=tf.float32)
vec2_float = tf.cast(vec2, dtype=tf.float32)

# you have to convert your tf int32 tensors to float32 because the cos_sim loss function won't accept int32.

In [32]:
neigh = NearestNeighbors(n_neighbors=5, metric = 'cosine')
neigh.fit(vec_float)
indx = neigh.kneighbors(vec2_float, return_distance = False)



In [33]:
indx

array([[ 69135, 147829, 134340, 125815,   9346],
       [ 29523, 138349, 122346, 139450,  28761],
       [ 37399,  13647,  65007,  71461, 128485],
       [ 26539,   5623, 130639,  98492,  91769],
       [127504,  98104, 118057,  80517,  85147]])

In [34]:
predictions = []
for k in range(len(indx)):
    pred = indx[k]
    p = ' '.join([content_df.loc[ind, 'id'] for ind in pred])
    predictions.append(p)
submission_df['content_ids'] = predictions
submission_df.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_73169f863b30 c_f58a3e74d892 c_df0c885a91a9 c...
1,t_00068291e9a4,c_310140c79d3f c_e5c91d496d7a c_cb2e0f08e31f c...
2,t_00069b63a70a,c_3e1010cf775b c_16b3f3cf5a90 c_6c04a87db420 c...
3,t_0006d41a73a8,c_2bf407c7cc0d c_0981cea17a12 c_d8fd3d49b187 c...
4,t_4054df11a74e,c_d3ce47192e30 c_a2e474bc7974 c_c3e8425d2455 c...


In [35]:
submission_df.to_csv('submission.csv', index=None)