In [1]:
# Mount Google Drive at /content/drive; the data and output directories used by
# this notebook live underneath that mount point (Colab only).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install sentencepiece
!pip install transformers==4.21.2
!pip install tokenizers==0.12.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.0-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0

In [3]:
# Show which GPU (if any) the runtime was assigned before starting training.
!nvidia-smi

Mon Feb 27 21:10:28 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    51W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os

# Root of the competition workspace on the mounted Google Drive.
DIR = "/content/drive/MyDrive/Competitions/Kaggle/LECR"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")
OUTPUT_FINETUNE = os.path.join(OUTPUT_DIR,"FINETUNE")
# Directory holding this experiment's artifacts (fold assignment CSV, fine-tuned model).
OUTPUT_MODELS = os.path.join(OUTPUT_FINETUNE,"003_paraphrase-multilingual-mpnet-base-v2")
# Misspelled name kept as an alias because later cells refer to it.
OUPTUT_MODELS = OUTPUT_MODELS

# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
# Creating the leaf directory also creates OUTPUT_FINETUNE (its parent).
os.makedirs(OUTPUT_MODELS, exist_ok=True)

In [5]:
!pip -qqq install sentence-transformers

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [6]:
# Read the three competition tables from INPUT_DIR as plain (default-indexed) frames.
contents, correlations, topics = (
    pd.read_csv(os.path.join(INPUT_DIR, file_name))
    for file_name in ('content.csv', 'correlations.csv', 'topics.csv')
)

In [7]:
# =========================================================================================
# CV split
# =========================================================================================
def cv_split(train, n_folds, seed):
  """Assign every row of ``train`` to one of ``n_folds`` shuffled K-fold groups.

  A ``fold`` column (int) is written onto ``train`` in place, the per-fold row
  counts are displayed, and the same frame is returned.

  Args:
    train: DataFrame to split; any index is accepted.
    n_folds: number of folds.
    seed: random state passed to ``KFold`` so the split is reproducible.

  Returns:
    ``train`` itself, with the added ``fold`` column.
  """
  kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
  # KFold.split yields *positional* indices, so the fold ids are collected in a
  # positional array instead of being written through the label-based .loc
  # (which is only correct when the index happens to be 0..n-1).
  fold_ids = np.full(len(train), -1, dtype=int)
  for fold_num, (_, val_positions) in enumerate(kfold.split(train)):
    fold_ids[val_positions] = fold_num
  train['fold'] = fold_ids
  display(train.groupby('fold').size())
  return train

In [8]:
# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    """Render a Markdown string as rich output in the notebook.

    Args:
        md: Markdown source text to render.
    """
    # Imported locally: the notebook never imports Markdown at the top level,
    # so calling this helper used to raise NameError.
    from IPython.display import Markdown, display

    display(Markdown(md))

class Topic:
    """Lightweight handle on one row of ``topics_df``, identified by its topic id.

    Only the id is stored on the instance. Every other attribute (``title``,
    ``has_content``, ...) is read from the module-level ``topics_df`` on access,
    and linked content is resolved through the module-level ``correlations_df``.
    Instances compare equal, and hash equal, when their ids are equal.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the ``parent`` column is missing (NaN)."""
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """List of ancestor topics, nearest parent first, ending at the parentless topic."""
        ancestors = []
        parent = self.parent
        while parent is not None:
            ancestors.append(parent)
            parent = parent.parent
        return ancestors

    @property
    def siblings(self):
        """Other children of this topic's parent; empty when there is no parent."""
        # Resolve the parent once: every access is a DataFrame lookup.
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    @property
    def content(self):
        """List of ``ContentItem`` objects correlated with this topic.

        Always a list (empty when the topic has no row in ``correlations_df``),
        so callers get one consistent return type.
        """
        if self.id not in correlations_df.index:
            return []
        return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join the titles from the top of the hierarchy down to this topic.

        Args:
            separator: string placed between consecutive titles.
            include_self: whether this topic's own title ends the chain.
            include_root: when False, the last element of the ancestor chain
                (the topic without a parent) is dropped.

        Returns:
            The titles joined by ``separator``, top-most first.
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Render this topic, its descendants and their content as a nested Markdown list.

        Args:
            depth: nesting level of this topic; each level indents by two spaces.
        """
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable; hash on the same key
        # that equality uses so topics can live in sets and dict keys.
        return hash(self.id)

    def __getattr__(self, name):
        """Fall back to the ``topics_df`` row for attributes not set on the instance.

        ``id`` and underscore-prefixed names are never looked up in the table:
        protocol probes such as ``__setstate__`` on a not-yet-initialised
        instance (copy/pickle) would otherwise read ``self.id``, re-enter this
        method and recurse without end.
        """
        if name == "id" or name.startswith("_"):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
        return topics_df.loc[self.id][name]

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Lightweight handle on one row of ``content_df``, identified by its content id.

    Only the id is stored on the instance. Every other attribute (``title``,
    ``kind``, ...) is read from the module-level ``content_df`` on access.
    Instances compare equal, and hash equal, when their ids are equal.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` entry in ``correlations_df`` lists this content id.

        The id must match a whole whitespace-separated token, so an id that is
        merely a prefix of another id does not match. Rows with a missing
        ``content_ids`` value are treated as not matching.
        """
        import re

        token_pattern = r"(?:^|\s)" + re.escape(self.id) + r"(?:\s|$)"
        is_linked = correlations_df.content_ids.str.contains(token_pattern, regex=True, na=False)
        return [Topic(topic_id) for topic_id in correlations_df.index[is_linked.to_numpy(dtype=bool)]]

    def __getattr__(self, name):
        """Fall back to the ``content_df`` row for attributes not set on the instance.

        ``id`` and underscore-prefixed names are never looked up in the table:
        protocol probes such as ``__setstate__`` on a not-yet-initialised
        instance (copy/pickle) would otherwise read ``self.id``, re-enter this
        method and recurse without end.
        """
        if name == "id" or name.startswith("_"):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
        return content_df.loc[self.id][name]

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone makes a class unhashable; hash on the same key
        # that equality uses so items can live in sets and dict keys.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        """Return one breadcrumb string per linked topic, each ending in this item's title.

        Args:
            separator: string placed between consecutive titles.
            include_root: forwarded to ``Topic.get_breadcrumbs``.

        Returns:
            A list of strings; when a topic's breadcrumb is empty the entry is
            just this item's title.
        """
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs

In [9]:
# Second, indexed view of the same CSV files (first column as index); these are
# the frames the Topic / ContentItem helpers read from.
topics_df = pd.read_csv(os.path.join(INPUT_DIR,"topics.csv"), index_col=0)
topics_df = topics_df.fillna({"title": "", "description": ""})

content_df = pd.read_csv(os.path.join(INPUT_DIR,'content.csv'), index_col=0)
content_df = content_df.fillna("")

correlations_df = pd.read_csv(os.path.join(INPUT_DIR,"correlations.csv"), index_col=0)

# an example topic that does not by itself provide much information about what content is relevant
#topic = Topic("t_fffe811a6da9")
#print("Content title:\t'" + topic.content[0].title + "' [kind: " + topic.content[0].kind + "]")
#print("Topic title:\t'" + topic.title + "'")
#print("Breadcrumbs:\t" + topic.get_breadcrumbs())

In [10]:
def get_content(id):
  """Return the breadcrumb string of the topic with the given id.

  Wraps ``id`` in a ``Topic`` and calls ``get_breadcrumbs()`` with its
  default arguments.
  """
  return Topic(id).get_breadcrumbs()

In [11]:
def get_parent_description(id):
  """Return the description text of the topic with the given id.

  Always returns a single string so the function can be used with
  ``Series.apply`` to build a text column (the previous version returned a
  whole Series for real ids and a string only for the sentinel).

  Args:
    id: topic id to look up in ``topics["id"]``, or the sentinel ``9999``
      that marks "no parent". Reads the un-prefixed ``id`` / ``description``
      columns, so it must run before the columns are renamed with the
      ``topic_`` prefix.

  Returns:
    The first matching description; ``"This is Top Topic"`` for the sentinel;
    ``"Description does not exist"`` when no topic has that id.
  """
  if id == 9999:
    return "This is Top Topic"
  matches = topics.loc[topics["id"] == id, "description"]
  if matches.empty:
    # Same placeholder the notebook uses for missing descriptions.
    return "Description does not exist"
  return matches.iloc[0]

In [12]:
# Replace missing text fields with explicit placeholder strings. The result is
# assigned back to the column: calling fillna(inplace=True) on a column selection
# mutates an intermediate object, which pandas deprecates and which no longer
# updates the frame under copy-on-write.
topics['title'] = topics['title'].fillna("Title does not exist")
contents['title'] = contents['title'].fillna("Title does not exist")

topics['description'] = topics['description'].fillna("Description does not exist")
contents['description'] = contents['description'].fillna("Description does not exist")

contents['text'] = contents['text'].fillna("Text does not exist")

# Breadcrumb path (root >> ... >> topic) for every topic.
topics["context"] = topics["id"].apply(get_content)
#topics["parent"].fillna(9999,inplace=True)
#topics["parent_description"] = topics["parent"].apply(get_parent_description)

# Overwrite `title` with the full model input text for each side of the pair.
# NOTE: this cell is not idempotent — re-running it appends the extra fields to
# the already-concatenated titles.
contents["title"] = contents["title"] + "<|=t_sep=|>" + contents["description"] + "<|=t_sep=|>" + contents["text"]
# NOTE(review): description and context are joined without a "<|=t_sep=|>" between
# them; left as is because inference must use the exact same text format — confirm intent.
topics["title"] = topics["title"] + "<|=t_sep=|>" + topics["description"] + topics["context"] #+ "<|=t_sep=|>" + topics["parent_description"]

In [13]:
# Assign every correlations row to one of 5 folds (seed 42) and persist the assignment.
kfolds = cv_split(correlations, 5, 42)
kfolds.to_csv(os.path.join(OUPTUT_MODELS,"Step1CorrelationsFold5.csv"),index=False)
# Keep every fold except fold 0 for training. The explicit copy makes
# `correlations` an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning (or silently write to a temporary).
correlations = kfolds[kfolds.fold != 0].copy()
correlations

fold
0    12304
1    12304
2    12303
3    12303
4    12303
dtype: int64

Unnamed: 0,topic_id,content_ids,fold
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,1
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,1
2,t_00069b63a70a,c_11a1dc0bfb99,4
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...,2
5,t_0008a1bd84ba,c_7ff92a954a3d c_8790b074383e,3
...,...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87,1
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...,4
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5,2
61515,t_fffe14f1be1e,c_cece166bad6a,2


In [14]:
# Prefix every column with its table name so the two frames can be merged
# without column-name clashes (e.g. title / description / language).
topics.columns = ["topic_" + column for column in topics.columns]
contents.columns = ["content_" + column for column in contents.columns]

In [15]:
# One row per (topic, content) pair: split the space-separated id list and explode it.
# Built with assign() so `correlations` itself is not modified — this avoids the
# SettingWithCopyWarning from writing into a filtered frame and keeps the cell
# safe to re-run.
corr = (
    correlations
    .assign(content_id=correlations["content_ids"].str.split(" "))
    .explode("content_id")
    .drop(columns=["content_ids"])
)
corr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  correlations["content_id"] = correlations["content_ids"].str.split(" ")


Unnamed: 0,topic_id,fold,content_id
0,t_00004da3a1b2,1,c_1108dd0c7a5d
0,t_00004da3a1b2,1,c_376c5a8eb028
0,t_00004da3a1b2,1,c_5bc0e1e2cba0
0,t_00004da3a1b2,1,c_76231f9d0b5e
1,t_00068291e9a4,1,c_639ea2ef9c95
...,...,...,...
61513,t_fff9e5407d13,4,c_d64037a72376
61514,t_fffbe1d5d43c,2,c_46f852a49c08
61514,t_fffbe1d5d43c,2,c_6659207b25d5
61515,t_fffe14f1be1e,2,c_cece166bad6a


In [16]:
# Attach the topic columns and then the content columns to every pair.
# Left joins keep all pairs even when one side has no matching row.
corr = (
    corr
    .merge(topics, how="left", on="topic_id")
    .merge(contents, how="left", on="content_id")
)
corr.head()

Unnamed: 0,topic_id,fold,content_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content,topic_context,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license
0,t_00004da3a1b2,1,c_1108dd0c7a5d,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...,Молив като резистор<|=t_sep=|>Моливът причиняв...,"Моливът причинява промяна в отклонението, подо...",video,Text does not exist,bg,,
1,t_00004da3a1b2,1,c_376c5a8eb028,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...,Да чуем променливото съпротивление<|=t_sep=|>Т...,Тук чертаем линия на лист хартия и я използвам...,video,Text does not exist,bg,,
2,t_00004da3a1b2,1,c_5bc0e1e2cba0,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...,Променлив резистор (реостат) с графит от молив...,Използваме сърцевината на молива (неговия граф...,video,Text does not exist,bg,,
3,t_00004da3a1b2,1,c_76231f9d0b5e,Откриването на резисторите<|=t_sep=|>Изследван...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Khan Academy (български език) >> Наука >> Физи...,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",video,Text does not exist,bg,,
4,t_00068291e9a4,1,c_639ea2ef9c95,Entradas e saídas de uma função<|=t_sep=|>Ente...,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Khan Academy (Português (Brasil)) >> Matemátic...,Dados e resultados de funções: gráficos<|=t_se...,Encontre todas as entradas que correspondem a ...,exercise,Text does not exist,pt,,


In [17]:
# Pack each row's two texts into one [topic_text, content_text] list and keep
# only that column for training.
corr["set"] = corr.loc[:, ["topic_title", "content_title"]].to_numpy().tolist()
train_df = corr["set"].to_frame()

In [18]:
# Wrap the pair column in a Hugging Face Dataset.
dataset = Dataset.from_pandas(train_df)

In [19]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['set', '__index_level_0__'],
    num_rows: 223829
})

In [20]:
# Turn every [topic_text, content_text] pair into one InputExample.
train_examples = []
train_data = dataset["set"]
n_examples = dataset.num_rows

for topic_text, content_text in train_data:
    # Skip (and report) incomplete pairs. Both sides are checked: a missing
    # content text would otherwise be trained on as the literal string "None".
    if topic_text is None or content_text is None:
        print([topic_text, content_text])
        continue
    train_examples.append(InputExample(texts=[str(topic_text), str(content_text)]))

In [21]:
# Load the pretrained checkpoint named below as the starting point for fine-tuning.
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_name)

Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [22]:
# Training hyper-parameters.
batch_size = 64
num_epochs = 20
#num_epochs = 1

# Shuffled batches of examples and the ranking loss bound to the model.
train_dataloader = DataLoader(train_examples, batch_size=batch_size, shuffle=True)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Warm up over the first 10% of all optimisation steps (batches per epoch x epochs).
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

In [None]:
# Fine-tune the model on the single (dataloader, loss) objective for `num_epochs`
# epochs and write the result under OUPTUT_MODELS.
# NOTE(review): no `evaluator` is passed, so `save_best_model` presumably has no
# score to compare checkpoints by — confirm against the sentence-transformers
# `fit` documentation whether only the final model is saved.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          save_best_model = True,
          output_path=os.path.join(OUPTUT_MODELS,'paraphrase-multilingual-mpnet-base-v2_fold0_epochs20'),
          warmup_steps=warmup_steps)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3498 [00:00<?, ?it/s]

In [None]:
#model.save(os.path.join(OUTPUT_FINETUNE,"all-MiniLM-L6-v2-exp001"))

In [None]:
# Release the Colab runtime once the cells above have finished
# (presumably so an idle GPU session stops consuming quota — confirm).
from google.colab import runtime
runtime.unassign()