## BERT

In [1]:
# Install the third-party packages for this notebook; `transformers` and
# `tokenizers` are imported in the next cell.
!pip install transformers datasets tokenizers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [30]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import math
import os
import tqdm
from pathlib import Path
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer

In [3]:
!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip -qq /content/cornell_movie_dialogs_corpus.zip
!rm cornell_movie_dialogs_corpus.zip

--2024-07-22 15:02:55--  http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.53
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.53|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9916637 (9.5M) [application/zip]
Saving to: ‘cornell_movie_dialogs_corpus.zip’


2024-07-22 15:02:55 (25.3 MB/s) - ‘cornell_movie_dialogs_corpus.zip’ saved [9916637/9916637]



In [4]:
!mkdir datasets
!mv cornell\ movie-dialogs\ corpus/movie_conversations.txt ./datasets
!mv cornell\ movie-dialogs\ corpus/movie_lines.txt ./datasets

# Preprocess

In [17]:
# Read both corpus files, decoded as ISO-8859-1, into lists of raw lines
# (each entry keeps its trailing newline).
with open("/content/datasets/movie_conversations.txt", mode = "r", encoding = "iso-8859-1") as conv_file:
  conversations = list(conv_file)

with open("/content/datasets/movie_lines.txt", mode = "r", encoding = "iso-8859-1") as lines_file:
  lines = list(lines_file)

In [18]:
"""
    Description:
    - Each entry in `lines` has the format: "L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n"
        -> The first field is the line tag and the last field is the text of that line
    - Each entry in `conversations` has the format: "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n"
        -> The last field is a list of the line tags that make up the conversation
"""
# Fields in both corpus files are delimited by this marker.
FIELD_SEPARATOR = "+++$+++"

tag_to_text = {}          # line tag (first field) -> whitespace-normalized text (last field)
conversations_split = []  # one list of line texts per conversation
qa_pairs = []             # [text, next_text] for every two consecutive lines of a conversation

for line in lines:
  fields = line.split(FIELD_SEPARATOR)
  # split() + join drops the trailing newline and collapses whitespace runs
  # into single spaces.
  tag_to_text[fields[0].strip()] = ' '.join(fields[-1].split())

for conversation in conversations:
  # The last field looks like "['L194', 'L195', ...]"; peel off the brackets,
  # split on commas and strip the quotes around each tag.
  tag_field = conversation.split(FIELD_SEPARATOR)[-1].strip()
  line_tags = [tag.strip().strip("'") for tag in tag_field.strip("[]").split(",")]

  # Raises KeyError if a conversation references a tag missing from `lines`.
  conv_split = [tag_to_text[tag] for tag in line_tags]

  # Texts are already normalized above, so they are paired as-is.
  qa_pairs.extend([text, next_text] for text, next_text in zip(conv_split, conv_split[1:]))
  conversations_split.append(conv_split)

# NOTE: this rebinds `lines` from the raw corpus lines to the first element of
# every pair. Re-running this cell therefore requires re-running the cell that
# loads `lines` from disk first.
lines = [pair[0] for pair in qa_pairs]

## WordPiece Tokenization

In [20]:
# Remove batch files from any previous run so that stale files in this
# directory are not picked up when the tokenizer is trained on its contents.
!rm -rf /content/datasets/data

In [21]:
%cd /content/datasets
!mkdir data
%cd data

text_file = 0

for id, line in tqdm.tqdm(enumerate(lines), total = len(lines)):
  if id % 10000 == 0:
    batch_lines = [line]
    text_file += 1
  else:
    batch_lines.append(line)
    if (id + 1) % 10000 == 0:
      with open(f"text_{text_file}.txt", "w", encoding = "utf-8") as file:
        file.write("\n".join(batch_lines))

/content/datasets
/content/datasets/data


100%|██████████| 221616/221616 [00:00<00:00, 1114070.68it/s]


# Training Tokenizer

In [31]:
data_path = "/content/datasets/data"
paths = []
for file in os.listdir(data_path):
  paths.append(os.path.join(data_path, file))

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

%cd /content/datasets
os.mkdir('./tokenizer')
tokenizer.save_model('./tokenizer', 'bert')

/content/datasets


['./tokenizer/bert-vocab.txt']