## Prepare data 

### Load dataset

In [None]:
from datasets import load_dataset, load_from_disk

dataset_carolina = load_dataset("carolina-c4ai/corpus-carolina")
dataset_brwac = load_from_disk("../data/brwac_dataset")

# Generate Dataset

In [None]:
# Create small sample
# sample = dataset['corpus'] #.train_test_split(test_size=100)['test']
# sample.save_to_disk('../data/carolina_sample_100')

In [None]:
from tqdm.auto import tqdm

tqdm.pandas()

In [None]:
for text in tqdm(dataset_carolina['corpus']['text']):
    with open('../data/corpus-carolina/carolina.txt', 'a', encoding="utf-8") as f:
        f.write(text + '\n')

In [None]:
for example in tqdm(dataset_brwac['train']):
    with open('../data/brwac_dataset/brwac.txt', 'a', encoding="utf-8") as f:
        f.write("\n".join(sum(example['text']['paragraphs'], [])))

In [None]:
for text in tqdm(dataset_carolina['corpus']['text']):
    with open('../data/carolina_brwac/carolina_brwac.txt', 'a', encoding="utf-8") as f:
        f.write(text + '\n')

for example in tqdm(dataset_brwac['train']):
    with open('../data/carolina_brwac/carolina_brwac.txt', 'a', encoding="utf-8") as f:
        f.write("\n".join(sum(example['text']['paragraphs'], [])))

### Training Tokenizer

In [None]:
import sentencepiece as spm
import os

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
#os.makedirs(tokenizer_path, exist_ok=True)
tokenizer_name = 'm'

#os.makedirs(tokenizer_path, exist_ok=True)

#spm.SentencePieceTrainer.train(input='../data/corpus-carolina/carolina.txt', model_prefix=f'{tokenizer_path}/{tokenizer_name}', vocab_size=50265)

In [4]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa import deberta
from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

In [5]:
tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
# p,t=deberta.load_vocab(vocab_path=tokenizer_path, vocab_type='spm', pretrained_id='deberta-v3-base')
# tokenizer=deberta.tokenizers[t](p)
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')

### Generate training data

### STREAM OPTION

In [6]:
# Caminhos dos arquivos de entrada e saída
_input = '../data/carolina_brwac/carolina_brwac.txt'
_output = '../data/carolina_brwac/carolina_brwac_tokenized_STREAM.txt'


In [2]:
%%time
_input_file = '../data/carolina_brwac/carolina_brwac.txt'
_output_path = '../data/carolina_brwac/full_data_v2/lines_tokenized/'
_train_test_path = '../data/carolina_brwac/full_data_v2/train_test/'

def get_total_lines(input_file):
    with open(input_file, 'r', encoding='utf-8') as rfs:
        total_lines = sum(1 for _ in rfs)
    return total_lines

total_lines = get_total_lines(_input_file)
print(total_lines)

143946923
CPU times: total: 1min 34s
Wall time: 1min 34s


In [20]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa import deberta
from DeBERTa.deberta.spm_tokenizer import SPMTokenizer

import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os

tqdm.pandas()
# Tokenize input path in stream and save filename as line
# _input_file = '../data/carolina_brwac/stream_dev/test_file.txt'
# _output_path = '../data/carolina_brwac/stream_dev/lines_tokenized/'
# _train_test_path = '../data/carolina_brwac/stream_dev/train_test/'

_input_file = '../data/carolina_brwac/carolina_brwac.txt'
_output_path = '../data/carolina_brwac/full_data_v2/lines_tokenized/'
_train_test_path = '../data/carolina_brwac/full_data_v2/train_test/'

tokenizer_path = '../models/tokenizer/deberta-pt-carolina/'
tokenizer = SPMTokenizer(f'{tokenizer_path}m.model')

max_seq_length = 512

os.makedirs(_output_path, exist_ok=True)
os.makedirs(_train_test_path, exist_ok=True)

def get_total_lines(input_file):
    with open(input_file, 'r', encoding='utf-8') as rfs:
        total_lines = sum(1 for _ in rfs)
    return total_lines

def tokenize_and_write_data_stream(input_file, _output_path, checkpoint=0, in_memory = False, chunks = 10000, max_seq_length=512, total_lines = None):
    print('------\nRun Config:\n')
    print('Input File: ', input_file)
    print('Output Path: ', _output_path)
    print('Checkpoint: ', checkpoint)
    print('In Memory: ', in_memory)
    print('Chunks: ', chunks)
    print('Max Seq Length: ', max_seq_length)
    print('------')

    print('Counting Lines...')
    if total_lines == None:
        total_lines = get_total_lines(input_file)

    with open(input_file, 'r', encoding='utf-8') as rfs:

        print('Moving to Checkpoint...')
        line_count = 0
        if checkpoint > 0:
            for _ in range(checkpoint):
                rfs.readline()
                line_count += 1

        num_lines = total_lines - line_count
        print('Number of Lines to Process (total_lines - checkpoint): ', num_lines)
        
        print('------')

        if in_memory:
            _range = range(chunks)
            progress_bar_general = tqdm(range(num_lines//chunks), desc='Processing Chunks', position=0, leave=True)

            # Process chunks of lines
            for _ in progress_bar_general:
                #print('Loading Lines chunk...')
                progress_bar = tqdm(_range, desc='Loading Lines', position=1, leave=False, mininterval=100)

                lines = []
                for _ in progress_bar:
                    lines.append(rfs.readline().strip())
                progress_bar.close()
                
                #print('Tokenizing Lines chunk ...')
                progress_bar = tqdm(lines, total=chunks, desc='Tokenizing Lines', position=1, leave=False)
                lines_tkn = []
                for line in progress_bar:
                    lines_tkn.append(' '.join(tokenizer.tokenize(line)) + '\n')
                progress_bar.close()

                #print('Writing Lines chunk...')
                progress_bar = tqdm(lines_tkn, total=chunks, desc='Writing Lines', position=1, leave=False)
                for line in progress_bar:
                    _output_file = _output_path + f'{line_count}.txt'
                    with open(_output_file, 'w+', encoding='utf-8') as wfs:
                        wfs.write(line)
                    line_count += 1
                progress_bar.close()

            return lines_tkn, line_count

        else:  
            print('Streaming Lines...')

            tkn = []
            progress_bar = tqdm(rfs, total=num_lines, desc='Processing Data')
            for line in progress_bar:
                _output_file = _output_path + f'{line_count}.txt'
                with open(_output_file, 'w+', encoding='utf-8') as wfs:
                    tokens = tokenizer.tokenize(line.strip())
                    wfs.write(' '.join(tokens) + '\n')
                line_count += 1
                progress_bar.update(1)
    
    return line_count

def generate_train_test_files(lines_tokenized_path, _train_test_path, line_count, max_seq_length=512):
    _train_idx, _test_idx = train_test_split(np.arange(0, line_count), test_size=0.05, random_state=42)
    train_idx, test_idx = {i: True for i in _train_idx}, {i: True for i in _test_idx}
    
    train_tokens, test_tokens = [], []
    train_counter, test_counter = 0, 0

    for i in tqdm(range(line_count)):
        _input_file = lines_tokenized_path + f'{i}.txt'
        # if index is a train index, add to the train_tokens list. else, add to the test_tokens list
        with open(_input_file, 'r', encoding='utf-8') as rfs:
            # remove \n from the end of the line and split tokens
            tokens = rfs.read()[:-1].split(' ')
            if i in train_idx:
                train_tokens.extend(tokens)
                train_counter += len(tokens)

                # if train_counter is greater than max_seq_length_2, write to file train_tokens[:510] and train_counter-510
                if train_counter > max_seq_length-2:
                    with open(_train_test_path + 'train.txt', 'a', encoding='utf-8') as wfs:
                        wfs.write(' '.join(train_tokens[:max_seq_length-2]) + '\n')
                    train_tokens = train_tokens[max_seq_length-2:]
                    train_counter -= (max_seq_length-2)

            elif i in test_idx:
                test_tokens.extend(tokens)
                test_counter += len(tokens)

                # if test_counter is greater than max_seq_length_2, write to file test_tokens[:510] and test_counter-51
                if test_counter > max_seq_length-2:
                    with open(_train_test_path + 'test.txt', 'a', encoding='utf-8') as wfs:
                        wfs.write(' '.join(test_tokens[:max_seq_length-2]) + '\n')
                    test_tokens = test_tokens[max_seq_length-2:]
                    test_counter -= (max_seq_length-2)

    # write the remaining tokens to the train and test files (because the loop validation, they cant  > max_seq_length-2)
    with open(_train_test_path + 'train.txt', 'a', encoding='utf-8') as wfs:
        wfs.write(' '.join(train_tokens))
    
    with open(_train_test_path + 'test.txt', 'a', encoding='utf-8') as wfs:
        wfs.write(' '.join(test_tokens))
        


line_count = tokenize_and_write_data_stream(_input_file, _output_path, checkpoint=0, in_memory = True, max_seq_length=max_seq_length, total_lines = 143946923, chunks=10000)
#generate_train_test_files(_output_path, _train_test_path, line_count, max_seq_length)

------
Run Config:

Input File:  ../data/carolina_brwac/carolina_brwac.txt
Output Path:  ../data/carolina_brwac/full_data_v2/lines_tokenized/
Checkpoint:  0
In Memory:  True
Chunks:  10000
Max Seq Length:  512
------
Counting Lines...
Moving to Checkpoint...
Number of Lines to Process (total_lines - checkpoint):  143946923
------


Processing Chunks:   0%|          | 5/14394 [01:03<50:57:44, 12.75s/it]


KeyboardInterrupt: 

In [38]:
# # count tokens by each line from _train_test_path
# with open(_train_test_path + 'train.txt', 'r', encoding='utf-8') as rfs:
#     for line in rfs:
#         print(f'Train tokens: {len(line.strip().split(" "))}')

Train tokens: 510
Train tokens: 510
Train tokens: 510
Train tokens: 510
Train tokens: 344


### OLD OPTION

In [None]:
from tqdm import tqdm

_input = '../data/carolina_brwac/carolina_brwac.txt'
_output_train = '../data/carolina_brwac/carolina_brwac_TRAIN.txt'
_output_test = '../data/carolina_brwac/carolina_brwac_TEST.txt'
max_seq_length=512

from sklearn.model_selection import train_test_split

with open(_input, 'r', encoding='utf-8') as rfs:
    train, test = train_test_split(rfs.readlines(), test_size=0.05)

In [20]:
from sklearn.model_selection import train_test_split

with open(_input, 'r', encoding='utf-8') as rfs:
    train, test = train_test_split(rfs.readlines(), test_size=0.05)

In [None]:
# write train data
with open(_output_train, 'w', encoding='utf-8') as wfs:
    for line in tqdm(train, desc='Processing'):
        wfs.write(f"{line}\n")


In [None]:
# write train data
with open(_output_test, 'w', encoding='utf-8') as wfs:
    for line in tqdm(test, desc='Processing'):
        wfs.write(f"{line}\n")

In [None]:
# write train data
with open(_output_train, 'w', encoding='utf-8') as wfs:
    for line in tqdm(train, desc='Processing'):
        wfs.write(f"{line}\n")

In [None]:
def tokenize_data(data, tokenizer):
    tokenized_data = []
    for text in tqdm(data['text'], desc='Tokenizing'):
        tokenized_data.extend(tokenizer.tokenize(text))    
    return tokenized_data


def write_tokenized_data(tokenized_data, output_file, max_seq_length=512):
    lines = 0
    with open(output_file, 'w', encoding = 'utf-8') as wfs:
        idx = 0
        while idx < len(tokenized_data):
            wfs.write(' '.join(tokenized_data[idx:idx+max_seq_length-2]) + '\n')
            idx += (max_seq_length - 2)
            lines += 1
    print(f'Wrote {lines} lines to {output_file}')

In [None]:
%%time

train_tokenized = tokenize_data(train, tokenizer)
test_tokenized = tokenize_data(test, tokenizer)
# valid_tokenized = tokenize_data(valid, tokenizer)

In [None]:
%%time

write_tokenized_data(train_tokenized, '../data/carolina_brwac/full_data/train.txt')
write_tokenized_data(test_tokenized, '../data/carolina_brwac/full_data/test.txt')
# write_tokenized_data(valid_tokenized, '../data/carolina_brwac/full_data/valid.txt')
# Write test as valid
write_tokenized_data(test_tokenized, '../data/carolina_brwac/full_data/valid.txt')

### Test Tokenizer

In [None]:
import sys
sys.path.append('../DeBERTa')

from DeBERTa import deberta

In [None]:
p,t=deberta.load_vocab(vocab_path=tokenizer_path, vocab_type='spm', pretrained_id='deberta-v3-base')
tokenizer=deberta.tokenizers[t](p)