In [2]:
import torch
import torch.nn as nn
import sentencepiece as spm
import pandas as pd

# https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ja

: 

In [26]:
splits = {'test': 'en-ja/test-00000-of-00001.parquet', 'train': 'en-ja/train-00000-of-00001.parquet', 'validation': 'en-ja/validation-00000-of-00001.parquet'}
test = pd.read_parquet("hf://datasets/Helsinki-NLP/opus-100/" + splits["test"])
train = pd.read_parquet("hf://datasets/Helsinki-NLP/opus-100/" + splits["train"])
val = pd.read_parquet("hf://datasets/Helsinki-NLP/opus-100/" + splits["validation"])

In [27]:
print(f'Train Size: {train.shape}')
print(f'Test Size: {test.shape}')
print(f'Validation Size: {val.shape}')


Train Size: (1000000, 1)
Test Size: (2000, 1)
Validation Size: (2000, 1)


In [28]:
train['english'] = train['translation'].apply(lambda x: x['en'])
train['japanese'] = train['translation'].apply(lambda x: x['ja'])

In [29]:
train.head()

Unnamed: 0,translation,english,japanese
0,"{'en': 'Yeah, Vincent Hanna.', 'ja': '- ラウール -...","Yeah, Vincent Hanna.",- ラウール - ラウールに ヴィンセント・ハンナだ
1,{'en': 'I'm being held in a basement. I've bee...,I'm being held in a basement. I've been abduct...,いま地下に居ます 他の2人と一緒に誘拐されたんです！
2,"{'en': 'It works!', 'ja': '動いたよ！'}",It works!,動いたよ！
3,{'en': 'I'm just trying to find out what happe...,I'm just trying to find out what happened here.,何があったか突き止めたい
4,"{'en': 'You okay?', 'ja': '無事か？'}",You okay?,無事か？


In [32]:
train.drop(columns=['translation'], axis=1)

Unnamed: 0,english,japanese
0,"Yeah, Vincent Hanna.",- ラウール - ラウールに ヴィンセント・ハンナだ
1,I'm being held in a basement. I've been abduct...,いま地下に居ます 他の2人と一緒に誘拐されたんです！
2,It works!,動いたよ！
3,I'm just trying to find out what happened here.,何があったか突き止めたい
4,You okay?,無事か？
...,...,...
999995,No?,そうか？
999996,Some stuff you leave there.,ある記憶は すぐ忘れられ、
999997,But as for those who believed and did righteou...,主は信仰して善行に動む者を（十分に）報奨される。だがアッラーは，不義を行う者を御好みにならない。
999998,And he has struck for Us a similitude and forg...,またかれは，われに準えるものを引合いに出して，自分の創造を忘れ，言う。「誰が，朽ち果てた骨を...


In [36]:
english_sentences = train['english'].tolist()

with open('english_text.txt', 'w', encoding='utf-8') as f:
    for sentence in english_sentences:
        f.write(sentence + '\n')


In [38]:
spm.SentencePieceTrainer.train(
    input='english_text.txt',
    model_prefix='en_tokenizer',
    vocab_size=32000,
    model_type='bpe',
    character_coverage=1.0

)

In [39]:
japanese_sentences = train['japanese'].tolist()

with open('japanese_text.txt', 'w', encoding='utf-8') as f:
    for sentence in japanese_sentences:
        f.write(sentence + '\n')

In [3]:
spm.SentencePieceTrainer.train(
    input='japanese_text.txt',
    model_prefix='ja_tokenizer',
    vocab_size=32000,
    model_type='bpe',
    character_coverage=1.0

)

In [None]:
sp_en = spm.SentencePieceProcessor()
sp_ja = spm.SentencePieceProcessor()

sp_en.load('en_tokenizer.model')
sp_ja.load('ja_tokenizer.model')

In [None]:

train['english_tokenized'] = train['english'].apply(lambda x: sp_en.encode(x, out_type=int))
train['japanese_tokenized'] = train['japanese'].apply(lambda x: sp_ja.encode(x, out_type=int))