In [None]:
from google.colab import drive 
# Mount Google Drive at /mntDrive; the dataset and model paths used further down live under it.
drive.mount('/mntDrive')

In [None]:
!pip install transformers datasets

In [None]:
# Sanity check: list the contents of the mounted Drive folder.
!ls /mntDrive/MyDrive/

In [None]:
from pathlib import Path

# Directory holding the task 2 CSV files; the JSONL files and the exported
# model outputs are written below it as well.
out_dir = Path('/mntDrive/MyDrive/icdar-dataset-20220207')

# Alternative using a path relative to the working directory (no Drive mount):
#out_dir = Path('icdar-dataset-20220207')

In [None]:
import pandas as pd

In [None]:
# Read each split (first CSV column is the index) and replace missing values
# with empty strings in a single chained step.
train = pd.read_csv(out_dir / 'task2_train.csv', index_col=0).fillna('')
val = pd.read_csv(out_dir / 'task2_val.csv', index_col=0).fillna('')
test = pd.read_csv(out_dir / 'task2_test.csv', index_col=0).fillna('')

In [None]:
# Write every split as JSON lines (one record per line) next to its CSV file.
for frame, jsonl_name in (
    (train, 'task2_train.jsonl'),
    (val, 'task2_val.jsonl'),
    (test, 'task2_test.jsonl'),
):
    frame.to_json(out_dir / jsonl_name, orient='records', lines=True)

In [None]:
from datasets import load_dataset

# Split name -> JSON-lines file name, in the order the splits are registered.
split_files = (
    ('train', 'task2_train.jsonl'),
    ('val', 'task2_val.jsonl'),
    ('test', 'task2_test.jsonl'),
)
data_files = {split: str(out_dir / file_name) for split, file_name in split_files}

# Load all splits in one call using the JSON loader.
icdar_dataset = load_dataset("json", data_files=data_files)

In [None]:
# Inspect a single training record; the same record is tokenized further down.
icdar_dataset['train'][6352]

In [None]:
# Directory from which the model weights are loaded.
model_dir = '/mntDrive/MyDrive/results-0.3-20220207-no-checkpoints'
# Local alternative to the Drive path above:
#model_dir = '/Users/janneke/models/results-0.3-20220207'
# Pretrained model name used to load the tokenizer.
model_name = 'bert-base-multilingual-cased'

In [None]:
from transformers import AutoTokenizer

# The tokenizer is loaded by model name, not from model_dir.
# NOTE(review): assumes the checkpoint in model_dir was trained with this same tokenizer; confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize the 'ocr' text of one training record to inspect the tokenizer output.
tokenizer(icdar_dataset['train'][6352]['ocr'])

In [None]:
def _tokenize_ocr(examples):
    """Tokenize the 'ocr' field of a batch of examples with truncation enabled."""
    return tokenizer(examples['ocr'], truncation=True)


tokenized_icdar = icdar_dataset.map(_tokenize_ocr, batched=True)

In [None]:
# Drop the raw text columns now that the 'ocr' text has been tokenized.
# NOTE: this cell is not idempotent; running it a second time is expected to
# fail because the columns are already gone. Re-run the tokenization cell first.
tokenized_icdar = tokenized_icdar.remove_columns(['gs', 'ocr'])

In [None]:
# Check that the model directory exists (displays True or False).
Path(model_dir).is_dir()

In [None]:
import torch
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Show which device was selected.
device

In [None]:
from transformers import BertModel, AutoModel, AutoModelForTokenClassification, AutoModelWithLMHead, AutoModelForPreTraining, Trainer, TrainingArguments

# Load the bare BERT encoder from the model directory.
# NOTE(review): if the checkpoint in model_dir was saved from a model built
# without a pooling layer (e.g. a token-classification head), the pooler weights
# would be newly initialised here. Confirm, because `pooler_output` is what this
# notebook exports.
model = BertModel.from_pretrained(model_dir)

In [None]:
# Switch the model to evaluation mode; the trailing semicolon suppresses the cell output.
model.eval();

In [None]:
# Move the model to the selected device.
model = model.to(device=device)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the samples of a batch using the tokenizer; it is passed
# as `collate_fn` to the DataLoader in create_intermediary_data.
collator = DataCollatorWithPadding(tokenizer)

In [None]:
import json
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader


def create_intermediary_data(dataset, batch_size=8, split_name=None):
    """Run the model over ``dataset`` and save each batch's pooler output to disk.

    One file ``task2_task1_output_<i>.pt`` is written per batch into
    ``out_dir/'task1_output'/<split>``. Each file holds the ``pooler_output``
    tensor of that batch, moved to the CPU. Files in that directory whose
    index is not produced by this run (left over from an earlier run with a
    smaller batch size) are removed afterwards, so the directory only ever
    contains the output of a single run.

    Relies on the notebook globals ``model``, ``device``, ``collator`` and
    ``out_dir``. The model is put into evaluation mode.

    Args:
        dataset: Tokenized dataset to iterate over; batches are built by ``collator``.
        batch_size: Number of samples per batch, and therefore per output file.
        split_name: Name of the output sub-directory. Defaults to ``dataset.split``.

    Raises:
        ValueError: If ``split_name`` is not given and the dataset has no split.
    """
    if split_name is None:
        split_name = getattr(dataset, 'split', None)
    if split_name is None:
        # Otherwise the output would silently land in a directory named "None".
        raise ValueError('dataset has no split name; pass split_name explicitly')

    dataloader = DataLoader(
        dataset, batch_size=batch_size, collate_fn=collator
    )

    out_path = out_dir/'task1_output'/str(split_name)
    out_path.mkdir(exist_ok=True, parents=True)

    # Do not depend on another cell having disabled dropout beforehand.
    model.eval()

    n_written = 0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            batch = batch.to(device=device)

            output = model(**batch)

            samples = output['pooler_output'].detach().cpu()
            out_file = out_path/f'task2_task1_output_{i}.pt'
            torch.save(samples, out_file)
            n_written = i + 1

            # Drop references and release cached GPU memory before the next batch.
            del samples
            del output
            del batch
            torch.cuda.empty_cache()

    # Remove chunk files this run did not write, so stale output from an
    # earlier run is never mixed with the fresh files.
    for stale_file in out_path.glob('task2_task1_output_*.pt'):
        index = stale_file.stem.rsplit('_', 1)[-1]
        if index.isdecimal() and int(index) >= n_written:
            stale_file.unlink()

# Export the model outputs for the listed splits; only 'test' is processed here.
for split_name in ('test',):
    create_intermediary_data(tokenized_icdar[split_name], batch_size=128)