In [6]:
%%capture
# install stl_text from source code
!git clone https://github.com/hudeven/text
!pip install --upgrade -e ./text;

# switch to this dir to import task
%cd text/examples/doc_classification


In [1]:
import os

import torch
from torch.optim import AdamW
from pytorch_lightning import Trainer
from stl_text.ops.utils.arrow import convert_csv_to_arrow
from stl_text.datamodule import DocClassificationDataModule
from stl_text.models import RobertaModel
from task import DocClassificationTask

## Convert training data to arrow format
Note: this step is only required the first time you run the notebook.

In [2]:
data_path = "./glue_sst2_tiny"

# Resolve the three split files under data_path, then convert each one to
# arrow format in train / valid / test order.
split_paths = [
    os.path.join(data_path, file_name)
    for file_name in ("train.tsv", "valid.tsv", "test.tsv")
]
for split_path in split_paths:
    convert_csv_to_arrow(split_path)

converted to arrow and saved to ./glue_sst2_tiny/train
converted to arrow and saved to ./glue_sst2_tiny/valid
converted to arrow and saved to ./glue_sst2_tiny/test


## Setup data module
The data module converts text to tensors.

In [8]:
# Build the data module over the converted dataset in data_path, then run its
# "fit" setup stage so the datasets are ready for the cells that follow.
datamodule = DocClassificationDataModule(
    data_path=data_path,
    batch_size=8,
    drop_last=True,
    load_from_cache_file=False,
)
datamodule.setup("fit")

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))












HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

## [Optional] Process data with DataFrame API
The API is similar to a pandas DataFrame, but it is faster and more scalable thanks to its Apache Arrow backend.

In [4]:
# Inspect the training split: the dataset summary, its first row, the whole
# "label_id" column, and the first entry of that column.
df = datamodule.datasets["train"]
first_row = df[0]
label_ids = df["label_id"]

print(df)
print(first_row)
print(label_ids)
print(label_ids[0])

Dataset(features: {'label': Value(dtype='string', id=None), 'label_id': Value(dtype='int64', id=None), 'seq_len': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), 'token_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}, num_rows: 20)
{'label_id': tensor(1), 'seq_len': tensor(9), 'token_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0])}
tensor([1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0])
tensor(1)


## Build an ML Task
Classify a document with a RoBERTa model.

In [5]:
# Seed torch's global RNG before the model is constructed so that any
# torch-based random weight initialisation is the same on every fresh
# "Restart & Run All" execution of the notebook.
torch.manual_seed(0)

# Small RoBERTa configuration: a single encoder layer with a single attention
# head. out_dim=2 presumably matches the two label_id values (0 / 1) in the
# dataset -- confirm if the data changes.
model = RobertaModel(
    vocab_size=1000,
    embedding_dim=1000,
    num_attention_heads=1,
    num_encoder_layers=1,
    output_dropout=0.4,
    out_dim=2,
)

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=0.0001)

# The task ties together the data module, the model and the optimizer; it is
# the object handed to the Trainer in the cells below.
task = DocClassificationTask(
    datamodule=datamodule,
    model=model,
    optimizer=optimizer,
)

## Train model

In [6]:
# fast_dev_run=True is a smoke-test mode: the log below reports a single batch
# for each of the train / val / test loops.
# NOTE(review): max_epochs=5 presumably only takes effect once fast_dev_run is
# switched off -- confirm against the pytorch_lightning version in use.
trainer = Trainer(
    max_epochs=5,
    fast_dev_run=True,
)
trainer.fit(task, datamodule=datamodule)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 1 batch(es)

  | Name            | Type                | Params
--------------------------------------------------------
0 | text_transform  | WhitespaceTokenizer | 0     
1 | label_transform | LabelTransform      | 0     
2 | model           | RobertaModel        | 13.5 M
3 | loss            | CrossEntropyLoss    | 0     
4 | valid_acc       | Accuracy            | 0     
5 | test_acc        | Accuracy            | 0     
--------------------------------------------------------
13.5 M    Trainable params
0         Non-trainable params
13.5 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1

## Evaluate the model on the test dataset

In [12]:
# Run the test loop; the list of metric dicts it returns is shown as the
# cell's result.
test_results = trainer.test(task, datamodule=datamodule)
test_results



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.5000), 'test_loss': tensor(2.0309)}
--------------------------------------------------------------------------------


[{'test_loss': 2.030856132507324, 'test_acc': 0.5}]

## Export model

In [13]:
# Destination file for the exported model; the inference cell below opens this
# same path and loads it with torch.jit.load.
export_path = "/tmp/doc_classification_task.pt1"
# NOTE(review): to_torchscript is handed the path and presumably writes the
# scripted module there as well as returning it -- confirm against
# DocClassificationTask.
jit_module = task.to_torchscript(export_path)

## Deploy the model and run inference

In [14]:
# Reload the exported TorchScript module from disk; this replaces the
# in-memory jit_module with the one read back from export_path.
with open(export_path, "rb") as model_file:
    jit_module = torch.jit.load(model_file)

# The loaded module is called directly on a batch of raw strings.
sample_texts = ["hello world", "hi", "attention is all your need!"]
predictions = jit_module(text_batch=sample_texts)
print(f"prediction result: {predictions}")

prediction result: ['1', '1', '1']
