In [17]:
import ktrain
import numpy as np
from ktrain import text as txt

## Reading and preprocessing input

In [18]:
# Collapse the dataset's tags into two string classes: SE and FT -> '1', PT -> '0'.
labels = {"SE":'1', "FT":'1', "PT":'0'}


def _load_pairs(path):
    """Read a tab-separated file into sentence pairs and their class labels.

    Every non-blank line is split on tabs: fields 1 and 2 (0-based) form the
    sentence pair and field 3 is the tag looked up in ``labels``.

    Args:
        path: location of the TSV file to read.

    Returns:
        A ``(docs, doc_labels)`` tuple: a list of ``(str, str)`` pairs and a
        parallel list of label strings taken from ``labels``.

    Raises:
        IndexError: a non-blank line has fewer than four tab-separated fields.
        KeyError: the tag in field 3 is not a key of ``labels``.
    """
    docs, doc_labels = [], []
    # The context manager guarantees the handle is closed even if a row is malformed.
    with open(path, 'r') as handle:
        for item in handle:
            stripped = item.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            item_fields = stripped.split('\t')
            docs.append((item_fields[1].strip(), item_fields[2].strip()))
            doc_labels.append(labels[item_fields[3].strip()])
    return docs, doc_labels


train_docs, train_labels = _load_pairs("../data/datasets_bert/secondstage_ctx/train.tsv")
test_docs, test_labels = _load_pairs("../data/datasets_bert/secondstage_ctx/test.tsv")

In [None]:
# Hold out the tail of the training file (the last couple of conversations)
# as the validation set; everything before the split index stays in training.
split_at = 1502

valid_docs, valid_labels = train_docs[split_at:], train_labels[split_at:]
train_docs, train_labels = train_docs[:split_at], train_labels[:split_at]

# Sanity check: sizes and first example of the training split, then the validation split.
for split_docs, split_labels in ((train_docs, train_labels), (valid_docs, valid_labels)):
    print(len(split_docs))
    print(len(split_labels))
    print(split_docs[0])
    print(split_labels[0])

## Create a new Transformer from a Hugging Face pretrained one!

In [None]:
# Sentence-pair classification on top of a pretrained Hugging Face checkpoint.
MODEL_NAME = 'nboost/pt-bert-base-uncased-msmarco'

# Class names are listed in label order: the data cell encodes PT as '0' and
# FT/SE as '1', so PT must come first. This is the same order that is passed
# to learner.validate() when the model is evaluated.
t = txt.Transformer(MODEL_NAME, maxlen=500, class_names=['PT', 'FT'])

# Preprocess each split with the transformer's own preprocessing.
trn = t.preprocess_train(train_docs, train_labels)
val = t.preprocess_test(valid_docs, valid_labels)
test = t.preprocess_test(test_docs, test_labels)

# Build the classifier and wrap it in a learner for training / evaluation.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)

## Find a good learning rate

In [None]:
# Learning-rate range test: run a short simulated training pass,
# then plot the result so a suitable rate can be picked by eye.
learner.lr_find()
learner.lr_plot()

## Train the classifier

In [None]:
# Fit with the 1-cycle learning-rate policy; weights are checkpointed
# into the given folder during training.
learner.fit_onecycle(
    lr=1e-4,
    epochs=100,
    checkpoint_folder="pt-bert-base-uncased-msmarco-weights_ctx",
)

## Reload saved weights and predict

In [None]:
# Trained weights can be downloaded from:
#   http://hpc.isti.cnr.it/~nardini/adaptive-utterance/second_ctx_ft-vs-pt.hdf5
# e.g.
#   $ wget http://hpc.isti.cnr.it/~nardini/adaptive-utterance/second_ctx_ft-vs-pt.hdf5

weights_file = 'second_ctx_ft-vs-pt.hdf5'
model.load_weights(weights_file)

# Wrap the reloaded model in a fresh learner and score it on the test split.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.validate(val_data=test, class_names=["PT", "FT"])