In [1]:
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split

model_name = "bert-base-uncased"
max_length = 512  # max sequence length for each document/sentence sample
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [2]:
import json
data_path = './ABSA/Clothing/data_with_clusters.json'
cloth_reviews = []
with open(data_path, "r") as f:
    for line in f:
        cloth_reviews.append(json.loads(line))

In [3]:
import os
def load_index(index_dir):
    assert os.path.exists(index_dir)
    with open(os.path.join(index_dir, 'train.index'), 'r') as f:
        train_index = [int(x) for x in f.readline().split(' ')]
    with open(os.path.join(index_dir, 'validation.index'), 'r') as f:
        valid_index = [int(x) for x in f.readline().split(' ')]
    with open(os.path.join(index_dir, 'test.index'), 'r') as f:
        test_index = [int(x) for x in f.readline().split(' ')]
    return train_index, valid_index, test_index

In [10]:
def load_sentence(sentence_dir):
    gen_train, gen_valid, gen_test = [], [], []
    with open(os.path.join(sentence_dir, 'generated-train.txt'), 'r') as f:
        lines = f.readlines()
        for i in range(len(lines)//4):
            gen_train.append(lines[i*4+2].strip())
    with open(os.path.join(sentence_dir, 'generated-validation.txt'), 'r') as f:
        lines = f.readlines()
        for i in range(len(lines)//4):
            gen_valid.append(lines[i*4+2].strip())
    with open(os.path.join(sentence_dir, 'generated-test.txt'), 'r') as f:
        lines = f.readlines()
        for i in range(len(lines)//4):
            gen_test.append(lines[i*4+2].strip())
    return gen_train, gen_valid, gen_test

In [13]:
train_index, valid_index, test_index = load_index("/remote-home/jianghaitian/stage1/Clothing/1/")
gen_train, gen_valid, gen_test = load_sentence("save/Clothing/")
gen_train_lab = [cloth_reviews[i]['rating'] for i in train_index]
gen_valid_lab = [cloth_reviews[i]['rating'] for i in valid_index]
gen_test_lab = [cloth_reviews[i]['rating'] for i in test_index]

gen_train_enc = tokenizer(gen_train, truncation=True, padding=True, max_length=max_length)
gen_valid_enc = tokenizer(gen_valid, truncation=True, padding=True, max_length=max_length)
gen_test_enc = tokenizer(gen_test, truncation=True, padding=True, max_length=max_length)

In [3]:
# gest_path = './ABSA/GEST/data_with_clusters.json'
# gest_reviews = []
# with open(data_path, "r") as f:
#     for line in f:
#         gest_reviews.append(json.loads(line))

In [4]:
# prepare dataset
random.shuffle(cloth_reviews)

# len(cloth_reviews)=347959, 6:2:2
train_texts = [' '.join(d['all_tokens']) for d in cloth_reviews[:210000]]
train_labels = [d['rating'] for d in cloth_reviews[:210000]]

valid_texts = [' '.join(d['all_tokens']) for d in cloth_reviews[210000:280000]]
valid_labels = [d['rating'] for d in cloth_reviews[210000:280000]]

test_texts = [' '.join(d['all_tokens']) for d in cloth_reviews[280000:]]
test_labels = [d['rating'] for d in cloth_reviews[280000:]]

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

In [5]:
# gest_texts = [' '.join(d['all_tokens']) for d in gest_reviews]
# gest_labels = [d['rating'] for d in gest_reviews]
# gest_encodings = tokenizer(gest_texts, truncation=True, padding=True, max_length=max_length)

In [5]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)
test_dataset = NewsGroupsDataset(test_encodings, test_labels)

In [14]:
gen_train_dtst = NewsGroupsDataset(gen_train_enc, gen_train_lab)
gen_valid_dtst = NewsGroupsDataset(gen_valid_enc, gen_valid_lab)
gen_test_dtst = NewsGroupsDataset(gen_test_enc, gen_test_lab)

In [6]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1).to("cuda")
# transformers.models.bert.modeling_bert.BertForSequenceClassification

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error  # accuracy is just micro F-1
def compute_metrics(pred):
    labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    # acc = accuracy_score(labels, preds)
    mse = mean_squared_error(labels, pred.predictions)
    mae = mean_absolute_error(labels, pred.predictions)
    return {'mse': mse, 'mae': mae}

In [8]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    warmup_steps=10,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training
    logging_steps=128,                # log weights each `logging_steps`
    eval_steps=128,                   # evaluate each `eval_steps`
    save_steps=128,                   # save weights each `save_steps`
    evaluation_strategy="steps",
)

In [9]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)
# transformers.trainer.Trainer

In [11]:
trainer.train()

***** Running training *****
  Num examples = 210000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 411


Step,Training Loss,Validation Loss,Mse,Mae
128,2.1131,0.452394,0.452463,0.412987
256,0.4126,0.341692,0.34179,0.36253
384,0.3524,0.329813,0.329936,0.354856


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-128
Configuration saved in ./results/checkpoint-128/config.json
Model weights saved in ./results/checkpoint-128/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 70000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-256
Configuration saved in ./results/checkpoint-256/config.json
Model weights saved in ./results/checkpoint-256/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 70000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-384
Configuration saved in ./results/checkpoint-384/config.json
Model weights saved in ./results/checkpoint-384/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-384 (score: 0.32981258630752563).


TrainOutput(global_step=411, training_loss=0.9186097258778965, metrics={'train_runtime': 491.263, 'train_samples_per_second': 427.47, 'train_steps_per_second': 0.837, 'total_flos': 8201591289360000.0, 'train_loss': 0.9186097258778965, 'epoch': 1.0})

In [16]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 67959
  Batch size = 128


{'eval_loss': 0.33447501063346863,
 'eval_mse': 0.3345468044281006,
 'eval_mae': 0.35623660683631897,
 'eval_runtime': 47.1609,
 'eval_samples_per_second': 1441.003,
 'eval_steps_per_second': 2.82,
 'epoch': 1.0}

In [19]:
trainer.evaluate(train_dataset)

***** Running Evaluation *****
  Num examples = 210000
  Batch size = 128


{'eval_loss': 0.29461365938186646,
 'eval_mse': 0.29458436369895935,
 'eval_mae': 0.3378411531448364,
 'eval_runtime': 149.7391,
 'eval_samples_per_second': 1402.439,
 'eval_steps_per_second': 2.745,
 'epoch': 1.0}

In [15]:
trainer.evaluate(gen_train_dtst)

***** Running Evaluation *****
  Num examples = 231568
  Batch size = 128


{'eval_loss': 1.886029601097107,
 'eval_mse': 1.8863235712051392,
 'eval_mae': 0.928865909576416,
 'eval_runtime': 143.0383,
 'eval_samples_per_second': 1618.923,
 'eval_steps_per_second': 3.167,
 'epoch': 1.0}

In [17]:
trainer.evaluate(gen_valid_dtst)

***** Running Evaluation *****
  Num examples = 28946
  Batch size = 128


{'eval_loss': 1.889078140258789,
 'eval_mse': 1.8897786140441895,
 'eval_mae': 0.9307969212532043,
 'eval_runtime': 17.0262,
 'eval_samples_per_second': 1700.084,
 'eval_steps_per_second': 3.348,
 'epoch': 1.0}

In [18]:
trainer.evaluate(gen_test_dtst)

***** Running Evaluation *****
  Num examples = 28949
  Batch size = 128


{'eval_loss': 1.8891222476959229,
 'eval_mse': 1.8881690502166748,
 'eval_mae': 0.9272676706314087,
 'eval_runtime': 16.6437,
 'eval_samples_per_second': 1739.338,
 'eval_steps_per_second': 3.425,
 'epoch': 1.0}

{'eval_loss': 0.6918574571609497,
 'eval_accuracy': 0.7249571428571429,
 'eval_mse': 0.5315071428571428,
 'eval_mae': 0.34575,
 'eval_runtime': 163.0885,
 'eval_samples_per_second': 858.43,
 'eval_steps_per_second': 3.354,
 'epoch': 1.0}

In [11]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to the model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return probs.argmax()

In [12]:
# accuracy on test set.
correct = 0
se = 0
ae = 0
for i in range(len(test_texts)):
    l_pred = get_prediction(test_texts[i])
    correct += int(l_pred == test_labels[i])
    se += (l_pred - test_labels[i]) ** 2
    ae += abs(l_pred - test_labels[i])
accuracy = correct / len(test_labels)
mse = se / len(test_labels)
mae = ae / len(test_labels)
print(f"acc on test set is {accuracy*100:.2f}%.")
print(f"mse on test set is {mse}.")
print(f"mae on test set is {mae}.")

Here we can see the accuracy on test set is nearly the same as that on the validation test. So we can conclude that using the transformer encoder model for encoding the document and employing supervised learning classification metrics on this dataset can achieve a way better result.