In [1]:
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "bert-base-uncased"
max_length = 128  # max sequence length for each document/sentence sample
# Fast tokenizer for the checkpoint above; do_lower_case=True matches the uncased model.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [19]:
import json
data_path = './ABSA/Steam/data_with_clusters.json'
# The file is read line by line; every line is parsed as one JSON document
# and collected, in file order, into `reviews`.
reviews = []
with open(data_path, "r") as reviews_file:
    reviews.extend(json.loads(raw_line) for raw_line in reviews_file)

In [3]:
import os
def load_index(index_dir):
    """Read the train/validation/test index files stored in ``index_dir``.

    The first line of ``train.index``, ``validation.index`` and ``test.index``
    is parsed as whitespace-separated integers.

    Args:
        index_dir: Directory that contains the three ``*.index`` files.

    Returns:
        A tuple ``(train_index, valid_index, test_index)`` of lists of ints.

    Raises:
        AssertionError: If ``index_dir`` does not exist.
        ValueError: If a token on the first line is not an integer.
    """
    assert os.path.exists(index_dir), f"index directory not found: {index_dir}"

    def _read_index(filename):
        # Only the first line of the file is used. split() without an argument
        # tolerates a trailing space/newline, repeated spaces and an empty line;
        # split(' ') would hand '' or '\n' to int() and raise ValueError.
        with open(os.path.join(index_dir, filename), 'r') as f:
            return [int(token) for token in f.readline().split()]

    return (
        _read_index('train.index'),
        _read_index('validation.index'),
        _read_index('test.index'),
    )

In [4]:
def load_sentence(sentence_dir):
    """Extract one line per 4-line record from the three generated-text files.

    ``generated-train.txt``, ``generated-validation.txt`` and
    ``generated-test.txt`` under ``sentence_dir`` are each read as consecutive
    records of four lines. The third line of every complete record is kept,
    stripped of surrounding whitespace. Trailing lines that do not fill a
    whole record are ignored.

    Returns:
        A tuple ``(gen_train, gen_valid, gen_test)`` of lists of strings.
    """
    record_size = 4
    kept_offset = 2  # zero-based position of the kept line inside a record
    per_split = []
    for filename in ('generated-train.txt', 'generated-validation.txt', 'generated-test.txt'):
        with open(os.path.join(sentence_dir, filename), 'r') as handle:
            rows = handle.readlines()
        # Stop at the end of the last complete record.
        stop = (len(rows) // record_size) * record_size
        per_split.append([rows[k].strip() for k in range(kept_offset, stop, record_size)])
    gen_train, gen_valid, gen_test = per_split
    return gen_train, gen_valid, gen_test

In [5]:
# Per-split positions into `reviews`, and the generated sentences per split.
train_index, valid_index, test_index = load_index("/remote-home/jianghaitian/stage1/Steam/1/")
gen_train, gen_valid, gen_test = load_sentence("save/Steam/")

# Ratings looked up by position in `reviews`, one list per split.
# NOTE(review): assumes the k-th generated sentence of a split belongs to the
# k-th index of that split -- confirm against the generation script.
gen_train_lab, gen_valid_lab, gen_test_lab = (
    [float(reviews[position]['rating']) for position in split_index]
    for split_index in (train_index, valid_index, test_index)
)

# Tokenize each split of generated sentences with the shared tokenizer settings.
gen_train_enc, gen_valid_enc, gen_test_enc = (
    tokenizer(sentences, truncation=True, padding=True, max_length=max_length)
    for sentences in (gen_train, gen_valid, gen_test)
)

In [6]:
# prepare dataset
# Shuffle a seeded *copy* rather than `reviews` itself. The generated-sentence
# labels are looked up by position (reviews[i] for i in train_index), so an
# in-place shuffle silently misaligns them whenever that lookup is re-run,
# and an unseeded shuffle makes the split impossible to reproduce.
SPLIT_SEED = 42
TRAIN_END, VALID_END = 500000, 650000  # slice boundaries of the shuffled list

shuffled_reviews = list(reviews)
random.Random(SPLIT_SEED).shuffle(shuffled_reviews)

# len(reviews)=789032, 6.3:2:1.7
train_split = shuffled_reviews[:TRAIN_END]
valid_split = shuffled_reviews[TRAIN_END:VALID_END]
test_split = shuffled_reviews[VALID_END:]

# Text is the space-joined token list; the label is the rating as a float.
train_texts = [' '.join(d['all_tokens']) for d in train_split]
train_labels = [float(d['rating']) for d in train_split]

valid_texts = [' '.join(d['all_tokens']) for d in valid_split]
valid_labels = [float(d['rating']) for d in valid_split]

test_texts = [' '.join(d['all_tokens']) for d in test_split]
test_labels = [float(d['rating']) for d in test_split]

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

In [7]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with one label per sample."""

    def __init__(self, encodings, labels):
        # `encodings` must offer .items() yielding (field name, per-sample values);
        # `labels` is indexed by sample position.
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is taken from the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap each split's encodings and labels in a torch Dataset.
train_dataset = NewsGroupsDataset(encodings=train_encodings, labels=train_labels)
valid_dataset = NewsGroupsDataset(encodings=valid_encodings, labels=valid_labels)
test_dataset = NewsGroupsDataset(encodings=test_encodings, labels=test_labels)

In [8]:
# Same Dataset wrapper for the generated sentences and their looked-up ratings.
gen_train_dtst = NewsGroupsDataset(encodings=gen_train_enc, labels=gen_train_lab)
gen_valid_dtst = NewsGroupsDataset(encodings=gen_valid_enc, labels=gen_valid_lab)
gen_test_dtst = NewsGroupsDataset(encodings=gen_test_enc, labels=gen_test_lab)

In [9]:
# Use the GPU when one is available and fall back to CPU otherwise; a
# hard-coded "cuda" raises on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# num_labels=1 requests a single-output head (used here to predict the rating
# as one float; metrics below are MSE/MAE).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error  # accuracy is just micro F-1
def compute_metrics(pred):
    """Compute regression metrics for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (targets) and ``predictions``
            (model outputs).

    Returns:
        Dict with the mean squared error under ``'mse'`` and the mean
        absolute error under ``'mae'``.
    """
    targets = pred.label_ids
    outputs = pred.predictions
    return {
        'mse': mean_squared_error(targets, outputs),
        'mae': mean_absolute_error(targets, outputs),
    }

In [11]:
# Training configuration: one epoch, with logging, evaluation and checkpointing
# all on the same 128-step cadence.
training_args = TrainingArguments(
    output_dir='./results',          # output directory (checkpoints are written here)
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size per device during evaluation
    warmup_steps=10,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes
                                     # NOTE(review): no metric_for_best_model is set, so the
                                     # library default decides what "best" means -- confirm
    logging_steps=128,                # log training metrics every `logging_steps` steps
    eval_steps=128,                   # evaluate every `eval_steps` steps
    save_steps=128,                   # save a checkpoint every `save_steps` steps
    evaluation_strategy="steps",     # evaluate on a step schedule (see eval_steps)
)

In [12]:
# Bundle the model, training arguments, datasets and metric callback into a Trainer.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [13]:
# Run training; the returned TrainOutput is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 500000
  Num Epochs = 1
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 977


Step,Training Loss,Validation Loss,Mse,Mae
128,2.8234,0.003662,0.003662,0.060437
256,0.0116,0.014648,0.014648,0.120976
384,0.0085,0.006818,0.006818,0.082539
512,0.0071,0.009896,0.009896,0.099453
640,0.0065,0.010763,0.010763,0.103722
768,0.0061,0.010269,0.010269,0.101316
896,0.0058,0.011001,0.011001,0.10487


***** Running Evaluation *****
  Num examples = 150000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-128
Configuration saved in ./results/checkpoint-128/config.json
Model weights saved in ./results/checkpoint-128/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 150000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-256
Configuration saved in ./results/checkpoint-256/config.json
Model weights saved in ./results/checkpoint-256/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 150000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-384
Configuration saved in ./results/checkpoint-384/config.json
Model weights saved in ./results/checkpoint-384/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 150000
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-512
Configuration saved in ./results/checkpoint-512/config.json
Model weights saved in ./results/checkpoint-512/pytorch_mo

TrainOutput(global_step=977, training_loss=0.376339291659934, metrics={'train_runtime': 1899.1399, 'train_samples_per_second': 263.277, 'train_steps_per_second': 0.514, 'total_flos': 3.2888586624e+16, 'train_loss': 0.376339291659934, 'epoch': 1.0})

In [14]:
# Metrics on the held-out split of the original review texts.
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 139032
  Batch size = 128


{'eval_loss': 0.0036597298458218575,
 'eval_mse': 0.003659711219370365,
 'eval_mae': 0.06042076647281647,
 'eval_runtime': 111.9866,
 'eval_samples_per_second': 1241.506,
 'eval_steps_per_second': 2.429,
 'epoch': 1.0}

In [15]:
# Same metrics on the training split, for comparison with the test numbers.
trainer.evaluate(train_dataset)

***** Running Evaluation *****
  Num examples = 500000
  Batch size = 128


{'eval_loss': 0.0036606364883482456,
 'eval_mse': 0.0036606553476303816,
 'eval_mae': 0.06042814627289772,
 'eval_runtime': 429.9466,
 'eval_samples_per_second': 1162.935,
 'eval_steps_per_second': 2.272,
 'epoch': 1.0}

In [16]:
# Score the generated train-split sentences against the ratings looked up via train_index.
trainer.evaluate(gen_train_dtst)

***** Running Evaluation *****
  Num examples = 211692
  Batch size = 128


{'eval_loss': 0.004061573185026646,
 'eval_mse': 0.004061552230268717,
 'eval_mae': 0.06359144300222397,
 'eval_runtime': 167.7996,
 'eval_samples_per_second': 1261.576,
 'eval_steps_per_second': 2.467,
 'epoch': 1.0}

In [17]:
# Score the generated validation-split sentences against the ratings looked up via valid_index.
trainer.evaluate(gen_valid_dtst)

***** Running Evaluation *****
  Num examples = 26680
  Batch size = 128


{'eval_loss': 0.004069428890943527,
 'eval_mse': 0.004070572555065155,
 'eval_mae': 0.06367037445306778,
 'eval_runtime': 17.8615,
 'eval_samples_per_second': 1493.717,
 'eval_steps_per_second': 2.967,
 'epoch': 1.0}

In [18]:
# Score the generated test-split sentences against the ratings looked up via test_index.
trainer.evaluate(gen_test_dtst)

***** Running Evaluation *****
  Num examples = 26635
  Batch size = 128


{'eval_loss': 0.004060035105794668,
 'eval_mse': 0.004065202549099922,
 'eval_mae': 0.0636281669139862,
 'eval_runtime': 17.8206,
 'eval_samples_per_second': 1494.619,
 'eval_steps_per_second': 2.974,
 'epoch': 1.0}