# Process

Frame the Aspect Term Extraction (ATE) sub-problem as a sequence tagging problem.

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the package is
# missing, so skip the mount instead of crashing: later cells fall back to
# os.getcwd() as the project root in that case.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Detect whether the notebook is running on Google Colab by probing for the
# Colab-only package.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    # Only a missing package means "not on Colab"; any other error should
    # surface instead of being silently swallowed.
    IN_COLAB = False
else:
    IN_COLAB = True

In [None]:
# Colab-only setup: download the NLTK 'punkt' tokenizer data and install the
# Hugging Face / evaluation packages this notebook imports.
# NOTE(review): the installs are unpinned; the recorded run resolved
# transformers 4.24.0 and datasets 2.7.1 -- consider pinning so a fresh run
# reproduces the same environment.
if IN_COLAB:
    import nltk
    nltk.download('punkt')
    !pip install transformers
    !pip install datasets
    !pip install seqeval
    !pip install evaluate
    !pip install sentencepiece

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 13.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 71.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 59.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 15.5 M

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
from logging import raiseExceptions

from nltk.tokenize import word_tokenize

from datasets import Dataset, DatasetDict, load_metric, Features, ClassLabel
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
# Project root: the mounted Drive folder on Colab, the current working
# directory everywhere else.
root_path = (
    '/content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project'
    if IN_COLAB
    else os.getcwd()
)

In [None]:
# Folder (under root_path) that holds the raw data files. The cells below load
# them and bring them into the format required for the sequence tagging problem.

data_path = os.path.join(root_path, 'data')

In [None]:
# Read the tab-separated 14lap train/test files from the 'towe' data folder.
towe_dir = os.path.join(data_path, 'towe')
lapt14_train = pd.read_csv(os.path.join(towe_dir, '14lap_train.tsv'), delimiter='\t')
lapt14_test = pd.read_csv(os.path.join(towe_dir, '14lap_test.tsv'), delimiter='\t')

In [None]:
def preprocess(dataframe):
    r"""Derive token and label columns for opinion-word tagging.

    The input frame must provide the string columns ``sentence``,
    ``target_tags`` and ``opinion_words_tags``. Both tag columns contain
    whitespace-separated ``token\TAG`` items; only the part after the last
    backslash is used, and the opinion tags must be one of ``O``/``B``/``I``.

    Columns added to the returned frame:

    * ``sentence_tokens`` -- ``word_tokenize`` output for ``sentence``
    * ``target_tags_``    -- positions tagged ``B``/``I`` in ``target_tags``
    * ``aspects``         -- sentence tokens at those positions, space-joined
    * ``tags``            -- 0/1 label per token (``B`` and ``I`` both map to 1),
      padded with 0 for the tokens of the appended aspect prompt
    * ``opinion_words_tags_`` / ``opinion_words`` -- positions and text of the
      tokens labelled 1
    * ``text``   -- ``sentence`` + ``" The aspect identified is: <aspects>"``
    * ``tokens`` -- ``word_tokenize`` output for ``text``

    Rows whose ``tags`` and ``tokens`` lengths differ are dropped; the original
    index labels of the remaining rows are kept.

    Args:
        dataframe: pandas DataFrame with the three input columns. It is not
            modified; the work happens on a copy.

    Returns:
        A new DataFrame holding the original columns plus the ones listed above.

    Raises:
        KeyError: if an opinion tag is not ``O``, ``B`` or ``I``.
    """
    # Work on a copy so re-running the calling cell never sees a half-mutated frame.
    dataframe = dataframe.copy()

    def _tag_suffixes(tagged):
        # "word\B other\O" -> ['B', 'O']
        return [item.split('\\')[-1] for item in tagged.split()]

    def _pick(tokens, positions):
        # Tag positions come from whitespace splitting while tokens come from
        # word_tokenize, so a position can fall outside the token list. Such
        # rows are expected to fail the final length check, so skip the
        # position here instead of raising IndexError.
        return ' '.join(tokens[pos] for pos in positions if pos < len(tokens))

    # Tokenize each sentence once; the length is reused below for padding.
    dataframe['sentence_tokens'] = dataframe['sentence'].apply(word_tokenize)

    dataframe['target_tags_'] = dataframe['target_tags'].apply(
        lambda tagged: [pos for pos, tag in enumerate(_tag_suffixes(tagged)) if tag in ('B', 'I')]
    )
    # Rows are walked with zip() rather than positional `row[0]` lookups on a
    # label-indexed Series, which pandas has deprecated.
    dataframe['aspects'] = [
        _pick(tokens, positions)
        for tokens, positions in zip(dataframe['sentence_tokens'], dataframe['target_tags_'])
    ]

    opinion_label_map = {'O': 0, 'B': 1, 'I': 1}

    dataframe['tags'] = dataframe['opinion_words_tags'].apply(
        lambda tagged: [opinion_label_map[tag] for tag in _tag_suffixes(tagged)]
    )
    dataframe['opinion_words_tags_'] = dataframe['tags'].apply(
        lambda labels: [pos for pos, label in enumerate(labels) if label == 1]
    )
    dataframe['opinion_words'] = [
        _pick(tokens, positions)
        for tokens, positions in zip(dataframe['sentence_tokens'], dataframe['opinion_words_tags_'])
    ]

    # Encode the target aspect word into the sentence
    dataframe['text'] = [
        sentence + f" The aspect identified is: {aspect}"
        for sentence, aspect in zip(dataframe['sentence'], dataframe['aspects'])
    ]
    dataframe['tokens'] = dataframe['text'].apply(word_tokenize)

    # Add additional 0 labels to match the length of the input text tokens.
    # A negative difference multiplies to an empty list, i.e. no padding.
    dataframe['tags'] = [
        labels + [0] * (len(text_tokens) - len(sentence_tokens))
        for labels, text_tokens, sentence_tokens in zip(
            dataframe['tags'], dataframe['tokens'], dataframe['sentence_tokens']
        )
    ]

    # Remove rows whose tags and tokens lengths do not match.
    keep = pd.Series(
        [len(labels) == len(text_tokens)
         for labels, text_tokens in zip(dataframe['tags'], dataframe['tokens'])],
        index=dataframe.index,
        dtype=bool,
    )
    return dataframe[keep]

In [None]:
# Run the same preprocessing over both splits (train first, then test).
lapt14_train, lapt14_test = (
    preprocess(split_frame) for split_frame in (lapt14_train, lapt14_test)
)

In [None]:
lapt14_train.head()

Unnamed: 0,s_id,sentence,target_tags,opinion_words_tags,sentence_tokens,target_tags_,aspects,tags,opinion_words_tags_,opinion_words,text,tokens
0,2339,I charge it at night and skip taking the cord ...,I\O charge\O it\O at\O night\O and\O skip\O ta...,I\O charge\O it\O at\O night\O and\O skip\O ta...,"[I, charge, it, at, night, and, skip, taking, ...","[16, 17]",battery life,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[15],good,I charge it at night and skip taking the cord ...,"[I, charge, it, at, night, and, skip, taking, ..."
1,2005,"it is of high quality , has a killer GUI , is ...","it\O is\O of\O high\O quality\B ,\O has\O a\O ...","it\O is\O of\O high\B quality\O ,\O has\O a\O ...","[it, is, of, high, quality, ,, has, a, killer,...",[4],quality,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[3],high,"it is of high quality , has a killer GUI , is ...","[it, is, of, high, quality, ,, has, a, killer,..."
2,2005,"it is of high quality , has a killer GUI , is ...","it\O is\O of\O high\O quality\O ,\O has\O a\O ...","it\O is\O of\O high\O quality\O ,\O has\O a\O ...","[it, is, of, high, quality, ,, has, a, killer,...",[9],GUI,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",[8],killer,"it is of high quality , has a killer GUI , is ...","[it, is, of, high, quality, ,, has, a, killer,..."
3,2005,"it is of high quality , has a killer GUI , is ...","it\O is\O of\O high\O quality\O ,\O has\O a\O ...","it\O is\O of\O high\O quality\O ,\O has\O a\O ...","[it, is, of, high, quality, ,, has, a, killer,...",[26],applications,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[25],good,"it is of high quality , has a killer GUI , is ...","[it, is, of, high, quality, ,, has, a, killer,..."
4,2005,"it is of high quality , has a killer GUI , is ...","it\O is\O of\O high\O quality\O ,\O has\O a\O ...","it\O is\O of\O high\O quality\O ,\O has\O a\O ...","[it, is, of, high, quality, ,, has, a, killer,...",[31],use,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[29],easy,"it is of high quality , has a killer GUI , is ...","[it, is, of, high, quality, ,, has, a, killer,..."


In [None]:
# lapt14_train['opinion_words_tags_']

In [None]:
# Create huggingface dataset
# Only the model inputs are kept. preserve_index=False stops the filtered
# (non-contiguous) pandas index from being stored as an extra
# `__index_level_0__` column in the datasets.
model_columns = ['tokens', 'tags']
towedata = DatasetDict({
    'train': Dataset.from_pandas(lapt14_train[model_columns], preserve_index=False),
    'test': Dataset.from_pandas(lapt14_test[model_columns], preserve_index=False),
})

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and spread word-level tags over sub-word tokens.

    Uses the module-level ``tokenizer``. For every row, positions whose word id
    is ``None`` and every continuation piece of a word get the label ``-100``;
    the first piece of each word gets that word's tag.

    Args:
        examples: batch mapping with ``"tokens"`` (list of word lists) and
            ``"tags"`` (list of per-word label lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per row.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_ids, word_tags):
        aligned = []
        last_seen = None
        for current in word_ids:
            # A label is emitted only where a new word starts.
            starts_word = current is not None and current != last_seen
            aligned.append(word_tags[current] if starts_word else -100)
            last_seen = current
        return aligned

    tokenized_inputs["labels"] = [
        _align(tokenized_inputs.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["tags"])
    ]
    return tokenized_inputs

In [None]:
# Load tokenizer and model from the same pretrained checkpoint.
# NOTE(review): no num_labels is passed, so the classification head keeps the
# library default size -- confirm it matches the two entries of label_list.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

In [None]:
# Align dataset tags and tokens to handle subwords.
# batched=True hands tokenize_and_align_labels a batch of rows per call, which
# is what its loop over examples["tags"] expects.
tokenized_towedata = towedata.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Data collation for the task: the collator is given the tokenizer and is
# passed to the Trainer below to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Directory under root_path used as TrainingArguments(output_dir=...) below.
model_out_path = os.path.join(root_path, 'towemodel')

In [None]:
# Label id -> tag string used when decoding predictions for seqeval.
# Id 1 stands for any opinion-word token: preprocess() maps both 'B' and 'I' to 1.
label_list = ['O', 'B']
# seqeval metric object; its compute(predictions=..., references=...) is called
# by compute_metrics and by the scoring cells further down.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall precision/recall/F1/accuracy.

    Args:
        p: pair ``(predictions, labels)``; the class is chosen with an argmax
            over axis 2 of ``predictions``, and ``labels`` holds label ids with
            ``-100`` at positions to ignore.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries of the module-level ``metric``.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    # Remove ignored index (special tokens), keeping predictions and references
    # aligned position by position.
    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Set training arguments
training_args = TrainingArguments(
    output_dir=model_out_path,
    evaluation_strategy="epoch",  # evaluate once per epoch (see the per-epoch table in the training output)
    save_strategy='no',  # no checkpoints during training; the model is saved explicitly afterwards
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
)

# NOTE(review): eval_dataset is the test split, so the per-epoch metrics are
# measured on the same data that is later reported as the test result -- there
# is no separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_towedata["train"],
    eval_dataset=tokenized_towedata["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fit the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, tags. If tokens, __index_level_0__, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1631
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 408
  Number of trainable parameters = 108893186
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.076202,0.75,0.743243,0.746606,0.970289
2,No log,0.065945,0.773963,0.812312,0.792674,0.974976
3,No log,0.066177,0.812596,0.794294,0.803341,0.977098
4,No log,0.072096,0.81761,0.780781,0.798771,0.976833


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, tags. If tokens, __index_level_0__, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 479
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, tags. If tokens, __index_level_0__, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 479
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, tags. If tokens, __index_level_0__, tags are not expected by `BertForTokenClas

TrainOutput(global_step=408, training_loss=0.06275356049631156, metrics={'train_runtime': 93.6761, 'train_samples_per_second': 69.644, 'train_steps_per_second': 4.355, 'total_flos': 203736449505564.0, 'train_loss': 0.06275356049631156, 'epoch': 4.0})

In [None]:
# Write the fine-tuned model to the Trainer's output_dir (model_out_path).
# With save_strategy='no' above, this explicit call is the only save in the notebook.
trainer.save_model()

Saving model checkpoint to /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/towemodel
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/towemodel/config.json
Model weights saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/towemodel/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/towemodel/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/towemodel/special_tokens_map.json


In [None]:
# Score the fine-tuned model on the training split.
tr_predictions, tr_labels, _ = trainer.predict(tokenized_towedata["train"])
predictions = np.argmax(tr_predictions, axis=2)

# Remove ignored index (special tokens): keep only positions whose reference
# label is not -100, for predictions and references alike.
true_predictions, true_labels = [], []
for predicted_row, gold_row in zip(predictions, tr_labels):
    kept_pairs = [
        (predicted_id, gold_id)
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    true_predictions.append([label_list[predicted_id] for predicted_id, _gold in kept_pairs])
    true_labels.append([label_list[gold_id] for _pred, gold_id in kept_pairs])

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, tags. If tokens, __index_level_0__, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1631
  Batch size = 16


{'_': {'precision': 0.9223892726533929,
  'recall': 0.9684300341296929,
  'f1': 0.9448491155046826,
  'number': 2344},
 'overall_precision': 0.9223892726533929,
 'overall_recall': 0.9684300341296929,
 'overall_f1': 0.9448491155046826,
 'overall_accuracy': 0.9940034395365677}

In [None]:
# Score the fine-tuned model on the test split.
te_predictions, te_labels, _ = trainer.predict(tokenized_towedata["test"])
predictions = np.argmax(te_predictions, axis=2)

# Remove ignored index (special tokens): keep only positions whose reference
# label is not -100, for predictions and references alike.
true_predictions, true_labels = [], []
for predicted_row, gold_row in zip(predictions, te_labels):
    kept_pairs = [
        (predicted_id, gold_id)
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    true_predictions.append([label_list[predicted_id] for predicted_id, _gold in kept_pairs])
    true_labels.append([label_list[gold_id] for _pred, gold_id in kept_pairs])

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, __index_level_0__, tags. If tokens, __index_level_0__, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 479
  Batch size = 16


{'_': {'precision': 0.8176100628930818,
  'recall': 0.7807807807807807,
  'f1': 0.7987711213517665,
  'number': 666},
 'overall_precision': 0.8176100628930818,
 'overall_recall': 0.7807807807807807,
 'overall_f1': 0.7987711213517665,
 'overall_accuracy': 0.9768326111946237}

In [None]:
# Inspect one test example: reference labels, predictions and extracted words.
idx = 244
example = tokenized_towedata["test"][idx]
tks = example['tokens']
# `predictions` holds one entry per sub-word token, while `tks` holds words.
# Slicing by len(tks) shifts every prediction after a word that was split into
# several pieces. Keep only positions with a real label instead: special
# tokens and continuation pieces are marked -100, which leaves exactly one
# prediction per word. zip() also stops before any batch padding.
preds = np.array([
    predicted_id
    for predicted_id, label_id in zip(predictions[idx], example['labels'])
    if label_id != -100
])
print('Labels: ', example['labels'][1:-1])
print('Preds: ', preds)
print(tks)
aspect_terms = [word for word, predicted_id in zip(tks, preds) if predicted_id == 1]
aspect_terms

Labels:  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Preds:  [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['I', 'needed', 'a', 'laptop', 'with', 'big', 'storage', ',', 'a', 'nice', 'screen', 'and', 'fast', 'so', 'I', 'can', 'photoshop', 'without', 'any', 'problem', '.', 'The', 'aspect', 'identified', 'is', ':', 'storage']


['big']

In [None]:
# Ad-hoc prediction on a hand-written sentence.
# The prompt suffix must match the template used in preprocess():
# "<sentence> The aspect identified is: <aspect>" (note the colon).
sample_text = 'The restaurant has an incredible selection of beverages. The aspect identified is: beverages'
# sample_text = 'The cab ride was bumpy but the driver was very friendly. The aspect identified is: cab ride'
# sample_text = 'The movie was not that great but the actor was really handsome. The aspect identified is: movie'
tokens = word_tokenize(sample_text)
# Placeholder tags/labels: tokenize_and_align_labels needs a 'tags' column, but
# the values are irrelevant because only the predictions are inspected.
dummy_tags = [1]*len(tokens)
dummy_labels = [1]*len(tokens)
temp_df = pd.DataFrame({'tokens':[tokens], 'tags':[dummy_tags], 'labels':[dummy_labels]})

tokenized_df = Dataset.from_pandas(temp_df).map(tokenize_and_align_labels, batched=True)

preds_, labels_, _ = trainer.predict(tokenized_df)
preds_ = np.argmax(preds_, axis=2)


idx = 0
tks_ = tokenized_df[idx]['tokens']
print('Tokens: ', tks_)
sample_labels = tokenized_df[idx]['labels']
# One prediction per word: drop special tokens and continuation pieces (label
# -100) rather than slicing by word count, which misaligns once a word is
# split into several sub-word tokens.
preds_id = np.array([
    predicted_id
    for predicted_id, label_id in zip(preds_[idx], sample_labels)
    if label_id != -100
])
print('Labels: ', sample_labels[1:-1])
print('Preds: ', preds_id)
aspect_terms_sample = [word for word, predicted_id in zip(tks_, preds_id) if predicted_id == 1]
aspect_terms_sample

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 16


Tokens:  ['The', 'restaurant', 'has', 'an', 'incredible', 'selection', 'of', 'beverages', '.', 'The', 'aspect', 'identified', 'is', 'beverages']
Labels:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Preds:  [0 0 0 0 1 0 0 0 0 0 0 0 0 0]


['incredible']