# Process

Frame the Aspect Term Extraction (ATE) sub-problem as a sequence tagging problem.

In [None]:
# Mount Google Drive when running on Colab. Outside Colab the package is not
# importable; later cells then fall back to the current working directory.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; any other exception should surface.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [None]:
# Colab-only setup: download the NLTK 'punkt' package and install the
# third-party libraries used by the rest of the notebook.
if IN_COLAB:
    import nltk
    nltk.download('punkt')
    # NOTE(review): the installs are unpinned; the recorded output shows
    # transformers 4.24.0 and datasets 2.7.0 were resolved at the time.
    !pip install transformers
    !pip install datasets
    !pip install seqeval
    !pip install evaluate
    !pip install sentencepiece

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 30.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 67.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.0-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm

from nltk.tokenize import word_tokenize

from datasets import Dataset, DatasetDict, load_metric, Features, ClassLabel
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
# Project root: the mounted Drive folder on Colab, otherwise the directory the
# notebook is started from.
root_path = (
    '/content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project'
    if IN_COLAB
    else os.getcwd()
)

In [None]:
# Folder holding the dataset files. The cells below convert the raw data into
# the format required for the sequence tagging problem.

data_path = os.path.join(root_path, 'data')

In [None]:
def parse_xml_data(root):
    """Extract the text, aspect terms and aspect categories of each sentence.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element whose direct ``<sentence>`` children are parsed.

    Returns
    -------
    pandas.DataFrame
        One row per ``<sentence>`` with the columns

        * ``text``    - text of the sentence's ``<text>`` child (``None`` when
          the sentence has no such child),
        * ``terms``   - list of ``(term, polarity)`` tuples taken from
          ``<aspectTerms>/<aspectTerm>``,
        * ``aspects`` - list of ``(category, polarity)`` tuples taken from
          ``<aspectCategories>/<aspectCategory>``.

        The three columns are present even when ``root`` has no sentences.
    """
    reviews = []

    for sentence in root.findall("sentence"):
        # Compare with None explicitly: the truth value of an Element reflects
        # whether it has children, and relying on it is deprecated.
        terms_node = sentence.find("aspectTerms")
        aspect_terms = []
        if terms_node is not None:
            aspect_terms = [
                (node.get("term"), node.get("polarity"))
                for node in terms_node.findall("aspectTerm")
            ]

        categories_node = sentence.find("aspectCategories")
        aspect_categories = []
        if categories_node is not None:
            aspect_categories = [
                (node.get("category"), node.get("polarity"))
                for node in categories_node.findall("aspectCategory")
            ]

        # Look the <text> child up by name instead of assuming it comes first.
        text_node = sentence.find("text")

        reviews.append({
            "text": text_node.text if text_node is not None else None,
            "terms": aspect_terms,
            "aspects": aspect_categories,
        })

    return pd.DataFrame(reviews, columns=["text", "terms", "aspects"])

In [None]:
# Parse the raw train/test XML files and cache the resulting frames as pickles.
xml2csv_dir = os.path.join(data_path, 'xml2csv')
# to_pickle does not create missing parent folders, so make sure the cache
# directory exists before writing into it.
os.makedirs(xml2csv_dir, exist_ok=True)

res14_train_tree = ET.parse(os.path.join(data_path, 'raw', '14res_train.xml'))
root = res14_train_tree.getroot()
res14tr = parse_xml_data(root)
res14tr.to_pickle(os.path.join(xml2csv_dir, '14res_train.pkl'))

res14_test_tree = ET.parse(os.path.join(data_path, 'raw', '14res_test.xml'))
root = res14_test_tree.getroot()
res14te = parse_xml_data(root)
res14te.to_pickle(os.path.join(xml2csv_dir, '14res_test.pkl'))

In [None]:
# Reload the cached train/test frames from the xml2csv folder.
res14tr, res14te = (
    pd.read_pickle(os.path.join(data_path, 'xml2csv', pickle_name))
    for pickle_name in ('14res_train.pkl', '14res_test.pkl')
)

In [None]:
# Keep only the term string of every (term, polarity) pair; a sentence
# without aspect terms yields an empty list.
res14tr['terms_list'] = res14tr['terms'].apply(lambda pairs: [pair[0] for pair in pairs])
res14te['terms_list'] = res14te['terms'].apply(lambda pairs: [pair[0] for pair in pairs])

In [None]:
# Preview the first rows of the parsed training frame.
res14tr.head()

Unnamed: 0,text,terms,aspects,terms_list
0,But the staff was so horrible to us.,"[(staff, negative)]","[(service, negative)]",[staff]
1,"To be completely fair, the only redeeming fact...","[(food, positive)]","[(food, positive), (anecdotes/miscellaneous, n...",[food]
2,"The food is uniformly exceptional, with a very...","[(food, positive), (kitchen, positive), (menu,...","[(food, positive)]","[food, kitchen, menu]"
3,Where Gabriela personaly greets you and recomm...,[],"[(service, positive)]",[]
4,"For those that go once and don't enjoy it, all...",[],"[(anecdotes/miscellaneous, positive)]",[]


In [None]:
def add_tags(dataframe, tokenize=None, show_progress=True):
    """Add word tokens and binary aspect-term tags to ``dataframe``.

    Every sentence in the ``text`` column is stripped of apostrophes,
    lower-cased and tokenized. Each aspect term in ``terms_list`` is
    normalised the same way and matched against the sentence as a contiguous
    token sequence, so multi-word terms are tagged as well. For every listed
    term the first occurrence that is not tagged yet is marked, which lets a
    term that is listed twice mark two occurrences.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a ``text`` column (str) and a ``terms_list`` column (list of str).
        It is modified in place.
    tokenize : callable, optional
        Maps a string to a list of tokens. Defaults to ``word_tokenize``.
    show_progress : bool, optional
        Wrap the row iteration in a ``tqdm`` progress bar (default ``True``).

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` with two new columns: ``tokens`` (list of str)
        and ``tags`` (list of int, 1 for aspect-term tokens and 0 otherwise).
    """
    if tokenize is None:
        tokenize = word_tokenize

    def _normalise(text):
        # Apostrophes are dropped before tokenizing so the sentence and the
        # aspect term are split in exactly the same way.
        return tokenize(text.replace("'", "").lower())

    rows = zip(dataframe['text'], dataframe['terms_list'])
    if show_progress:
        # Pass the total explicitly: a zip has no length for tqdm to read.
        rows = tqdm(rows, total=len(dataframe))

    tokens_list, tags_list = [], []
    for text, asp_terms in rows:
        sent_tokens = _normalise(text)
        tags = [0] * len(sent_tokens)

        for asp_term in asp_terms:
            term_tokens = _normalise(asp_term)
            width = len(term_tokens)
            if width == 0:
                continue
            for start in range(len(sent_tokens) - width + 1):
                window = slice(start, start + width)
                # Skip spans already claimed by an earlier term so repeated
                # terms map onto successive occurrences.
                if sent_tokens[window] == term_tokens and not any(tags[window]):
                    tags[window] = [1] * width
                    break

        tokens_list.append(sent_tokens)
        tags_list.append(tags)

    dataframe['tokens'] = tokens_list
    dataframe['tags'] = tags_list
    return dataframe

In [None]:
# Attach the 'tokens' and 'tags' columns to both splits.
res14tr, res14te = add_tags(res14tr), add_tags(res14te)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Spot-check one training row: aspect terms, raw text, tokens and tags.
idx = 1000
tuple(res14tr[column].iloc[idx] for column in ('terms_list', 'text', 'tokens', 'tags'))

(['dinner'],
 "I'd call it an 'italian dinner'.",
 ['id', 'call', 'it', 'an', 'italian', 'dinner', '.'],
 [0, 0, 0, 0, 0, 1, 0])

In [None]:
# Build a Hugging Face DatasetDict holding only the 'tokens' and 'tags'
# columns of each split.
atedata = DatasetDict({
    split_name: Dataset.from_pandas(frame[['tokens', 'tags']])
    for split_name, frame in (('train', res14tr), ('test', res14te))
})

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level tags with the result.

    ``examples`` is a batch with a ``tokens`` entry (list of word lists) and a
    ``tags`` entry (list of per-word tag lists). The module-level ``tokenizer``
    is applied with ``is_split_into_words=True``. In the returned encoding the
    first piece of every word carries that word's tag under ``labels``; special
    tokens and the remaining pieces of a word get ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None or word == last_word:
                # Special token, or a continuation piece of the previous word.
                row_labels.append(-100)
            else:
                # First piece of a new word: inherit the word's tag.
                row_labels.append(word_tags[word])
            last_word = word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Load tokenizer and model from the same checkpoint.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# State the size and label names of the classification head explicitly rather
# than relying on the library default; ids follow the 0/1 values in 'tags'.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label={0: "O", 1: "AT"},
    label2id={"O": 0, "AT": 1},
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
# Tokenize both splits and align the word-level tags with the resulting
# sub-word tokens; this adds the 'labels' column used for training.
tokenized_atedata = atedata.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Batch collator for token classification, built from the same tokenizer
# (presumably pads inputs and labels per batch; see the transformers docs).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Folder under the project root that is passed to the trainer as output_dir.
model_out_path = os.path.join(root_path, 'atemodel')

In [None]:
# Label names indexed by class id: 0 -> 'O' (not an aspect term), 1 -> 'AT' (aspect term).
label_list = ['O', 'AT']
# seqeval metric object used to score predicted tag sequences.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Score a batch of predictions with the module-level seqeval ``metric``.

    ``p`` unpacks into ``(predictions, labels)``. The arg-max over the last
    axis of ``predictions`` gives a class id per position; positions whose
    label is ``-100`` are dropped and the remaining ids are mapped to names
    through ``label_list``. Returns the overall precision, recall, F1 and
    accuracy reported by the metric.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted, gold):
        # Keep only positions that carry a real label (ignored index is -100).
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

  


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Training configuration.
training_args = TrainingArguments(
    output_dir=model_out_path,
    # Optimisation settings
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch; no intermediate checkpoints are saved
    evaluation_strategy="epoch",
    save_strategy='no',
)

# Trainer wiring: model, data splits, batching and metric computation.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_atedata["train"],
    eval_dataset=tokenized_atedata["test"],
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune the model on the train split; with evaluation_strategy="epoch"
# the test split is evaluated after every epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3041
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 764
  Number of trainable parameters = 108893186


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.042667,0.871981,0.895782,0.883721,0.984874
2,No log,0.042054,0.905732,0.882134,0.893777,0.986564
3,0.017400,0.048119,0.872216,0.923077,0.896926,0.986403
4,0.017400,0.05188,0.886724,0.903226,0.894899,0.986403


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 800
  Ba

TrainOutput(global_step=764, training_loss=0.013457335882786057, metrics={'train_runtime': 123.7757, 'train_samples_per_second': 98.275, 'train_steps_per_second': 6.172, 'total_flos': 244422702117408.0, 'train_loss': 0.013457335882786057, 'epoch': 4.0})

In [None]:
# Save the fine-tuned model; with no argument the files go to the trainer's
# output_dir (see the paths in the log below).
trainer.save_model()

Saving model checkpoint to /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/atemodel
Configuration saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/atemodel/config.json
Model weights saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/atemodel/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/atemodel/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Knowledge/MSIT/IFT598 - NLP/Final Project/atemodel/special_tokens_map.json


In [None]:
# Score the fine-tuned model on the training split.
tr_predictions, tr_labels, _ = trainer.predict(tokenized_atedata["train"])
predictions = np.argmax(tr_predictions, axis=2)

# Keep only positions with a real label (ignored index is -100) and map the
# class ids to label names.
true_predictions = [
    [label_list[predicted_id] for predicted_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    for predicted_row, gold_row in zip(predictions, tr_labels)
]
true_labels = [
    [label_list[gold_id] for gold_id in gold_row if gold_id != -100]
    for gold_row in tr_labels
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3041
  Batch size = 16




{'T': {'precision': 0.9992655159750276,
  'recall': 0.9974340175953079,
  'f1': 0.9983489268024216,
  'number': 2728},
 'overall_precision': 0.9992655159750276,
 'overall_recall': 0.9974340175953079,
 'overall_f1': 0.9983489268024216,
 'overall_accuracy': 0.999805741420246}

In [None]:
# Score the fine-tuned model on the test split.
te_predictions, te_labels, _ = trainer.predict(tokenized_atedata["test"])
predictions = np.argmax(te_predictions, axis=2)

# Keep only positions with a real label (ignored index is -100) and map the
# class ids to label names.
true_predictions = [
    [label_list[predicted_id] for predicted_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    for predicted_row, gold_row in zip(predictions, te_labels)
]
true_labels = [
    [label_list[gold_id] for gold_id in gold_row if gold_id != -100]
    for gold_row in te_labels
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 800
  Batch size = 16




{'T': {'precision': 0.8867235079171741,
  'recall': 0.9032258064516129,
  'f1': 0.8948985863552551,
  'number': 806},
 'overall_precision': 0.8867235079171741,
 'overall_recall': 0.9032258064516129,
 'overall_f1': 0.8948985863552551,
 'overall_accuracy': 0.9864027677206533}

In [None]:
# Inspect the predictions for a single test sentence.
idx = 245
example = tokenized_atedata["test"][idx]
tks = example['tokens']
gold_labels = np.array(example['labels'])
# Labels are -100 on special tokens and on continuation word-pieces, so this
# mask selects exactly one position per word. Slicing off the first and last
# position only lines up when no word is split into several pieces.
word_mask = gold_labels != -100
# Predictions are padded to the batch length; cut them to this example first.
preds = predictions[idx][:len(gold_labels)][word_mask]
print('Labels: ', gold_labels[word_mask].tolist())
print('Preds: ', preds)
print(tks)
aspect_terms = [token for token, pred in zip(tks, preds) if pred == 1]
aspect_terms

Labels:  [0, 1, 0, 0, 0, 0, 1, 0, 0, 0]
Preds:  [0 1 0 0 0 0 1 0 0 0]
['the', 'food', 'is', 'great', 'and', 'the', 'prices', 'are', 'reasonable', '.']


['food', 'prices']

In [None]:
# Run the fine-tuned model on a free-text sentence.
sample_text = 'The restaurant has an incredible selection of beverages.'
# Apply the same normalisation as add_tags (drop apostrophes, lower-case) so
# the words are split the way they were during training.
tokens = word_tokenize(sample_text.replace("'", "").lower())
# tokenize_and_align_labels needs a 'tags' column; the values are placeholders
# because there is no gold annotation for this sentence.
dummy_tags = [1]*len(tokens)
dummy_labels = [1]*len(tokens)
temp_df = pd.DataFrame({'tokens':[tokens], 'tags':[dummy_tags], 'labels':[dummy_labels]})

tokenized_df = Dataset.from_pandas(temp_df).map(tokenize_and_align_labels, batched=True)

preds_, labels_, _ = trainer.predict(tokenized_df)
preds_ = np.argmax(preds_, axis=2)


idx = 0
tks_ = tokenized_df[idx]['tokens']
print('Tokens: ', tks_)
sample_labels = np.array(tokenized_df[idx]['labels'])
# -100 marks special tokens and continuation word-pieces; masking them keeps
# one prediction per word even when a word is split into several pieces.
sample_word_mask = sample_labels != -100
preds_id = preds_[idx][:len(sample_labels)][sample_word_mask]
print('Labels: ', sample_labels[sample_word_mask].tolist())
print('Preds: ', preds_id)
aspect_terms_sample = [token for token, pred in zip(tks_, preds_id) if pred == 1]
aspect_terms_sample

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 16


Tokens:  ['The', 'restaurant', 'has', 'an', 'incredible', 'selection', 'of', 'beverages', '.']
Labels:  [1, 1, 1, 1, 1, 1, 1, 1, 1]
Preds:  [0 0 0 0 0 0 0 1 0]


['beverages']