# Failure Identification In Customer Reviews

This notebook contains the code used to fine-tune several transformer-based models.

## 1. Import packages and load data

In [11]:
import datasets
import evaluate
import numpy as np
import pandas as pd
import torch

from IPython.display import display
from sklearn.metrics import balanced_accuracy_score
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, DataCollatorWithPadding
from torch import nn
from transformers import pipeline

# Show which accelerator is actually usable. Constructing a torch.device does
# not check availability, so the original expression always displayed "cuda",
# even on a CPU-only machine. Fall back to the CPU explicitly and keep the
# object under a name instead of discarding it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# Read the training split into a pandas DataFrame and show it for a quick look
train_csv_path = "training_data.csv"
df_train = pd.read_csv(train_csv_path)
display(df_train)

Unnamed: 0,review_id,tablet_id,comment,stars,Failure class
0,Big_dataset_1,Big_dataset,Customer support wants to take the computer ba...,1,TF
1,Big_dataset_2,Big_dataset,"Overall, a good experience. Fast and responsiv...",3,TF
2,Big_dataset_3,Big_dataset,I got this Chromebook recent for a trip I need...,5,
3,Big_dataset_4,Big_dataset,Just what I wanted\n,5,
4,Big_dataset_5,Big_dataset,"Good battery life, thin and lightweight, handy...",4,TF
...,...,...,...,...,...
1210,Big_dataset_1211,Big_dataset,Way to go Asus! This chromebook is so beautif...,5,
1211,Big_dataset_1212,Big_dataset,The very first Chromebook I ever owned was the...,4,
1212,Big_dataset_1213,Big_dataset,"Snappy little chromebook!! It's light weight, ...",5,
1213,Big_dataset_1214,Big_dataset,This laptop is truly amazing. I spent a ton of...,5,


In [6]:
# Load the three CSV splits into one Hugging Face DatasetDict
split_files = {
    "train": "training_data.csv",
    "val": "validation_data.csv",
    "test": "test_data.csv",
}
ds = datasets.load_dataset("csv", data_files=split_files)

# Add a column for the label (1=failure, 0=non-failure)
def add_label(data):
    """Build the binary ``label`` column for a batch of rows.

    A row gets 1 when its "Failure class" value is "TF" or "IF", and 0 for
    anything else (including missing values).
    """
    failure_classes = ["TF", "IF"]
    labels = []
    for failure_class in data["Failure class"]:
        labels.append(int(failure_class in failure_classes))
    return {"label": labels}
# Label every split; with batched=True `add_label` receives whole columns
ds = ds.map(function=add_label, batched=True)
ds

Using custom data configuration default-ef919baa748c99f1
Found cached dataset csv (/home/soukaina/.cache/huggingface/datasets/csv/default-ef919baa748c99f1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 3/3 [00:00<00:00, 213.19it/s]
Loading cached processed dataset at /home/soukaina/.cache/huggingface/datasets/csv/default-ef919baa748c99f1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d1f15d554317dd5e.arrow
Loading cached processed dataset at /home/soukaina/.cache/huggingface/datasets/csv/default-ef919baa748c99f1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-8f6efcba5ff496ee.arrow
Loading cached processed dataset at /home/soukaina/.cache/huggingface/datasets/csv/default-ef919baa748c99f1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5e799483a3516507.arrow


DatasetDict({
    train: Dataset({
        features: ['review_id', 'tablet_id', 'comment', 'stars', 'Failure class', 'label'],
        num_rows: 1215
    })
    val: Dataset({
        features: ['review_id', 'tablet_id', 'comment', 'stars', 'Failure class', 'label'],
        num_rows: 600
    })
    test: Dataset({
        features: ['review_id', 'tablet_id', 'comment', 'stars', 'Failure class', 'label'],
        num_rows: 600
    })
})

## 2. Helper functions

Here, we define functions which will help us fine-tune and evaluate the models.

In [7]:
# Load the tokenizer that belongs to the pretrained distilbert checkpoint and
# build the padding collator that batches its output
distilbert_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(distilbert_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Balanced accuracy wrapped as a Hugging Face `evaluate` module, so that it can
# be combined with the library's built-in metrics
class BalancedAccuracy(evaluate.EvaluationModule):
    """Evaluation module that reports scikit-learn's balanced accuracy."""

    def _info(self):
        """Declare the inputs: int32 ``predictions`` and ``references``."""
        input_schema = datasets.Features(
            {
                "predictions": datasets.Value("int32"),
                "references": datasets.Value("int32"),
            }
        )
        return evaluate.MetricInfo(
            description="",
            citation="",
            features=input_schema,
        )

    def _compute(self, predictions, references):
        """Return ``{"balanced_accuracy": <float>}`` for the given labels."""
        score = balanced_accuracy_score(references, predictions)
        return {"balanced_accuracy": float(score)}


# Metrics reported at every evaluation: accuracy, F1 and balanced accuracy
metric_modules = ["accuracy", "f1", BalancedAccuracy()]
metrics = evaluate.combine(metric_modules)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 3.56MB/s]


In [10]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit; predictions and labels are
    then handed to the combined ``metrics`` object.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return metrics.compute(predictions=predicted_classes, references=gold_labels)


def run_training(
    model,
    *,
    output_dir,
    ds_,
    tokenizer,
    data_collator,
    lr=2e-5,
    batch_size=32,
    gradient_accumulation_steps=1,
    epochs=6,
    make_trainer=Trainer,
    seed=42,
):
    """Fine-tune ``model`` on ``ds_["train"]`` and validate it on ``ds_["val"]``.

    Args:
        model: The model to fine-tune. It is handed to the trainer as is.
        output_dir: Directory in which the trainer writes its checkpoints.
        ds_: Mapping with (at least) a "train" and a "val" dataset.
        tokenizer: Tokenizer forwarded to the trainer.
        data_collator: Collator forwarded to the trainer.
        lr: Learning rate.
        batch_size: Per-device batch size, used for both training and evaluation.
        gradient_accumulation_steps: Number of batches accumulated per update.
        epochs: Number of training epochs.
        make_trainer: Callable building the trainer; defaults to ``Trainer``.
        seed: Random seed passed to ``TrainingArguments``. The default of 42 is
            the library default, so existing callers behave as before.

    Returns:
        The trainer after ``train()`` has finished, so that callers can read
        the logged metrics or run further evaluations. Previously nothing was
        returned and that information was lost.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # Evaluate and checkpoint once per epoch; the two strategies have to
        # match because the best model is reloaded at the end.
        evaluation_strategy="epoch",
        num_train_epochs=epochs,
        weight_decay=0.01,
        log_level="warning",
        logging_strategy="no",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        seed=seed,
    )
    trainer = make_trainer(
        model=model,
        args=training_args,
        train_dataset=ds_["train"],
        eval_dataset=ds_["val"],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    return trainer

## 3. Fine-tuning different models

For each model, we started with a pretrained model and fine-tuned it on our dataset.
We ran each model 5 times to take into account the possible variance of the training process.

### 3.1. Distilbert model (dirty version)

- Pretrained model name: `distilbert-base-uncased`
- Link: <https://huggingface.co/distilbert-base-uncased>
- Preprocessing:
  - Strip spaces
  - During the tokenization process, the review is truncated to 512 tokens

In [None]:
def preprocess_comment(c: str) -> str:
    """Return the review text without leading or trailing whitespace."""
    stripped = c.strip()
    return stripped


def preprocess_function(data):
    """Tokenize a batch of reviews and attach the binary failure label.

    The comments are cleaned with ``preprocess_comment`` and tokenized with
    truncation enabled; the label is 1 for "TF"/"IF" rows and 0 otherwise.
    """
    cleaned = [preprocess_comment(text) for text in data["comment"]]
    encoded = dict(tokenizer(cleaned, truncation=True))
    encoded["label"] = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    return encoded


ds_base_noclean = ds.map(preprocess_function, batched=True)

# Load a fresh copy of the pretrained checkpoint for every run. Training
# updates the model object in place, so sharing one instance across the loop
# would make every run after the first continue from the previous run's
# fine-tuned weights instead of being an independent repetition.
for i in range(5):
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
    run_training(
        model,
        output_dir=f"./results-base-noclean{i}",
        ds_=ds_base_noclean,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

### 3.2. Distilbert model (clean version)

- Pretrained model name: `distilbert-base-uncased`
- Link: <https://huggingface.co/distilbert-base-uncased>
- Preprocessing:
  - Remove the string \_x000D\_ present at the start and end of some reviews
  - Strip spaces
  - During the tokenization process, the review is truncated to 512 tokens

In [None]:
def preprocess_comment(c: str) -> str:
    """Drop every "_x000D_" marker, then trim surrounding whitespace."""
    without_marker = "".join(c.split("_x000D_"))
    return without_marker.strip()


def preprocess_function(data):
    """Clean and tokenize a batch of reviews and add the binary label.

    Tokenization truncates over-long reviews; the label is 1 when the
    "Failure class" is "TF" or "IF" and 0 otherwise.
    """
    cleaned = [preprocess_comment(text) for text in data["comment"]]
    labels = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    features = dict(tokenizer(cleaned, truncation=True))
    features["label"] = labels
    return features


ds_base = ds.map(preprocess_function, batched=True)

# Re-create the model inside the loop: training modifies it in place, so one
# shared instance would chain the five runs together (each starting from the
# previous run's fine-tuned weights) rather than repeating the experiment.
for i in range(5):
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
    run_training(
        model,
        output_dir=f"./results-base{i}",
        ds_=ds_base,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

### 3.3. Distilbert with score

- Pretrained model name: `distilbert-base-uncased`
- Link: <https://huggingface.co/distilbert-base-uncased>
- Preprocessing:
  - Remove the string \_x000D\_ present at the start and end of some reviews
  - Strip spaces
  - Add the score to the review text. For example, the review "I like this product" with 4 stars becomes "Score: 4. I like this product".
  - During the tokenization process, the review is truncated to 512 tokens

In [None]:
def preprocess_comment(c: str, stars: int) -> str:
    """Clean the review and prefix it with its star rating.

    Every "_x000D_" marker is removed and surrounding whitespace trimmed; the
    result is returned as "Score: <stars>. <review>".
    """
    cleaned = c.replace("_x000D_", "").strip()
    return "Score: " + str(stars) + ". " + cleaned


def preprocess_function(data):
    """Tokenize score-prefixed reviews for a batch and add the binary label."""
    comments = []
    for text, rating in zip(data["comment"], data["stars"]):
        comments.append(preprocess_comment(text, rating))
    features = dict(tokenizer(comments, truncation=True))
    features["label"] = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    return features


ds_score = ds.map(preprocess_function, batched=True)

# Every run gets its own freshly loaded pretrained model. Reusing a single
# instance would let runs 2-5 start from weights already fine-tuned by the
# preceding run, so the repetitions would not measure training variance.
for i in range(5):
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
    run_training(
        model,
        output_dir=f"./results-score{i}",
        ds_=ds_score,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

### 3.4. Distilbert with truncation at the middle of the review (with score)

- Pretrained model name: `distilbert-base-uncased`
- Link: <https://huggingface.co/distilbert-base-uncased>
- Preprocessing:
  - Remove the string \_x000D\_ present at the start and end of some reviews
  - Strip spaces
  - Add the score to the review text. For example, the review "I like this product" with 4 stars becomes "Score: 4. I like this product".
  - Before the tokenization process, we split the text into sentences and remove sentences from the middle of the review as needed to ensure that the number of tokens is at most 512. In particular:
    - We always remove whole sentences, never single words
    - We keep the start and end of each review, which appear (based on a quick manual analysis of the reviews) to be the most important parts of a review
- Notes:
  - For our dataset, removing the middle of a review yields worse performance than removing the end of a review

In [None]:
def preprocess_comment(c: str, stars: int, max_tokens: int = 512, count_tokens=None) -> str:
    """Prefix the review with its score and shorten it from the middle.

    The review is cleaned ("_x000D_" removed, whitespace trimmed) and split on
    "." into sentences. While the text is too long, whole sentences are
    dropped from the middle, alternating between the two halves, so that the
    beginning and the end of the review survive.

    Args:
        c: Raw review text.
        stars: Star rating, rendered as "Score: <stars>. " before the review.
        max_tokens: Largest accepted token count (default 512, as before).
        count_tokens: Optional callable ``text -> int`` measuring the length.
            Defaults to counting the ``input_ids`` produced by the notebook's
            ``tokenizer``, which is the original behaviour.

    Returns:
        The shortened text. If even the bare score prefix exceeds
        ``max_tokens``, the prefix alone is returned; the original loop would
        have raised an IndexError by popping from an empty list in that case.
    """

    def _count_with_tokenizer(text: str) -> int:
        encoded = tokenizer(text, return_attention_mask=False, verbose=False)
        return len(encoded["input_ids"])

    counter = count_tokens if count_tokens is not None else _count_with_tokenizer

    c = c.replace("_x000D_", "").strip()
    sentences = c.split(".")
    middle = len(sentences) // 2
    start = sentences[:middle]
    end = sentences[middle:]

    while True:
        text = f"Score: {stars}. " + ".".join(start + end)
        # Done when the text fits, or when there is nothing left to remove.
        if counter(text) <= max_tokens or not (start or end):
            return text

        # Drop the sentence next to the cut point, taken from the longer half.
        if len(start) > len(end):
            start.pop()
        else:
            end.pop(0)

def preprocess_function(data):
    """Shorten, tokenize and label a batch of reviews.

    Each review goes through ``preprocess_comment`` together with its star
    rating before tokenization (with truncation enabled); the label is 1 for
    "TF"/"IF" rows and 0 otherwise.
    """
    shortened = []
    for text, rating in zip(data["comment"], data["stars"]):
        shortened.append(preprocess_comment(text, rating))
    features = dict(tokenizer(shortened, truncation=True))
    features["label"] = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    return features
    
ds_truncate_middle = ds.map(preprocess_function, batched=True)

# Start each run from the pretrained checkpoint. The model is trained in
# place, so a single shared instance would carry the fine-tuned weights of one
# run over into the next and the five runs would not be independent.
for i in range(5):
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )
    run_training(
        model,
        output_dir=f"./results-truncate-middle-score{i}",
        ds_=ds_truncate_middle,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

### 3.5. Roberta with score

- Pretrained model name: `roberta-base`
- Link: <https://huggingface.co/roberta-base>
- Preprocessing:
  - Remove the string \_x000D\_ present at the start and end of some reviews
  - Strip spaces
  - Add the score to the review text. For example, the review "I like this product" with 4 stars becomes "Score: 4. I like this product".
  - During the tokenization process, the review is truncated to 512 tokens

In [None]:
def preprocess_comment(c: str, stars: int) -> str:
    """Return "Score: <stars>. <review>" for the cleaned review text.

    Cleaning removes every "_x000D_" marker and trims surrounding whitespace.
    """
    review = c.replace("_x000D_", "")
    review = review.strip()
    return "Score: {}. {}".format(stars, review)


def preprocess_function(data):
    """Tokenize score-prefixed reviews with the RoBERTa tokenizer and label them."""
    rows = zip(data["comment"], data["stars"])
    comments = [preprocess_comment(text, rating) for text, rating in rows]
    features = dict(tokenizer_roberta(comments, truncation=True))
    features["label"] = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    return features

tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")
data_collator_roberta = DataCollatorWithPadding(tokenizer=tokenizer_roberta)

ds_score = ds.map(preprocess_function, batched=True)

# Two fixes compared with the earlier version of this cell:
# - the loop covers all five runs (0-4); it used to start at 2, a leftover of
#   a resumed session, so a fresh "Restart & Run All" produced only three;
# - the pretrained model is reloaded for every run, because training changes
#   the model in place and a shared instance would chain the runs together.
for i in range(5):
    model = AutoModelForSequenceClassification.from_pretrained(
        "roberta-base", num_labels=2
    )
    run_training(
        model,
        output_dir=f"./results-roberta{i}",
        ds_=ds_score,
        tokenizer=tokenizer_roberta,
        data_collator=data_collator_roberta,
        # 8 x 4 accumulation steps keeps the effective batch size at 32
        batch_size=8,
        gradient_accumulation_steps=4,
    )

### 3.6. Roberta for sentiment analysis with score

- Pretrained model name: `cardiffnlp/twitter-roberta-base-sentiment-latest`
  - A version of the roberta model which has been pretrained on tweets and fine-tuned for sentiment analysis
- Link: <https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest>
- Preprocessing:
  - Remove the string \_x000D\_ present at the start and end of some reviews
  - Strip spaces
  - Add the score to the review text. For example, the review "I like this product" with 4 stars becomes "Score: 4. I like this product".
  - During the tokenization process, the review is truncated to 512 tokens

In [None]:
def preprocess_comment(c: str, stars: int) -> str:
    """Strip the "_x000D_" markers and whitespace, then prepend the score."""
    prefix = f"Score: {stars}. "
    body = c.replace("_x000D_", "").strip()
    return prefix + body


def preprocess_function(data):
    """Tokenize score-prefixed reviews (at most 512 tokens) and label them."""
    comments = []
    for text, rating in zip(data["comment"], data["stars"]):
        comments.append(preprocess_comment(text, rating))
    features = dict(tokenizer_roberta_sent(comments, truncation=True, max_length=512))
    features["label"] = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    return features


tokenizer_roberta_sent = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
data_collator_roberta_sent = DataCollatorWithPadding(tokenizer=tokenizer_roberta_sent)
ds_score = ds.map(preprocess_function, batched=True)

# Two fixes compared with the earlier version of this cell:
# - the loop covers all five runs (0-4); it used to start at 3, a leftover of
#   a resumed session, so a fresh "Restart & Run All" produced only two;
# - the pretrained model is reloaded for every run, because training changes
#   the model in place and a shared instance would chain the runs together.
for i in range(5):
    model = AutoModelForSequenceClassification.from_pretrained(
        "cardiffnlp/twitter-roberta-base-sentiment-latest", num_labels=2,
        # The checkpoint's classification head has a different number of
        # classes than our two labels, so it has to be re-initialised.
        ignore_mismatched_sizes=True,
    )
    run_training(
        model,
        output_dir=f"./results-roberta-sent{i}",
        ds_=ds_score,
        tokenizer=tokenizer_roberta_sent,
        data_collator=data_collator_roberta_sent,
        # 8 x 4 accumulation steps keeps the effective batch size at 32
        batch_size=8,
        gradient_accumulation_steps=4,
    )

## 4. Final Model

The best model was [3.6. Roberta for sentiment analysis with score](#36-roberta-for-sentiment-analysis-with-score). We trained this model on the whole combined train+validation dataset and used it to generate predictions for the test data (evaluation.csv).

In [None]:
# Retrain the model on the combined train+validation data

def preprocess_comment(c: str, stars: int) -> str:
    """Clean the review text and put "Score: <stars>. " in front of it."""
    cleaned = c.replace("_x000D_", "").strip()
    return "".join(["Score: ", str(stars), ". ", cleaned])


def preprocess_function(data):
    """Tokenize score-prefixed reviews (at most 512 tokens) and label them."""
    rows = zip(data["comment"], data["stars"])
    comments = [preprocess_comment(text, rating) for text, rating in rows]
    features = dict(tokenizer_roberta_sent(comments, truncation=True, max_length=512))
    features["label"] = [int(fc in ["TF", "IF"]) for fc in data["Failure class"]]
    return features


final_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer_roberta_sent = AutoTokenizer.from_pretrained(final_checkpoint)
data_collator_roberta_sent = DataCollatorWithPadding(tokenizer=tokenizer_roberta_sent)
ds_score = ds.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(
    final_checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Merge train and validation into one training set. "val" then points at the
# same merged data, so the evaluation metrics of this run are computed on
# training examples and only serve to satisfy the trainer's interface.
train_plus_val = datasets.concatenate_datasets([ds_score['train'], ds_score['val']])
ds_score['train'] = train_plus_val
ds_score['val'] = train_plus_val
run_training(
    model,
    output_dir="./results-roberta-full",
    ds_=ds_score,
    tokenizer=tokenizer_roberta_sent,
    data_collator=data_collator_roberta_sent,
    epochs=6,
    batch_size=8,
    gradient_accumulation_steps=4,
)

In [None]:
# Wrap the fine-tuned checkpoint in a text-classification pipeline.
# NOTE(review): the checkpoint step (336) is hard-coded; it has to match a
# checkpoint actually present under results-roberta-full - confirm after
# changing the data size, batch size or number of epochs.
final_model_dir = 'results-roberta-full/checkpoint-336'
clf = pipeline(task='text-classification', model=final_model_dir)

In [None]:
# Preprocess the test reviews

def preprocess_comment(c: str, stars: int) -> str:
    """Return the cleaned review prefixed with "Score: <stars>. "."""
    review = c.replace("_x000D_", "").strip()
    return "Score: %s. %s" % (str(stars), review)


def preprocess_function(data):
    """Add a ``preprocessed_comment`` column holding the score-prefixed reviews."""
    prepared = []
    for text, rating in zip(data["comment"], data["stars"]):
        prepared.append(preprocess_comment(text, rating))
    return {'preprocessed_comment': prepared}

# Only the test split is needed for the final predictions
test_split = ds['test']
ds_test = test_split.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Generate the model predictions (probabilities)

def generate_probs(data):
    """Classify a batch of preprocessed reviews.

    The pipeline returns one label with its score per review. That score is
    converted into the probability of the failure class: kept as is for
    'LABEL_1' and complemented for 'LABEL_0'. Any other label raises a
    ValueError.

    Returns a dict with ``prob_failure`` and the rounded ``pred_label``.
    """
    outputs = clf(data['preprocessed_comment'], truncation=True, max_length=512)
    failure_probs = []
    for output in outputs:
        predicted = output['label']
        if predicted == 'LABEL_1':
            failure_probs.append(output['score'])
        elif predicted == 'LABEL_0':
            failure_probs.append(1-output['score'])
        else:
            raise ValueError('got label == ' + str(predicted))
    hard_labels = [int(round(p)) for p in failure_probs]
    return {'prob_failure': failure_probs, 'pred_label': hard_labels}

# Attach the failure probability and the predicted label to every test row
ds_test = ds_test.map(function=generate_probs, batched=True)
ds_test

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['review_id', 'tablet_id', 'comment', 'stars', 'Failure class', 'label', 'preprocessed_comment', 'prob_failure', 'pred_label'],
    num_rows: 600
})

In [None]:
# Write the predicted labels to disk, one per line (no trailing newline)

with open('evaluation_predictions.txt', 'w') as predictions_file:
    label_lines = [str(label) for label in ds_test['pred_label']]
    predictions_file.write('\n'.join(label_lines))