# Imports

In [1]:
# !pip install -Iv transformers==4.38.1
# !pip install -Iv datasets==2.1.0
# !pip install -Iv tensorflow==2.15.0
# !pip install -Iv keras==3.0.5
# !pip install tf-keras
# !pip install -Iv nltk==3.2.4
# !pip install -Iv contractions==0.1.73
# !pip install -Iv accelerate==0.27.2
# !pip install -Iv scikit-learn==1.4.0

# !pip install datasets --upgrade

In [2]:
import pandas as pd
import os 
import sklearn
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset, load_metric
import datasets
import torch
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, DistilBertConfig, DistilBertForSequenceClassification, BertConfig, BertForSequenceClassification, create_optimizer, TrainingArguments, Trainer
from transformers.keras_callbacks import KerasMetricCallback
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import pipeline
import csv
import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
import contractions
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction



# Import Data

In [3]:
train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

In [4]:
labelled_data = pd.read_csv(train_data_path)
labelled_data.head(5)

Unnamed: 0,text,stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,1
1,Terrible food. Terrible service.\n\nThe absol...,1
2,"So, right away if I go into a buffet setting, ...",2
3,I have gotten good cuts from this place. I eve...,3
4,I felt this place was a bit lackluster conside...,2


In [5]:
labelled_data['stars'] = labelled_data['stars'].replace({1: 'one_star', 2: 'two_stars', 3: 'three_stars'})

In [6]:
labelled_data.head(5)

Unnamed: 0,text,stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,one_star
1,Terrible food. Terrible service.\n\nThe absol...,one_star
2,"So, right away if I go into a buffet setting, ...",two_stars
3,I have gotten good cuts from this place. I eve...,three_stars
4,I felt this place was a bit lackluster conside...,two_stars


# Preprocessing

In [7]:
# one hot encode the labels 
one_hot_labels = pd.get_dummies(labelled_data['stars'])
# # drop the old 'stars' column to replace with the one hot encoded values 
# labelled_data = labelled_data.drop('stars', axis= 1)
# join with the one hot encoded 
labelled_data = labelled_data.join(one_hot_labels)

labelled_data.head(10)

Unnamed: 0,text,stars,one_star,three_stars,two_stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,one_star,True,False,False
1,Terrible food. Terrible service.\n\nThe absol...,one_star,True,False,False
2,"So, right away if I go into a buffet setting, ...",two_stars,False,False,True
3,I have gotten good cuts from this place. I eve...,three_stars,False,True,False
4,I felt this place was a bit lackluster conside...,two_stars,False,False,True
5,Back when I lived in Pollock Halls in my first...,three_stars,False,True,False
6,Short version:\n- 20 stars for having free vod...,two_stars,False,False,True
7,Okay.....read the other reviews about taking t...,one_star,True,False,False
8,"The food is really good, but we had one big co...",three_stars,False,True,False
9,"Cool place, comfortable. Sorely needed in the ...",three_stars,False,True,False


In [8]:
# # 1. Expand Contractions using contractions library
# def expand_contractions(text):
#     return contractions.fix(text)

# # Apply expand_contractions function to 'text' column
# labelled_data['text'] = labelled_data['text'].apply(expand_contractions)

# # 2. Lower Case
# labelled_data['text'] = labelled_data['text'].str.lower()

# # 3. Remove Punctuations
# labelled_data['text'] = labelled_data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# # 4. Remove Extra Spaces
# labelled_data['text'] = labelled_data['text'].apply(lambda x: re.sub(' +', ' ', x.strip()))

In [9]:
# # Download the WordNet resource
# nltk.download('stopwords')

In [10]:
# # 5. Remove Stopwords
# stop_words = set(stopwords.words('english'))
# labelled_data['text'] = labelled_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

In [11]:
# # Download the WordNet resource
# nltk.download('wordnet')

In [12]:
# !unzip /home/ec2-user/nltk_data/corpora/wordnet.zip -d /home/ec2-user/nltk_data/corpora/

In [13]:
# # 6. Stemming and Lemmatization
# lemmatizer = WordNetLemmatizer()
# def lemmatize_words(text):
#     return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
# labelled_data['text'] = labelled_data['text'].apply(lemmatize_words)

In [14]:
# labelled_data.head(10)

# Split Data

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(
    labelled_data['text'], 
    labelled_data[['one_star', 'two_stars', 'three_stars']], 
    test_size= 0.001, 
    random_state= 42, 
    stratify= labelled_data['stars']
)

In [16]:
print(y_train.value_counts())
print(y_valid.value_counts())

one_star  two_stars  three_stars
False     False      True           1998
          True       False          1998
True      False      False          1998
Name: count, dtype: int64
one_star  two_stars  three_stars
False     False      True           2
          True       False          2
True      False      False          2
Name: count, dtype: int64


In [17]:
# X_train

In [18]:
# y_train

In [19]:
train_ind = X_train.index.values.tolist()
valid_ind = X_valid.index.values.tolist()

train_data = {'text': [], 'one_star': [], 'two_stars': [], 'three_stars': []}
valid_data = {'text': [], 'one_star': [], 'two_stars': [], 'three_stars': []}

for ti in train_ind:
    train_data['text'].append(X_train[ti])
    train_data['one_star'].append(y_train.loc[ti]['one_star'])
    train_data['two_stars'].append(y_train.loc[ti]['two_stars'])
    train_data['three_stars'].append(y_train.loc[ti]['three_stars'])

for vi in valid_ind:
    valid_data['text'].append(X_valid[vi])
    valid_data['one_star'].append(y_valid.loc[vi]['one_star'])
    valid_data['two_stars'].append(y_valid.loc[vi]['two_stars'])
    valid_data['three_stars'].append(y_valid.loc[vi]['three_stars'])
    
print(len(train_data['text']))
print(len(valid_data['text']))

5994
6


In [20]:
train_dataset = Dataset.from_dict(train_data)
valid_dataset = Dataset.from_dict(valid_data)
data = datasets.DatasetDict({"train": train_dataset,"valid": valid_dataset})

In [21]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'one_star', 'two_stars', 'three_stars'],
        num_rows: 5994
    })
    valid: Dataset({
        features: ['text', 'one_star', 'two_stars', 'three_stars'],
        num_rows: 6
    })
})

In [22]:
data['train'][0]

{'text': "I live down the street from this place and wish for nothing more than for it to be better.  I was excited to hear the items on the menu.  I am a sucker for a sandwich with lots of abstract foods on them.  I have tried The Student and The Professor on multiple occasions.  Every time I just wish they had a different sauce on them and more of it.  The sandwiches are dry and lacking in flavor.  If you ask them to put more sauce on (which I did after the first time), they charge you extra.  I dig the creativity but it is lacking in everything else.  \\n\\nThe wait is also ridiculous.  It takes almost 20 mins each time I've been there for them to make the sandwich even if there's no one else there.  Not to mention the people who work there are rude.",
 'one_star': False,
 'two_stars': True,
 'three_stars': False}

In [23]:
labels = [label for label in data['train'].features.keys() if label not in ['text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['one_star', 'two_stars', 'three_stars']

In [24]:
id2label

{0: 'one_star', 1: 'two_stars', 2: 'three_stars'}

# Tokenizer

In [25]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [26]:
tokenized_data = data.map(preprocess_data, batched=True, remove_columns= data['train'].column_names)

Map:   0%|          | 0/5994 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [27]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5994
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6
    })
})

In [28]:
tokenized_data['train'][0]['labels']

[0.0, 1.0, 0.0]

In [29]:
tokenized_data.set_format("torch")

# Data Collator

In [30]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model Building

In [31]:
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Model Pt.1

In [32]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

In [33]:
batch_size = 32
metric_name = "f1"

In [56]:
args = TrainingArguments(
    f"bert-model-stat940-roberto",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-9, # 2e-5
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [35]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

# Verify a batch and forward pass

In [36]:
tokenized_data['train'][0]['labels'].type()

'torch.FloatTensor'

In [37]:
tokenized_data['train']['input_ids'][0]

tensor([    0,   100,   697,   159,     5,  2014,    31,    42,   317,     8,
         2813,    13,  1085,    55,    87,    13,    24,     7,    28,   357,
            4,  1437,    38,    21,  2283,     7,  1798,     5,  1964,    15,
            5,  5765,     4,  1437,    38,   524,    10, 40454,    13,    10,
        15649,    19,  3739,     9, 20372,  6592,    15,   106,     4,  1437,
           38,    33,  1381,    20,  9067,     8,    20,  6020,    15,  1533,
         7657,     4,  1437,  4337,    86,    38,    95,  2813,    51,    56,
           10,   430,  8929,    15,   106,     8,    55,     9,    24,     4,
         1437,    20, 19072,    32,  3841,     8, 12622,    11, 12117,     4,
         1437,   318,    47,  1394,   106,     7,   342,    55,  8929,    15,
           36,  5488,    38,   222,    71,     5,    78,    86,   238,    51,
         1427,    47,  1823,     4,  1437,    38,  8512,     5, 11140,    53,
           24,    16, 12622,    11,   960,  1493,     4,     2])

In [38]:
#forward pass
outputs = model(input_ids=tokenized_data['train']['input_ids'][0].unsqueeze(0), labels=tokenized_data['train'][0]['labels'].unsqueeze(0))
outputs

Seq2SeqSequenceClassifierOutput(loss=tensor(0.6300, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0686,  0.2814, -0.0505]], grad_fn=<AddmmBackward0>), past_key_values=None, decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=tensor([[[-0.0333,  0.0095, -0.0021,  ...,  0.0112, -0.0015, -0.0075],
         [ 0.1670,  0.3363,  0.0336,  ..., -0.0555, -0.2182,  0.4453],
         [-0.0161,  0.2885,  0.0080,  ...,  0.0285,  0.0655, -0.1562],
         ...,
         [ 0.0125, -0.2285, -0.0881,  ..., -0.1711, -0.6356,  0.4325],
         [ 0.0657,  0.0190,  0.1704,  ...,  0.0559, -0.2105,  0.1363],
         [ 0.0725,  0.0399,  0.2782,  ...,  0.0220, -0.1229,  0.3979]]],
       grad_fn=<NativeLayerNormBackward0>), encoder_hidden_states=None, encoder_attentions=None)

# # Train Model Pt.2

In [57]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [58]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.192357,0.909091,0.916667,0.833333
2,No log,0.192342,0.909091,0.916667,0.833333
3,0.404700,0.192345,0.909091,0.916667,0.833333
4,0.404700,0.19234,0.909091,0.916667,0.833333
5,0.404700,0.192342,0.909091,0.916667,0.833333
6,0.427700,0.192341,0.909091,0.916667,0.833333
7,0.427700,0.192336,0.909091,0.916667,0.833333
8,0.428800,0.192336,0.909091,0.916667,0.833333
9,0.428800,0.192337,0.909091,0.916667,0.833333
10,0.428800,0.192337,0.909091,0.916667,0.833333


Checkpoint destination directory bert-model-stat940-roberto/checkpoint-188 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Checkpoint destination directory bert-model-stat940-roberto/checkpoint-376 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Checkpoint destination directory bert-model-stat940-roberto/checkpoint-564 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Checkpoint destination directory bert-model-stat9

TrainOutput(global_step=1880, training_loss=0.4220225313876538, metrics={'train_runtime': 1035.4139, 'train_samples_per_second': 57.89, 'train_steps_per_second': 1.816, 'total_flos': 4595743878497280.0, 'train_loss': 0.4220225313876538, 'epoch': 10.0})

# Save Model

In [None]:
# model.save_pretrained('models')

In [57]:
trainer.evaluate()

{'eval_loss': 0.7568681240081787,
 'eval_f1': 0.6399331662489557,
 'eval_roc_auc': 0.7299999999999999,
 'eval_accuracy': 0.635}

# Inference

In [59]:
test_data = pd.read_csv(test_data_path)
test_data.head(5)

Unnamed: 0,ID,text
0,1,\n\nDuring my recent company trip to our Tempe...
1,2,\n\nAfter hearing about Pauly D from Jersey Sh...
2,3,\n\nI had high hopes for this restaurant based...
3,4,We experienced overpriced and underwhelming en...
4,5,\n\nThe windows from this company are definite...


In [60]:
output_path = 'bart_base_v03.csv'

In [61]:
count = 0
error_ind = []
with open(output_path, "w", newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["ID","Label"])
    for index, row in test_data.iterrows():
        text_id = row['ID']
        text = row['text']
        encoding = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
        # print(encoding)
        # print(encoding['attention_mask'].size())
        try:
            outputs = trainer.model(**encoding)
        except: 
            count += 1
            error_ind.append(text_id)
        logits = outputs.logits
        # apply sigmoid + threshold
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits.squeeze().cpu())
        predictions = np.zeros(probs.shape)
        predictions[np.argmax(probs.detach().numpy())] = 1
        # turn predicted id's into actual label names
        predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
        
        if predicted_labels[0] == 'one_star':
            conv_pred = 1
        elif predicted_labels[0] == 'two_stars':
            conv_pred = 2
        else: 
            conv_pred = 3

        
        csv_writer.writerow([text_id,conv_pred])
    print(count)

0
