# Imports

In [2]:
# !pip install -Iv transformers==4.38.1
# !pip install -Iv datasets==2.1.0
# !pip install -Iv tensorflow==2.15.0
# !pip install -Iv keras==3.0.5
# !pip install tf-keras
# !pip install -Iv nltk==3.2.4
# !pip install -Iv contractions==0.1.73
# !pip install -Iv accelerate==0.27.2
# !pip install -Iv scikit-learn==1.4.0

# !pip install datasets --upgrade
# !pip install nltk --upgrade

In [3]:
import pandas as pd
import os 
import sklearn
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset, load_metric
import datasets
import torch
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, DistilBertConfig, DistilBertForSequenceClassification, BertConfig, BertForSequenceClassification, create_optimizer, TrainingArguments, Trainer
from transformers.keras_callbacks import KerasMetricCallback
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import pipeline
import csv
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import string



# Import Data

In [4]:
train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'

In [5]:
labelled_data = pd.read_csv(train_data_path)
labelled_data.head(5)

Unnamed: 0,text,stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,1
1,Terrible food. Terrible service.\n\nThe absol...,1
2,"So, right away if I go into a buffet setting, ...",2
3,I have gotten good cuts from this place. I eve...,3
4,I felt this place was a bit lackluster conside...,2


In [6]:
labelled_data['stars'] = labelled_data['stars'].replace({1: 'one_star', 2: 'two_stars', 3: 'three_stars'})

In [7]:
labelled_data.head(5)

Unnamed: 0,text,stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,one_star
1,Terrible food. Terrible service.\n\nThe absol...,one_star
2,"So, right away if I go into a buffet setting, ...",two_stars
3,I have gotten good cuts from this place. I eve...,three_stars
4,I felt this place was a bit lackluster conside...,two_stars


# Preprocessing

In [8]:
# one hot encode the labels 
one_hot_labels = pd.get_dummies(labelled_data['stars'])
# # drop the old 'stars' column to replace with the one hot encoded values 
# labelled_data = labelled_data.drop('stars', axis= 1)
# join with the one hot encoded 
labelled_data = labelled_data.join(one_hot_labels)

labelled_data.head(10)

Unnamed: 0,text,stars,one_star,three_stars,two_stars
0,WILL NEVER COME BACK! HORRIBLE SERVICE & NASTY...,one_star,True,False,False
1,Terrible food. Terrible service.\n\nThe absol...,one_star,True,False,False
2,"So, right away if I go into a buffet setting, ...",two_stars,False,False,True
3,I have gotten good cuts from this place. I eve...,three_stars,False,True,False
4,I felt this place was a bit lackluster conside...,two_stars,False,False,True
5,Back when I lived in Pollock Halls in my first...,three_stars,False,True,False
6,Short version:\n- 20 stars for having free vod...,two_stars,False,False,True
7,Okay.....read the other reviews about taking t...,one_star,True,False,False
8,"The food is really good, but we had one big co...",three_stars,False,True,False
9,"Cool place, comfortable. Sorely needed in the ...",three_stars,False,True,False


In [9]:
# # 1. Expand Contractions using contractions library
# def expand_contractions(text):
#     return contractions.fix(text)

# # Apply expand_contractions function to 'text' column
# labelled_data['text'] = labelled_data['text'].apply(expand_contractions)

# # 2. Lower Case
# labelled_data['text'] = labelled_data['text'].str.lower()

# # 3. Remove Punctuations
# PUNCT_TO_REMOVE = string.punctuation
# def remove_punctuation(text):
#     """custom function to remove the punctuation"""
#     return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

# labelled_data['text'] = labelled_data['text'].apply(lambda text: remove_punctuation(text))

# # 4. Remove Extra Spaces
# labelled_data['text'] = labelled_data['text'].apply(lambda x: re.sub(' +', ' ', x.strip()))

In [10]:
# # Download the WordNet resource
# nltk.download('stopwords')

In [11]:
# # 5. Remove Stopwords
# STOPWORDS = set(stopwords.words('english'))
# def remove_stopwords(text):
#     """custom function to remove the stopwords"""
#     return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# labelled_data['text'] = labelled_data['text'].apply(lambda text: remove_stopwords(text))

In [12]:
# # Download the WordNet resource
# nltk.download('wordnet')

In [13]:
# !unzip /home/ec2-user/nltk_data/corpora/wordnet.zip -d /home/ec2-user/nltk_data/corpora/

In [14]:
# # 6. Stemming and Lemmatization
# lemmatizer = WordNetLemmatizer()
# def lemmatize_words(text):
#     return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

# labelled_data['text'] = labelled_data['text'].apply(lambda text: lemmatize_words(text))

In [15]:
# labelled_data.head(10)

# Split Data

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(
    labelled_data['text'], 
    labelled_data[['one_star', 'two_stars', 'three_stars']], 
    test_size= 0.2, 
    random_state= 42, 
    stratify= labelled_data['stars']
)

In [17]:
print(y_train.value_counts())
print(y_valid.value_counts())

one_star  two_stars  three_stars
False     False      True           1600
          True       False          1600
True      False      False          1600
Name: count, dtype: int64
one_star  two_stars  three_stars
False     False      True           400
          True       False          400
True      False      False          400
Name: count, dtype: int64


In [18]:
# X_train

In [19]:
# y_train

In [20]:
train_ind = X_train.index.values.tolist()
valid_ind = X_valid.index.values.tolist()

train_data = {'text': [], 'one_star': [], 'two_stars': [], 'three_stars': []}
valid_data = {'text': [], 'one_star': [], 'two_stars': [], 'three_stars': []}

for ti in train_ind:
    train_data['text'].append(X_train[ti])
    train_data['one_star'].append(y_train.loc[ti]['one_star'])
    train_data['two_stars'].append(y_train.loc[ti]['two_stars'])
    train_data['three_stars'].append(y_train.loc[ti]['three_stars'])

for vi in valid_ind:
    valid_data['text'].append(X_valid[vi])
    valid_data['one_star'].append(y_valid.loc[vi]['one_star'])
    valid_data['two_stars'].append(y_valid.loc[vi]['two_stars'])
    valid_data['three_stars'].append(y_valid.loc[vi]['three_stars'])
    
print(len(train_data['text']))
print(len(valid_data['text']))

4800
1200


In [21]:
train_dataset = Dataset.from_dict(train_data)
valid_dataset = Dataset.from_dict(valid_data)
data = datasets.DatasetDict({"train": train_dataset,"valid": valid_dataset})

In [22]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'one_star', 'two_stars', 'three_stars'],
        num_rows: 4800
    })
    valid: Dataset({
        features: ['text', 'one_star', 'two_stars', 'three_stars'],
        num_rows: 1200
    })
})

In [23]:
data['train'][0]

{'text': "Remeber ten years ago, when Wired was publishing articles about how the Internet would end retail shopping?  Well, they were wrong.  Bad customer service will end retail shopping and this Best Buy is a fine example of the downward trend.  The people that work at this ubiquitous big box store are assholes that usually don't know anything about their products and could give two fucks about whether or not you're being helped.    \\r\\n\\r\\nThat said, man, do I love gadgets.  And Best Buy's got a lot of 'em.",
 'one_star': False,
 'two_stars': True,
 'three_stars': False}

In [24]:
labels = [label for label in data['train'].features.keys() if label not in ['text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['one_star', 'two_stars', 'three_stars']

In [25]:
id2label

{0: 'one_star', 1: 'two_stars', 2: 'three_stars'}

# Tokenizer

In [26]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [27]:
tokenized_data = data.map(preprocess_data, batched=True, remove_columns= data['train'].column_names)

Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [28]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4800
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1200
    })
})

In [29]:
tokenized_data['train'][0]['labels']

[0.0, 1.0, 0.0]

In [30]:
tokenized_data.set_format("torch")

# Data Collator

In [31]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Model Building

In [32]:
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train Model Pt.1

In [33]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

In [34]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    for i, row in enumerate(probs): 
        y_pred[i, np.argmax(row.detach().numpy())] = 1 
    # finally, compute metrics
    y_true = labels
    # f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    # metrics = {'f1': f1_micro_average,
    #            'roc_auc': roc_auc,
    #            'accuracy': accuracy}
    metrics = {'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

# Verify a batch and forward pass

In [35]:
tokenized_data['train'][0]['labels'].type()

'torch.FloatTensor'

In [36]:
tokenized_data['train']['input_ids'][0]

tensor([    0, 31157,   242,  1943,  2724,   107,   536,     6,    77, 36572,
           21, 10467,  7201,    59,   141,     5,  3742,    74,   253,  2304,
         3482,   116,  1437,  2647,     6,    51,    58,  1593,     4,  1437,
         5654,  2111,   544,    40,   253,  2304,  3482,     8,    42,  2700,
         4228,    16,    10,  2051,  1246,     9,     5, 14659,  2904,     4,
         1437,    20,    82,    14,   173,    23,    42, 25107,   380,  2233,
         1400,    32,  8446, 31670,    14,  2333,   218,    75,   216,   932,
           59,    49,   785,     8,   115,   492,    80,   856, 19667,    59,
          549,    50,    45,    47,   214,   145,  1147,     4,  1437,  1437,
         1437, 44128,   338, 37457,   282, 37457,   338, 37457,   282,  1711,
           26,     6,   313,     6,   109,    38,   657, 21485,     4,  1437,
          178,  2700,  4228,    18,   300,    10,   319,     9,   128,   991,
            4,     2,     1,     1,     1,     1,     1,     1])

In [37]:
#forward pass
outputs = model(input_ids=tokenized_data['train']['input_ids'][0].unsqueeze(0), labels=tokenized_data['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7314, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0314, -0.2000, -0.0119]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

# # Train Model Round 1

In [37]:
batch_size = 10
metric_name = "accuracy"

In [39]:
args = TrainingArguments(
    f"roberto-round-1",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5, # 2e-5
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    # weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    warmup_steps= 100,
    lr_scheduler_type= 'constant'
    #push_to_hub=True,
)

In [42]:
from typing import Dict

class MyTrainer(Trainer):
    def log(self, logs: Dict[str, float]) -> None:
        logs["learning_rate"] = self._get_learning_rate()
        super().log(logs)

In [41]:
trainer = MyTrainer(
    model,
    args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [42]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Rate
1,No log,0.457138,0.65,2e-05
2,0.515300,0.464598,0.643333,2e-05
3,0.430100,0.471932,0.655,2e-05
4,0.352100,0.608199,0.668333,2e-05
5,0.285800,0.668092,0.6575,2e-05
6,0.250300,0.799083,0.663333,2e-05
7,0.201200,0.875607,0.6625,2e-05
8,0.186600,1.088807,0.621667,2e-05
9,0.149400,1.055743,0.664167,2e-05
10,0.132700,1.073924,0.664167,2e-05


TrainOutput(global_step=4800, training_loss=0.2686312572161357, metrics={'train_runtime': 948.271, 'train_samples_per_second': 50.618, 'train_steps_per_second': 5.062, 'total_flos': 3157361012736000.0, 'train_loss': 0.2686312572161357, 'learning_rate': 2e-05, 'epoch': 10.0})

# Train Model Round 2

In [38]:
batch_size = 10
metric_name = "accuracy"

In [39]:
model2 = AutoModelForSequenceClassification.from_pretrained('roberto-round-1/checkpoint-4800')

In [40]:
args2 = TrainingArguments(
    f"roberto-round-2",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-6, # 2e-5
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    # weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    warmup_steps= 100,
    lr_scheduler_type= 'constant'
    #push_to_hub=True,
)

In [43]:
trainer = MyTrainer(
    model2,
    args2,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [44]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Rate
1,No log,1.173095,0.655833,2e-06
2,0.052200,1.202227,0.660833,2e-06
3,0.032400,1.249877,0.653333,2e-06
4,0.034500,1.213973,0.664167,2e-06
5,0.028700,1.269762,0.655833,2e-06
6,0.029000,1.279936,0.660833,2e-06



KeyboardInterrupt



# Save Model

In [None]:
# model.save_pretrained('models')

In [57]:
trainer.evaluate()

{'eval_loss': 0.7568681240081787,
 'eval_f1': 0.6399331662489557,
 'eval_roc_auc': 0.7299999999999999,
 'eval_accuracy': 0.635}

# Inference

In [42]:
test_data = pd.read_csv(test_data_path)
test_data.head(5)

Unnamed: 0,ID,text
0,1,\n\nDuring my recent company trip to our Tempe...
1,2,\n\nAfter hearing about Pauly D from Jersey Sh...
2,3,\n\nI had high hopes for this restaurant based...
3,4,We experienced overpriced and underwhelming en...
4,5,\n\nThe windows from this company are definite...


In [43]:
output_path = 'roberta_base_v10.csv'

In [44]:
count = 0
error_ind = []
with open(output_path, "w", newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["ID","Label"])
    for index, row in test_data.iterrows():
        text_id = row['ID']
        text = row['text']
        encoding = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
        encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
        # print(encoding)
        # print(encoding['attention_mask'].size())
        try:
            outputs = trainer.model(**encoding)
        except: 
            count += 1
            error_ind.append(text_id)
        logits = outputs.logits
        # apply sigmoid + threshold
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits.squeeze().cpu())
        predictions = np.zeros(probs.shape)
        predictions[np.argmax(probs.detach().numpy())] = 1
        # turn predicted id's into actual label names
        predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
        
        if predicted_labels[0] == 'one_star':
            conv_pred = 1
        elif predicted_labels[0] == 'two_stars':
            conv_pred = 2
        else: 
            conv_pred = 3

        
        csv_writer.writerow([text_id,conv_pred])
    print(count)

0
