In [15]:
import pandas as pd
import datasets
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import os

os.environ["WANDB_DISABLED"] = "true"

In [16]:
from huggingface_hub import login

#get token from pass.secret
with open("pass.secret", "r") as f:
    token = f.read()
    token = token.strip()

#login to huggingface
login(token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to C:\Users\jfitz\.huggingface\token
Login successful


In [17]:
# get data and add column names
data = pd.read_csv("../testdata/data_combined_24.csv", delimiter=",", header=None, names=["label", "text"])

#randomly shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

#split into train and test (90/10)
train_data = data[:int(len(data)*0.9)]
test_data = data[int(len(data)*0.9)+1:]
print(test_data.head())
train_data.head()

               label                                               text
32194  entertainment              michelle yeoh laughs yes finally cool
32195       medicine  slovenia in slovenia medical graduates after s...
32196     philosophy  spanish philosopher gómez pereira in his 1554 ...
32197          music  billie jean won two grammy awards best rb song...
32198           news      utilities triple battery storage capacity eia


Unnamed: 0,label,text
0,mathematics,parametricaly describe roughness printed cupol...
1,technology,citeseerx 10116599379 doi101177146499341001100303
2,geography,accurately determine new coordinates dms point...
3,sports,rocker boards offer only one degree of movemen...
4,entertainment,in japan during the edo period flatulists were...


In [18]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = 512)

KeyboardInterrupt: 

In [None]:
# tokenize the data adding padding so that all sequences are the same length
t_train_data = tokenizer(train_data.text.tolist(), truncation=True, padding=True)
t_test_data = tokenizer(test_data.text.tolist(), truncation=True, padding=True)

print(t_train_data.keys())


dict_keys(['input_ids', 'attention_mask'])


In [None]:
# get all input_ids
train_input_ids = []
train_attention_masks = []

for i in range(len(t_train_data["input_ids"])):
    train_input_ids.append(t_train_data['input_ids'][i])
    train_attention_masks.append(t_train_data['attention_mask'][i])

test_input_ids = []
test_attention_masks = []

for i in range(len(t_test_data["input_ids"])):
    test_input_ids.append(t_test_data['input_ids'][i])
    test_attention_masks.append(t_test_data['attention_mask'][i])




[0, 35901, 21612, 461, 9857, 27711, 3763, 13617, 2489, 337, 463, 5378, 1043, 1071, 225, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# get all unique topics
topics = data['label'].unique()

# create mapping from topic to integer
topic_to_int = {}
for i in range(len(topics)):
    topic_to_int[topics[i]] = i

# create mapping from integer to topic
int_to_topic = {}
for i in range(len(topics)):
    int_to_topic[i] = topics[i]

In [None]:
# train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
# test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# create np array combining input_ids and attention_mask and label
train_data_np = np.array([train_input_ids, train_attention_masks, train_data['label']]).T
test_data_np = np.array([test_input_ids, test_attention_masks, test_data['label']]).T

final_train = pd.DataFrame(train_data_np, columns=['input_ids', 'attention_mask', 'label'])

final_test = pd.DataFrame(test_data_np, columns=['input_ids', 'attention_mask', 'label'])

# map label to integer
if type(final_train['label'][0]) == str:
    final_train['label'] = final_train['label'].map(lambda x: topic_to_int[x])

if type(final_test['label'][0]) == str:
    final_test['label'] = final_test['label'].map(lambda x: topic_to_int[x])


dataframe_train = datasets.Dataset.from_pandas(final_train)
dataframe_test = datasets.Dataset.from_pandas(final_test)




  train_data_np = np.array([train_input_ids, train_attention_masks, train_data['label']]).T
  test_data_np = np.array([test_input_ids, test_attention_masks, test_data['label']]).T


In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
training_args = TrainingArguments(
    f"roberta-finetuned-topic",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=dataframe_train,
    eval_dataset=dataframe_test,
    # tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

c:\Users\jfitz\OneDrive\Documents\3rd year project\code\BERT\RoBERTa\roberta-finetuned-topic is already a clone of https://huggingface.co/MrFitzmaurice/roberta-finetuned-topic. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
trainer.train()

***** Running training *****
  Num examples = 32193
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 20125
  Number of trainable parameters = 124647170


  0%|          | 0/20125 [00:00<?, ?it/s]

KeyboardInterrupt: 