In [1]:
#Create comments
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Loading data
dataset = pd.read_csv('Preprocessing_Wesley/processed_data/all_datasets/dataset.csv', compression='zip')
dataset = dataset.rename(columns={'emotion': 'label', 'sentence': 'text'})
dataset.columns

Index(['text', 'label'], dtype='object')

In [3]:
#Encode labels from strings to numbers
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
dataset['label'] = label_encoder.fit_transform(dataset['label'])
num_labels = len(label_encoder.classes_)

In [4]:
dataset.head()

Unnamed: 0,text,label
0,Test to predict breast cancer relapse is approved,2
1,"Two Hussein allies are hanged, Iraqi official ...",3
2,Sights and sounds from CES,2
3,Schuey sees Ferrari unveil new car,2
4,Closings and cancellations top advice on flu o...,4


In [5]:
#Sample dataset to lower training time
dataset_sample = dataset.sample(n=100, random_state=42)

In [6]:
#Define tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Tokenize text data
def tokenize_data(data):
    return tokenizer(data['text'], padding='max_length', truncation=True, return_tensors='pt')

In [7]:
#Split dataset into train and test sets
train_dataset, test_dataset = train_test_split(dataset_sample, test_size=0.2, random_state=42)

In [8]:
#Convert data to Dataset format
train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

In [9]:
#Tokenize datasets
train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

Map: 100%|██████████| 80/80 [00:00<00:00, 629.78 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 769.05 examples/s]


In [10]:
#Define model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(dataset['label'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='./results',
    num_train_epochs=3,
    logging_dir='./logss',
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="steps",
    report_to="tensorboard",
)

#Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

#Train the model on the dataset
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 30/30 [18:52<00:00, 37.74s/it]

{'train_runtime': 1132.2134, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.026, 'train_loss': 1.5456157684326173, 'epoch': 3.0}





TrainOutput(global_step=30, training_loss=1.5456157684326173, metrics={'train_runtime': 1132.2134, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.026, 'train_loss': 1.5456157684326173, 'epoch': 3.0})