In [1]:
import pandas as pd

## Utilities

In [2]:
from utils.cleaning import remove_emojis

## Load data

In [3]:
# Read the labelled corpus from disk into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a row index that was written into the CSV -- consider index_col=0; confirm.
df = pd.read_csv(filepath_or_buffer='data/100kdata.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,text,label
0,lentu la letati nka re thobela,neutral
1,ka mosa le mogau wa modimo re bone hlabo ya le...,positive
2,khadjom thobelafmyaka 015 ka mokopane mesong g...,negative
3,nobodysmakoti2 takatina1 bathandwayo ge go sel...,positive
4,powerfm987 tshegomoagi gabotse taba ya gore le...,negative


#### Loading the Model and Tokenizer

In [4]:
from transformers import BertModel, BertTokenizer

# Name the checkpoint once so the encoder and its tokenizer are always loaded
# from the same pretrained checkpoint and cannot drift apart when one is edited.
MBERT_CHECKPOINT = 'bert-base-multilingual-uncased'

# Load the MBERT model
model = BertModel.from_pretrained(MBERT_CHECKPOINT)

# Load the corresponding tokenizer
tokenizer = BertTokenizer.from_pretrained(MBERT_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


#### Preparing Your Text Data

In [5]:
# Number of rows to tokenize; kept small so the demo stays cheap to run.
N_SAMPLES = 100

# Take the first N_SAMPLES texts as a plain Python list.
# Assumes the 'text' column holds strings with no missing values -- TODO confirm.
text = df['text'].head(N_SAMPLES).tolist()

# Tokenize the batch:
#   padding=True    -> pad every sequence to the longest one in the batch
#   truncation=True -> cut sequences at the model's maximum input length, so an
#                      unusually long text cannot break the forward pass
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
print(inputs.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


#### Generating Embeddings

In [6]:
import torch

# Get the embeddings.
# The output is only used as features here, so run the forward pass under
# no_grad(): this skips building the autograd graph and keeps memory use down.
with torch.no_grad():
    outputs = model(**inputs)

# The last hidden-state is the first element of outputs:
# one vector per token, shaped (batch, seq_len, hidden).
last_hidden_state = outputs[0]
last_hidden_state.shape

torch.Size([100, 82, 768])

#### Using the Embeddings

In [None]:
# Example: mean-pool the token embeddings into one vector per sentence.
# The batch was padded to a common length, so a plain mean over the sequence
# axis would also average the vectors at padding positions and bias the
# embedding of every sentence shorter than the longest one. Weight each token
# by the attention mask so only real tokens contribute.
token_mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)  # (batch, seq_len, 1)
summed = (last_hidden_state * token_mask).sum(dim=1)
token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against an all-padding row
sentence_embedding = summed / token_counts
print(sentence_embedding)

#### Fine-Tuning MBERT

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments


class EncodedTextDataset(Dataset):
    """Pair tokenizer output with integer class ids for use with ``Trainer``.

    Args:
        encodings: Mapping from model input name (e.g. ``input_ids``) to a
            tensor whose first dimension indexes the examples.
        labels: 1-D tensor of class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One example as a dict of tensors; the 'labels' entry is what lets the
        # classification model compute a loss during training.
        example = {name: values[idx] for name, values in self.encodings.items()}
        example['labels'] = self.labels[idx]
        return example


# Build a deterministic label <-> id mapping from every class present in the
# data, so the classifier head gets one output per class (the default is 2).
label_names = sorted(df['label'].dropna().unique())
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}

# Load a sequence classification version of MBERT.
# NOTE: this rebinds `model`, replacing the encoder loaded earlier in the
# notebook; re-run the model-loading cell before re-running the embedding cells.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# The classifier is trained on the tokenized inputs (input_ids, attention_mask,
# ...), not on pooled sentence embeddings.
train_inputs = inputs

# `inputs` was built from the first rows of `df`, so take the labels of the
# same rows and convert the class names to integer ids.
n_train = train_inputs['input_ids'].shape[0]
label_ids = df['label'].head(n_train).map(label2id)
assert label_ids.notna().all(), "every training row needs a known label"
train_labels = torch.tensor(label_ids.to_numpy(), dtype=torch.long)

train_dataset = EncodedTextDataset(train_inputs, train_labels)

# Step 1: Create a TrainingArguments object
training_args = TrainingArguments(
    output_dir="./output",   # Directory where model checkpoints and logs will be saved
    num_train_epochs=3,      # Number of training epochs
    per_device_train_batch_size=16,
    save_steps=500,          # Save model checkpoints every 500 steps
    save_total_limit=2       # Limit the total number of saved checkpoints
)

# Step 2: Create the Trainer
trainer = Trainer(
    model=model,                  # Classification model to fine-tune
    args=training_args,           # Training arguments
    train_dataset=train_dataset   # Dataset yielding dicts of model inputs plus 'labels'
)

# Step 3: Train the model
trainer.train()
