In [1]:
import pandas as pd

## Utilities

In [43]:
from utils.cleaning import remove_emojis

## Load data

In [30]:
df = pd.read_csv('data/100kdata.csv')
df.head()

Unnamed: 0,text,label
0,lentu la letati nka re thobela,neutral
1,ka mosa le mogau wa modimo re bone hlabo ya le...,positive
2,khadjom thobelafmyaka 015 ka mokopane mesong g...,negative
3,nobodysmakoti2 takatina1 bathandwayo ge go sel...,positive
4,powerfm987 tshegomoagi gabotse taba ya gore le...,negative


#### Loading the Model and Tokenizer

In [31]:
from transformers import BertModel, BertTokenizer

# Load the MBERT model
model = BertModel.from_pretrained('bert-base-multilingual-uncased')

# Load the corresponding tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

#### Preparing Your Text Data

In [54]:
# Prepare a sample text
text = [ instance for instance in df['text'].values ]
# text[: 100] # use only 100

# # Tokenize the text
inputs = tokenizer(text, padding=True, return_tensors='pt')
print(inputs.keys())
# inputs

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


#### Generating Embeddings

In [55]:
# Get the embeddings
outputs = model(**inputs)

# The last hidden-state is the first element of outputs
last_hidden_state = outputs[0]
last_hidden_state.shape

#### Using the Embeddings

In [34]:
# Example: get the average of the embeddings to represent the sentence
sentence_embedding = last_hidden_state.mean(dim=1)
print(sentence_embedding.shape)

torch.Size([2, 768])


#### Fine-Tuning MBERT

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load a sequence classification version of MBERT
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')

# Suppose we have some training data in two tensors: inputs and labels
train_inputs = sentence_embedding # This should be your tensor of input data
train_labels = df['label'] # This should be your tensor of labels

# Set up the trainer
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3)
trainer = Trainer(model=model, args=training_args, train_dataset=train_inputs, labels=train_labels)

# Train the model
trainer.train()