In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


In [None]:
# Every input file sits in the same Google Drive folder. The directory literal
# contains a space before '/KINNEWS' ("Uni of Pretoria /"); it is kept exactly
# as written so the resulting paths are unchanged.
_KINNEWS_DIR = '/content/drive/MyDrive/Research/Uni of Pretoria /KINNEWS/'

kr_train_path = _KINNEWS_DIR + 'kr_train_clean.csv'   # labelled tweets loaded into `df`
kinn_train_path = _KINNEWS_DIR + 'Copy of train.csv'
kinn_test_path = _KINNEWS_DIR + 'Copy of test.csv'
embd_path = _KINNEWS_DIR + 'W2V-Kin-100.txt'

In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
!pip install transformers[torch]



In [None]:
# Read the training CSV and expose the 'tweet' column under the name 'text',
# which is the column name the rest of the notebook works with.
df = pd.read_csv(kr_train_path).rename(columns={'tweet': 'text'})
df.head()

Unnamed: 0,text,label
0,Hhhhhh ntabyihogoza ubu x abo yishe ban...,negative
1,Amahano Ni impanuka inkangu inzara Muyite izi...,negative
2,Ese umuntu aguhaye miliyoni 7 zidorali ngo ary...,negative
3,Ugira amagambo kandi Ubwo wasanga nawe byaguta...,negative
4,Ukuntu inama zose zikomeye zirikubera Mu Rwand...,negative


In [None]:
# Convert the string sentiment labels into integer class ids.
#
# The original strings are preserved in 'label_name' and the encoder is always
# fitted on that column. This keeps the cell idempotent: running it a second
# time no longer re-fits the encoder on the already-encoded integers, which
# would leave `label_encoder` unable to translate label names later on.
label_encoder = LabelEncoder()
if 'label_name' not in df.columns:
    df['label_name'] = df['label']
df['label'] = label_encoder.fit_transform(df['label_name'])
df.tail()

Unnamed: 0,text,label
3297,Tugukunda kurusha mukobwa mwiza Amahoro ibyish...,2
3298,Sobanukirwa IBYIZA MASSAGE IFITEYE UMUBIRI ht...,2
3299,Mushobora kugira uruhare muri iki kiganiro mut...,2
3300,22 Ntuduhane mu bitwoshya Ahubwo udukize Umubi...,2
3301,Ni umuyobozi wintangarugero aho ageze ho...,2


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Tokenizer matching the multilingual BERT checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Encode the training and validation texts with identical settings
# (truncation enabled, padding enabled, max_length of 128). The training split
# is encoded first, then the validation split.
train_encodings, val_encodings = (
    tokenizer(split.tolist(), truncation=True, padding=True, max_length=128)
    for split in (train_texts, val_texts)
)

In [None]:
class KinyarwandaDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of
            values; each field is indexed with the example position.
        labels: Sequence of labels, indexed with the example position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'``
        entry with the example's label.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    """Compute classification metrics from a (predictions, labels) pair.

    Args:
        p: A pair that unpacks into per-class scores and the reference
            labels. The predicted class is the argmax of the scores along
            axis 1.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; precision/recall/F1 use ``average='weighted'``.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted')
    acc = accuracy_score(gold, predicted)
    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

In [None]:
# Wrap each split for the Trainer. The label Series are converted to plain
# lists so that KinyarwandaDataset indexes them by position.
train_dataset = KinyarwandaDataset(
    encodings=train_encodings,
    labels=train_labels.tolist(),
)
val_dataset = KinyarwandaDataset(
    encodings=val_encodings,
    labels=val_labels.tolist(),
)


In [None]:
# Multilingual BERT with a 3-way classification head (labels are encoded as
# the integers 0, 1 and 2).
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Research/Uni of Pretoria /KINNEWS/results',          # Output directory
    num_train_epochs=20,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log the training loss every 10 steps
)

# Create Trainer instance.
# compute_metrics is passed explicitly: without it, trainer.evaluate() and
# trainer.predict() report only the loss and never the accuracy / F1 /
# precision / recall that compute_metrics calculates.
trainer = Trainer(
    model=model,                         # The instantiated Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
    compute_metrics=compute_metrics,     # Metrics computed on every evaluation
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.1327
20,1.125
30,1.1333
40,1.0836
50,1.0564
60,1.0532
70,1.0735
80,1.0836
90,1.0979
100,1.0811


KeyboardInterrupt: 

In [None]:
# Evaluate the model. No dataset is passed here, so the evaluation relies on
# the eval_dataset that was given to the Trainer when it was created.
eval_results = trainer.evaluate()
# Show the returned metrics dictionary.
print(eval_results)

Step,Training Loss,Validation Loss
10,1.1327,
20,1.125,
30,1.1333,
40,1.0836,
50,1.0564,
60,1.0532,
70,1.0735,
80,1.0836,
90,1.0979,
100,1.0811,


{'eval_loss': 1.090915560722351}


In [None]:
import pandas as pd

# Small hand-written test set. Each sentence is stored next to its expected
# sentiment so the two cannot drift out of alignment.
_labelled_examples = [
    ("Iki gikombe kiraryoshye cyane.", 'positive'),
    ("byiza cyane bidasanzwe", 'positive'),
    ("Ntabwo nishimiye uburyo ikorwa.", 'negative'),
    ("Ndatekereza ko iki gitekerezo ari kiza.", 'positive'),
    ("Uyu mukino urasanzwe.", 'neutral'),
    ("Imvura irenze urugero ni ikibazo.", 'negative'),
    ("Sinzi neza uko byagenze.", 'neutral'),
    ("Ndabikunze cyane.", 'positive'),
    ("Birababaje.", 'negative'),
    ("Ntacyo bivuze cyane.", 'neutral'),
    ("Bishimishije rwose.", 'positive'),
]

# Column-oriented view of the same examples: parallel 'text' / 'label' lists.
test_data = {
    'text': [sentence for sentence, _ in _labelled_examples],
    'label': [sentiment for _, sentiment in _labelled_examples],
}

# Create a DataFrame with one row per example
test_df = pd.DataFrame(test_data)

# Display the test DataFrame
print(test_df)


                                       text     label
0            Iki gikombe kiraryoshye cyane.  positive
1                    byiza cyane bidasanzwe  positive
2           Ntabwo nishimiye uburyo ikorwa.  negative
3   Ndatekereza ko iki gitekerezo ari kiza.  positive
4                     Uyu mukino urasanzwe.   neutral
5         Imvura irenze urugero ni ikibazo.  negative
6                  Sinzi neza uko byagenze.   neutral
7                         Ndabikunze cyane.  positive
8                               Birababaje.  negative
9                      Ntacyo bivuze cyane.   neutral
10                      Bishimishije rwose.  positive


In [None]:
# --- Prepare the test examples in the same format as the training data ---
test_texts = test_df['text'].tolist()
# Translate the label names into integer ids with the already-fitted encoder.
test_labels = label_encoder.transform(test_df['label'])

# Same tokenizer settings as for the training and validation splits.
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)
test_dataset = KinyarwandaDataset(test_encodings, test_labels.tolist())

# --- Run inference ---
# trainer.predict returns three values; the third one is not used here.
predictions, labels, _ = trainer.predict(test_dataset)

# Pick the highest-scoring class per example, then map ids back to label names.
predicted_labels = np.argmax(predictions, axis=1)
predicted_label_names = label_encoder.inverse_transform(predicted_labels)

# --- Print the first five texts with their predicted sentiment ---
for text, predicted_label in list(zip(test_texts, predicted_label_names))[:5]:
    print(f'Text: {text}\nPredicted Sentiment: {predicted_label}\n')


Step,Training Loss,Validation Loss
10,1.1327,
20,1.125,
30,1.1333,
40,1.0836,
50,1.0564,
60,1.0532,
70,1.0735,
80,1.0836,
90,1.0979,
100,1.0811,


Text: Iki gikombe kiraryoshye cyane.
Predicted Sentiment: neutral

Text: byiza cyane bidasanzwe
Predicted Sentiment: neutral

Text: Ntabwo nishimiye uburyo ikorwa.
Predicted Sentiment: neutral

Text: Ndatekereza ko iki gitekerezo ari kiza.
Predicted Sentiment: neutral

Text: Uyu mukino urasanzwe.
Predicted Sentiment: neutral

