# References

KINLP at SemEval-2023 Task 12: Kinyarwanda Tweet Sentiment Analysis

https://aclanthology.org/2023.semeval-1.98.pdf

https://huggingface.co/DigitalUmuganda/sentiment_analysis_kinyarwanda/tree/main

https://github.com/Andrews2017/KINNEWS-and-KIRNEWS-Corpus/tree/main?tab=readme-ov-file

# Importing Dependencies

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from
# this mount later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd
import transformers
import os
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, pipeline

# KinyaBERT Model


In [None]:
# Clone the model repository from the Hugging Face Hub into the current working
# directory (creates ./sentiment_analysis_kinyarwanda).
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell in the same session will fail — confirm whether a guard is wanted.
!git clone https://huggingface.co/DigitalUmuganda/sentiment_analysis_kinyarwanda

Cloning into 'sentiment_analysis_kinyarwanda'...
remote: Enumerating objects: 10, done.[K
remote: Total 10 (delta 0), reused 0 (delta 0), pack-reused 10 (from 1)[K
Unpacking objects: 100% (10/10), 1.83 KiB | 98.00 KiB/s, done.


In [None]:
# Where the cloned checkpoint lives, plus the identifiers used when loading it.
# The file paths are derived from `model_dir` so the directory is spelled once.
model_dir = '/content/sentiment_analysis_kinyarwanda'
model_name = 'DigitalUmuganda/sentiment_analysis_kinyarwanda'
config_path = f'{model_dir}/config.json'
model_path = f'{model_dir}/pytorch_model.bin'
# NOTE(review): this is the stock `bert-base-uncased` tokenizer, not one shipped
# with the cloned checkpoint — confirm it matches the vocabulary the model was trained with.
tokenizer_name = 'bert-base-uncased'

In [None]:
# Load the checkpoint configuration, the sequence-classification model and the tokenizer.
# `config` is read from config.json; the model itself is loaded from the cloned directory.
config = AutoConfig.from_pretrained(config_path)
# NOTE(review): the load log printed under this cell reports that the checkpoint's
# `classifier.*` weights were NOT used and that the classification head was newly
# initialized, i.e. the head that produces the predictions is untrained. Verify that
# BertForSequenceClassification is the right architecture for this checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

Some weights of the model checkpoint at /content/sentiment_analysis_kinyarwanda were not used when initializing BertForSequenceClassification: ['classifier.0.bias', 'classifier.0.weight', 'classifier.2.bias', 'classifier.2.weight', 'classifier.5.bias', 'classifier.5.weight', 'classifier.7.bias', 'classifier.7.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/sentiment_analysis_kinyarwanda and are newly initialized: ['classifier.bias', 'cla

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Smoke test: build a classification pipeline from the loaded model and tokenizer
# and score a single Kinyarwanda sentence.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
text = "Urugero rw'inkuru y'ibyishimo mu Kinyarwanda"

# `result` is a list with one dict holding the top label and its score.
result = sentiment_pipeline(text)
print(result)

[{'label': 'LABEL_1', 'score': 0.589431881904602}]


# Kinyarwanda Dataset (AfriSenti-SemEval 2023 Task 12)

In [None]:
# Load the tab-separated training split from the mounted Drive. Per the preview
# below it has three columns: ID, tweet and label.
df = pd.read_csv('/content/drive/MyDrive/Research/Uni of Pretoria /KINNEWS/kr_train.tsv', sep='\t')
df.head()

Unnamed: 0,ID,tweet,label
0,kr_train_00001,@user @user @user @user @user @user @user Hhhh...,negative
1,kr_train_00002,"@user Amahano?! Ni impanuka, inkangu, inzara.....",negative
2,kr_train_00003,Ese umuntu aguhaye miliyoni 7 zidorali ngo ary...,negative
3,kr_train_00004,Ugira amagambo😏 kandi Ubwo wasanga nawe byagut...,negative
4,kr_train_00005,Ukuntu inama zose zikomeye zirikubera Mu Rwand...,negative


In [None]:
# Drop the identifier column; no later cell reads it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it once 'ID' is gone no longer raises KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [None]:
# Preview the frame to confirm the ID column is gone.
df.head()

Unnamed: 0,tweet,label
0,@user @user @user @user @user @user @user Hhhh...,negative
1,"@user Amahano?! Ni impanuka, inkangu, inzara.....",negative
2,Ese umuntu aguhaye miliyoni 7 zidorali ngo ary...,negative
3,Ugira amagambo😏 kandi Ubwo wasanga nawe byagut...,negative
4,Ukuntu inama zose zikomeye zirikubera Mu Rwand...,negative


In [None]:
!pip install demoji

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m338.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [None]:
import re
import demoji

In [None]:
def preprocess(text):
    """Normalize a raw tweet to lowercase ASCII letters and whitespace.

    Steps: lowercase the text, remove the anonymized ``@user`` mention token,
    then delete every character that is not an ASCII letter or whitespace
    (digits, punctuation, emoji, accented letters).

    Args:
        text: Raw tweet. Non-string values (e.g. NaN for a missing cell) are
            treated as an empty tweet.

    Returns:
        The cleaned string. Runs of whitespace are left as they are.
    """
    if not isinstance(text, str):
        # A missing cell is not a str; calling `.lower()` on it would raise.
        return ''
    text = text.lower()
    text = re.sub(r'@user', '', text)
    # This character class already strips emoji (they are neither ASCII letters
    # nor whitespace), so a separate emoji-removal pass after it could never
    # find anything to remove and is not needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [None]:
# Clean every tweet element-wise; the raw text in the `tweet` column is
# overwritten. The frame is then displayed as the cell output.
df['tweet'] = df['tweet'].map(preprocess)
df

Unnamed: 0,tweet,label
0,hhhhhh ntabyihogoza ubu x abo yishe ban...,negative
1,amahano ni impanuka inkangu inzara muyite izi...,negative
2,ese umuntu aguhaye miliyoni zidorali ngo arya...,negative
3,ugira amagambo kandi ubwo wasanga nawe byaguta...,negative
4,ukuntu inama zose zikomeye zirikubera mu rwand...,negative
...,...,...
3297,tugukunda kurusha mukobwa mwiza amahoro ibyish...,positive
3298,sobanukirwa ibyiza massage ifiteye umubiri ht...,positive
3299,mushobora kugira uruhare muri iki kiganiro mut...,positive
3300,ntuduhane mu bitwoshya ahubwo udukize umubi k...,positive


# Running the Model

In [None]:
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Label string the pipeline emits for the positive class.
# NOTE(review): assumes class index 1 means "positive", matching the
# {'negative': 0, 'positive': 1} encoding applied to the gold labels later in
# this notebook — confirm against the checkpoint's id2label mapping.
POSITIVE_LABEL = 'LABEL_1'

def sentiment_score(text):
    """Return the predicted class for one text: 1 if positive, else 0.

    The pipeline returns only the top-scoring label together with its score.
    That score says how confident the model is in the winning label, not
    which label won, so thresholding it at 0.5 maps nearly every input to 1
    (in the recorded run every previewed row came out as 1). The decision is
    therefore taken from the label itself.

    Args:
        text: The (preprocessed) tweet to classify.

    Returns:
        1 when the top label equals ``POSITIVE_LABEL``, otherwise 0.
    """
    top_prediction = sentiment_pipeline(text)[0]
    return 1 if top_prediction['label'] == POSITIVE_LABEL else 0

In [None]:
# Spot-check the scorer on a single row (positional index 3280).
sentiment_score(df['tweet'].iloc[3280])

1

In [None]:
# Score every tweet (one pipeline call per row) and store the 0/1 result in a
# new `predicted_sentiment` column, then print a plain-text preview.
df['predicted_sentiment'] = df['tweet'].apply(sentiment_score)
print(df.head())

                                               tweet     label  \
0         hhhhhh ntabyihogoza ubu x abo yishe ban...  negative   
1   amahano ni impanuka inkangu inzara muyite izi...  negative   
2  ese umuntu aguhaye miliyoni  zidorali ngo arya...  negative   
3  ugira amagambo kandi ubwo wasanga nawe byaguta...  negative   
4  ukuntu inama zose zikomeye zirikubera mu rwand...  negative   

   predicted_sentiment  
0                    1  
1                    1  
2                    1  
3                    1  
4                    1  


In [None]:
# Rich preview of the frame including the new `predicted_sentiment` column.
df.head()

Unnamed: 0,tweet,label,predicted_sentiment
0,hhhhhh ntabyihogoza ubu x abo yishe ban...,negative,1
1,amahano ni impanuka inkangu inzara muyite izi...,negative,1
2,ese umuntu aguhaye miliyoni zidorali ngo arya...,negative,1
3,ugira amagambo kandi ubwo wasanga nawe byaguta...,negative,1
4,ukuntu inama zose zikomeye zirikubera mu rwand...,negative,1


In [None]:
# Encode the gold labels as integers. Labels that are not keys of the map come
# out of Series.map as NaN.
label_map = {'negative': 0, 'positive': 1}
# Only encode while the column still holds the raw string labels. Mapping a
# second time would look up the already-encoded 0/1 values in a dict keyed by
# strings and turn every label into NaN, so re-running this cell must be a no-op.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_map)

In [None]:
# Keep only rows without a NaN in any column.
df = df.dropna()

In [None]:
# Sanity check: list the distinct encoded label values that remain.
df['label'].unique()

array([0., 1.])

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of rows whose predicted class equals the encoded gold label.
accuracy = accuracy_score(
    y_true=df['label'],
    y_pred=df['predicted_sentiment'],
)
print('Accuracy:', accuracy)

Accuracy: 0.4396088019559902
