# Fetching the dataset


In [1]:
# dependencies
import os
import pandas as pd
import numpy as np

In [2]:
# Load the Twi sentiment corpus (a single parquet shard) from its hf:// path.
TWI_CORPUS_URI = "hf://datasets/michsethowusu/twi-sentiments-corpus/data/train-00000-of-00001-871f629b5e597578.parquet"
data = pd.read_parquet(TWI_CORPUS_URI)

# Report how many rows were loaded.
rowCount = len(data)
print(f"len data1: {rowCount}")

  from .autonotebook import tqdm as notebook_tqdm


len data1: 432647


In [3]:
# Preview the first five rows to inspect the columns of the corpus.
data.head(5)

Unnamed: 0,Twi,sentiment,__index_level_0__
0,Nyansa mu na woyɛɛ ne nyinaa;,Positive,0
1,"alo yɛngɛ sone,",Negative,1
2,Wosɛe wɔn a wɔnni wo nokorɛ nyinaa.,Negative,2
3,Akatua bɛn na ɔde bɛma wɔn a wɔde gyidi som no?,Positive,3
4,mepɛ Onyankopɔn ho nimdeɛ sen ɔhyew afɔre.,Positive,4


In [4]:
# Keep only the Twi sentence column; nothing in this section uses the labels.
# The isinstance guard makes the cell safe to re-run: once `data` is already
# the "Twi" Series, indexing it with "Twi" again would raise a KeyError.
if isinstance(data, pd.DataFrame):
    data = data["Twi"]

In [5]:
# Confirm that `data` now holds only the Twi sentences.
data.head(5)

0                      Nyansa mu na woyɛɛ ne nyinaa;
1                                    alo yɛngɛ sone,
2                Wosɛe wɔn a wɔnni wo nokorɛ nyinaa.
3    Akatua bɛn na ɔde bɛma wɔn a wɔde gyidi som no?
4         mepɛ Onyankopɔn ho nimdeɛ sen ɔhyew afɔre.
Name: Twi, dtype: object

In [6]:
# Persist the raw Twi sentences. Create the output directory first so the cell
# also works on a fresh checkout where ./data/processed does not exist yet
# (to_csv does not create missing parent directories).
os.makedirs("./data/processed", exist_ok=True)
data.to_csv("./data/processed/twiSentences.csv")

# Preprocessing


In [7]:
import re
import unicodedata
import nltk
import pickle
import string
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the Punkt tokenizer resources through NLTK's downloader.
# NOTE(review): presumably required by `word_tokenize` (newer NLTK releases
# look for "punkt_tab", older ones for "punkt") — confirm against the
# installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\os\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\os\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# Characters accepted in an Akan word: lowercase vowels (including ɛ and ɔ),
# consonants (including ŋ) and the apostrophe.
AKAN_CHARS = "aeiouɛɔbdfghjklmnprstwyŋ'"
# Set form of the same characters for O(1) membership checks.
AKAN_CHARS_SET = {char for char in AKAN_CHARS}


def remAccents(text):
    """Strip combining diacritical marks while keeping ɛ, ɔ and ŋ intact.

    The text is decomposed to NFD so that accents become separate combining
    characters (Unicode category "Mn"); those are dropped unless they appear
    in the explicit keep-list. The result stays in decomposed form.
    """
    preserved = "ɛɔƐƆŋŊ"
    keptChars = []
    for char in unicodedata.normalize("NFD", text):
        isCombiningMark = unicodedata.category(char) == "Mn"
        if isCombiningMark and char not in preserved:
            continue
        keptChars.append(char)
    return "".join(keptChars)


def isAkan(word: str):
    """Return True when every character of ``word`` is in ``AKAN_CHARS_SET``.

    Note: an empty string yields True because there is no character to reject.
    """
    for char in word:
        if char not in AKAN_CHARS_SET:
            return False
    return True


def cleanText(text: str):
    """
    Normalize a sentence and remove all words with any non-Akan characters.

    Steps: strip diacritics, lowercase, replace every character other than
    ``a-z``, ``ɛ``, ``ɔ``, ``ŋ``, apostrophe and whitespace with a space, trim
    apostrophes from both ends of each word, then keep only the non-empty
    words accepted by ``isAkan``.

    Args:
        text: Raw sentence. Non-string values (e.g. ``None`` or a NaN from a
            missing row) are treated as an empty sentence.

    Returns:
        The cleaned sentence with words separated by single spaces, or an
        empty string when no usable word remains.
    """
    if not isinstance(text, str):
        return ""

    text = remAccents(text)
    text = text.lower()

    # Punctuation, digits and other symbols act as word separators.
    text = re.sub(r"[^a-zɛɔŋ'\s]", " ", text)

    pure_words = []
    # split() with no argument already collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate normalization pass is needed.
    for word in text.split():
        word = word.strip("'")
        # A token made only of apostrophes strips down to "". isAkan("") is
        # vacuously True, so it must be dropped explicitly; otherwise the
        # joined result carries stray spaces or is a whitespace-only string
        # that callers treat as a non-empty sentence.
        if word and isAkan(word):
            pure_words.append(word)

    return " ".join(pure_words)


def preprocess(data):
    """Clean every row of ``data`` with ``cleanText`` and drop empty results.

    Returns a list of the non-empty cleaned sentences in their input order.
    """
    cleanedRows = (cleanText(row) for row in data)
    return [sentence for sentence in cleanedRows if sentence]

In [9]:
# Clean every Twi sentence; rows with nothing left after cleaning are dropped.
cleanedData = preprocess(data)

In [10]:
# Spot-check the last ten cleaned sentences.
cleanedData[-10:]

['about netwow',
 'awurade kasa song',
 'ngmaa fo yuori ky',
 'step into my life shwbee dwee be dibbi de dwea',
 'draw a wma dee',
 'obra wo gye den',
 'marina daniel wow no',
 'draw attention wsj',
 'key man kwadwo asamoah',
 'me tuu ne tonga naa']

In [11]:
# Number of sentences that survived cleaning.
cleanedCount = len(cleanedData)
print(f"length of cleaned data: {cleanedCount}")

length of cleaned data: 432380


In [12]:
def tokenize(sentences):
    """Split each sentence into word tokens using ``word_tokenize``.

    Returns a list holding one token list per input sentence, in input order.
    """
    return [word_tokenize(sentence) for sentence in sentences]
    

In [13]:
# Tokenize every cleaned sentence into a list of word tokens.
sentenceTokens = tokenize(sentences=cleanedData)

# Splitting data into train and test

In [14]:
import random

SEED = 2025
TRAIN_FRACTION = 0.8  # share of sentences assigned to the training split

# Shuffle a copy rather than `sentenceTokens` itself. An in-place shuffle makes
# this cell non-idempotent: re-running it would reshuffle already-shuffled
# data and silently produce a different split than a fresh Restart & Run All.
# With the same seed and list length the permutation of the copy matches what
# the in-place shuffle produced on a first run; `sentenceTokens` keeps its
# original order.
random.seed(SEED)
shuffledTokens = list(sentenceTokens)
random.shuffle(shuffledTokens)

# The first TRAIN_FRACTION of the shuffled sentences train, the rest test.
trainSize = int(len(shuffledTokens) * TRAIN_FRACTION)
trainData = shuffledTokens[:trainSize]
testData = shuffledTokens[trainSize:]

# Every sentence must land in exactly one split.
assert len(trainData) + len(testData) == len(sentenceTokens)

In [15]:
# Serialize the training split (a list of token lists) to disk.
with open("./data/processed/trainTokens.pkl", "wb") as trainFile:
    pickle.dump(trainData, trainFile)

In [16]:
# Serialize the held-out test split (a list of token lists) to disk.
with open("./data/processed/testTokens.pkl", "wb") as testFile:
    pickle.dump(testData, testFile)