# Fetching the dataset


In [82]:
# dependencies
import os
import pandas as pd
import numpy as np

In [83]:
# # fetching the dataset

# data = pd.read_parquet(
#     "hf://datasets/michsethowusu/twi-sentiments-corpus/data/train-00000-of-00001-871f629b5e597578.parquet"
# )

# print(f"len data1: {len(data)}")

In [84]:
# Read the two raw sources. The crowdsourced frame is relabelled so that both
# frames expose the "English" / "Akuapem Twi" column names used further down.
data1 = pd.read_csv("./data/raw/verified_data.csv")
data2 = pd.read_csv("./data/raw/crowdsourced_data.csv").set_axis(
    ["English", "Akuapem Twi"], axis=1
)

In [85]:
# Stack the two sources row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
data = pd.concat(objs=[data1, data2], axis=0, ignore_index=True)

In [86]:
data.head()

Unnamed: 0,English,Akuapem Twi
0,What she lacks in charisma she makes up for wi...,Nea onni ho adwempa no de adwumaden na ɛba.
1,There was nothing I could do about it.,Na biribiara nni hɛ a metumi ayɔ
2,Kwaku saw John and Abena holding hands.,Kwaku hui se John ne Abena kurakura wɛn nsa.
3,Can you stay till 2:30?,So wubetumi atena ha akosi nnɛnmienu npaamu ad...
4,You haven't got much time.,Wonni mmre


In [87]:
# Keep only the Twi column. NOTE: this rebinds `data` from the concatenated
# DataFrame to a single column (a Series), so re-running this cell on its own
# fails; re-run the concat cell first.
data = data["Akuapem Twi"]

In [88]:
data.tail()

26112            M'adwenkyerɛ ne sɛ ohia ma adwendwen
26113      Mebɔtɔ no akyɛre dodo, ɛsɛ sɛ metɔ foforow
26114                              Hwan na obisaa wo?
26115    Yaw dwenee Adwoa ho anadwo no nyinaa a wanna
26116                                 Woresɛe wommere
Name: Akuapem Twi, dtype: object

In [89]:
# Remove entries with a missing Twi sentence (NaN) before saving and cleaning.
data = data.dropna()

In [90]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs("./data/processed", exist_ok=True)
data.to_csv("./data/processed/twiSentences.csv")

# Preprocessing


In [91]:
import re
import unicodedata
import nltk
import pickle
import string
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the NLTK "punkt" and "punkt_tab" resources (presumably the models
# behind the word_tokenize / sent_tokenize imports above; confirm against the
# installed NLTK version). The call only reports "already up-to-date" when the
# package is present locally.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\os\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\os\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [92]:
# Characters a cleaned word may contain (checked by isAkan): lower-case letters
# plus the apostrophe. The Latin letters c, q, v, x and z are not listed, so any
# word containing one of them fails isAkan.
AKAN_CHARS = "aeiouɛɔbdfghjklmnprstwyŋ'"
# Set form of AKAN_CHARS, used for per-character membership tests.
AKAN_CHARS_SET = set(AKAN_CHARS)


def remAccents(text):
    """Strip combining diacritical marks from ``text``.

    The text is decomposed with NFD so that accents become separate combining
    code points (Unicode category ``Mn``); those are dropped, everything else
    (including ɛ, ɔ, ŋ and their capitals) is kept. The result stays in
    decomposed form.
    """
    always_keep = "ɛɔƐƆŋŊ"
    kept = []
    for ch in unicodedata.normalize("NFD", text):
        if ch in always_keep or unicodedata.category(ch) != "Mn":
            kept.append(ch)
    return "".join(kept)


def isAkan(word: str):
    """Return True when every character of ``word`` is in AKAN_CHARS_SET.

    An empty string has no offending character and therefore yields True.
    """
    return AKAN_CHARS_SET.issuperset(word)


def cleanText(text: str):
    """
    Normalise one sentence and keep only the words made purely of Akan characters.

    Steps: map typographic apostrophes to "'", strip diacritics with
    remAccents, lower-case, blank out every character other than ``a-z``,
    ``ɛ``, ``ɔ``, ``ŋ``, the apostrophe and whitespace, trim apostrophes from
    both ends of each word, then drop every word that is empty or fails isAkan.

    Args:
        text: the raw sentence.

    Returns:
        The surviving words joined by single spaces, or "" when no word survives.
    """
    # Curly apostrophes mark the same elisions as "'" (e.g. m’adwene). Without
    # this mapping the regex below would blank them and split the word in two.
    text = text.replace("\u2019", "'").replace("\u2018", "'")
    text = remAccents(text)
    text = text.lower()

    text = re.sub(r"[^a-zɛɔŋ'\s]", " ", text)

    # split() already collapses runs of whitespace and ignores the edges.
    stripped = (word.strip("'") for word in text.split())
    # A token made only of apostrophes strips to "", which isAkan accepts
    # (all() over nothing is True). Skip it so the result never contains
    # doubled or leading spaces, and is never a whitespace-only "sentence".
    pure_words = [word for word in stripped if word and isAkan(word)]

    return " ".join(pure_words)


def preprocess(data):
    """Run cleanText over every entry of ``data`` and return the non-empty results.

    Order is preserved; entries whose cleaned form is an empty string are left out.
    """
    cleaned = (cleanText(row) for row in data)
    return [sentence for sentence in cleaned if sentence]

In [93]:
# Clean every sentence; preprocess leaves out those that end up empty.
cleanedData=preprocess(data=data)

In [94]:
cleanedData[-10:]

['mɛnkɔ mmeretiawa yi mu',
 'yaw akatua yɛ mede mmɔho mmiensa',
 'mempɛ wiemhyɛn mu akwantu',
 'na ɛsɛ sɛ metɔ ade biribi wɔ sotɔɔ no mu nanso mewerefiie',
 'misusuw nea enti a ɔreteetee saa no ho',
 "m'adwenkyerɛ ne sɛ ohia ma adwendwen",
 'mebɔtɔ no akyɛre dodo ɛsɛ sɛ metɔ foforow',
 'hwan na obisaa wo',
 'yaw dwenee adwoa ho anadwo no nyinaa a wanna',
 'woresɛe wommere']

In [95]:
print(f"length of cleaned data: {len(cleanedData)}")

length of cleaned data: 26115


In [96]:
def tokenize(sentences):
    """Split each cleaned sentence into its list of word tokens.

    The sentences are expected to be cleaned already (lower-case words
    separated by spaces), so splitting on whitespace is sufficient. NLTK's
    word_tokenize is deliberately not used: it applies English-specific
    contraction rules and would break Twi words that happen to look like
    English contractions (for example "wanna" becomes "wan" + "na").

    Args:
        sentences: iterable of cleaned sentence strings.

    Returns:
        A list with one list of tokens per input sentence, in the same order.
        An empty sentence yields an empty token list.
    """
    return [sentence.split() for sentence in sentences]
    

In [97]:
# One list of word tokens per cleaned sentence.
sentenceTokens = tokenize(cleanedData)

# Splitting data into train and test

In [98]:
import random

SEED = 2025
TRAIN_FRACTION = 0.8  # share of the sentences that goes to the training split

# Shuffle a copy rather than sentenceTokens itself. Shuffling in place makes
# this cell non-idempotent: a second run would reshuffle the already-shuffled
# list and produce a different train/test split despite the fixed seed.
random.seed(SEED)
shuffledTokens = list(sentenceTokens)
random.shuffle(shuffledTokens)

# The first TRAIN_FRACTION of the shuffled sentences train; the rest test.
trainSize = int(len(shuffledTokens) * TRAIN_FRACTION)
trainData = shuffledTokens[:trainSize]
testData = shuffledTokens[trainSize:]

In [99]:
# Persist the training split (list of token lists) as a pickle file.
with open("./data/processed/trainTokens.pkl", mode="wb") as trainFile:
    pickle.dump(trainData, trainFile)

In [100]:
# Persist the held-out test split (list of token lists) as a pickle file.
with open("./data/processed/testTokens.pkl", mode="wb") as testFile:
    pickle.dump(testData, testFile)