Task 1: Implementation of tokenization methods without using libraries, for the English and Hausa languages

In [None]:
def whitespace_tokenization(sentence):
    """Split *sentence* into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so the result never
    contains empty tokens.
    """
    tokens = sentence.split()
    return tokens

def punctuation_based_tokenization(sentence):
    """Split *sentence* at punctuation marks, keeping each mark as a token.

    Only the characters in ``.,!?;:()[]{}"'`` act as boundaries. Whitespace
    is NOT a boundary, so the text between two marks (spaces included) is
    returned as a single token.
    """
    marks = '.,!?;:()[]{}"\''
    result = []
    pending = []  # characters collected since the last punctuation mark

    for ch in sentence:
        if ch not in marks:
            pending.append(ch)
            continue
        # A mark ends the current run and is emitted as its own token.
        if pending:
            result.append(''.join(pending))
            pending = []
        result.append(ch)

    # Flush whatever followed the final mark.
    if pending:
        result.append(''.join(pending))
    return result

def character_tokenization(sentence):
    """Return every character of *sentence* (spaces included) as a token."""
    return [ch for ch in sentence]

def custom_delimiter_tokenization(sentence, delimiter):
    """Split *sentence* on every occurrence of *delimiter*.

    Args:
        sentence: The text to tokenize.
        delimiter: The separator string. When it is empty (or ``None``),
            the sentence is split on whitespace instead, because
            ``str.split('')`` raises ``ValueError: empty separator``.

    Returns:
        The list of pieces between delimiters. With a non-empty delimiter,
        adjacent delimiters produce empty-string pieces, as ``str.split``
        does.
    """
    if not delimiter:
        # An empty delimiter is what the user gets by just pressing Enter at
        # the prompt; fall back to whitespace splitting rather than crash.
        return sentence.split()
    return sentence.split(delimiter)

def regex_based_tokenization(sentence, pattern):
    """Split *sentence* wherever the regular expression *pattern* matches.

    Args:
        sentence: The text to tokenize.
        pattern: A regular expression (string or compiled) marking the
            boundaries between tokens.

    Returns:
        The non-empty pieces between matches. ``re.split`` yields an empty
        string when a match sits at the start or end of the text or when two
        matches are adjacent, and ``None`` for a capture group that did not
        take part in a match; neither is a real token, so both are dropped.

    Raises:
        re.error: If *pattern* is not a valid regular expression.
    """
    import re
    return [piece for piece in re.split(pattern, sentence) if piece]

def _apply_merge(symbols, merge_op):
    """Fuse each left-to-right, non-overlapping run of *merge_op* in *symbols*."""
    parts = list(merge_op)
    width = len(parts)
    if width == 0:
        return symbols
    fused = ''.join(parts)
    merged = []
    i = 0
    while i < len(symbols):
        # Compare whole symbols, so a merge can never start or end in the
        # middle of a symbol produced by an earlier merge.
        if symbols[i:i + width] == parts:
            merged.append(fused)
            i += width
        else:
            merged.append(symbols[i])
            i += 1
    return merged


def byte_pair_encoding(sentence, merge_operations):
    """Segment each distinct word of *sentence* using the given BPE merges.

    Every word starts as its individual characters followed by the
    end-of-word marker ``</w>``. The merges are then applied in order; each
    one fuses adjacent symbols that match it exactly.

    Merging is done on symbol lists rather than with ``str.replace`` on the
    space-joined word: text replacement also matches inside longer symbols
    (for example, merging ``('e', 'l')`` would turn ``He l lo`` into
    ``Hel lo`` although ``e`` is not a symbol there).

    Args:
        sentence: Whitespace-separated text.
        merge_operations: Iterable of symbol tuples, e.g. ``[('l', 'o')]``,
            applied in the order given.

    Returns:
        One space-joined symbol string per distinct word, in order of first
        appearance.
    """
    segmented = {}
    for word in sentence.split():
        if word not in segmented:
            segmented[word] = list(word) + ['</w>']

    for merge_op in merge_operations:
        segmented = {
            word: _apply_merge(symbols, merge_op)
            for word, symbols in segmented.items()
        }

    return [' '.join(symbols) for symbols in segmented.values()]

# Gather the two sentences and the tokenizer settings from the user
print("Welcome to the Tokenization Program!")
sentence_en = input("Enter a sentence in English: ")
sentence_ha = input("Enter a sentence in Hausa: ")
delimiter = input("Enter a custom delimiter for tokenization: ")
regex_pattern = input("Enter a regex pattern for tokenization (e.g., '[.!?]'): ")

# Run every tokenizer on each sentence, one language at a time
print("\nTokenization Results:")
for language, text in (("English", sentence_en), ("Hausa", sentence_ha)):
    print(f"\n--- {language} Sentence ---")
    print("Original Sentence:", text)
    print("1. Whitespace Tokenization:", whitespace_tokenization(text))
    print("2. Punctuation-Based Tokenization:", punctuation_based_tokenization(text))
    print("3. Character Tokenization:", character_tokenization(text))
    print(
        f"4. Custom Delimiter Tokenization (Delimiter='{delimiter}'):",
        custom_delimiter_tokenization(text, delimiter),
    )
    print(
        f"5. Regex-Based Tokenization (Pattern='{regex_pattern}'):",
        regex_based_tokenization(text, regex_pattern),
    )

# Byte pair encoding with a fixed, hand-written merge list per language
merge_ops_en = [('l', 'o'), ('H', 'e'), ('e', 'l'), ('l', 'o')]
merge_ops_ha = [('g', 'w'), ('w', 'a'), ('a', 'j'), ('j', 'i')]
print("\n--- Byte Pair Encoding (BPE) ---")
print("English Sentence BPE:", byte_pair_encoding(sentence_en, merge_ops_en))
print("Hausa Sentence BPE:", byte_pair_encoding(sentence_ha, merge_ops_ha))


Welcome to the Tokenization Program!
Enter a sentence in English: welcome to my home !
Enter a sentence in Hausa: barka da zuwa gida na
Enter a custom delimiter for tokenization: n
Enter a regex pattern for tokenization (e.g., '[.!?]'): !

Tokenization Results:

--- English Sentence ---
Original Sentence: welcome to my home !
1. Whitespace Tokenization: ['welcome', 'to', 'my', 'home', '!']
2. Punctuation-Based Tokenization: ['welcome to my home ', '!']
3. Character Tokenization: ['w', 'e', 'l', 'c', 'o', 'm', 'e', ' ', 't', 'o', ' ', 'm', 'y', ' ', 'h', 'o', 'm', 'e', ' ', '!']
4. Custom Delimiter Tokenization (Delimiter='n'): ['welcome to my home !']
5. Regex-Based Tokenization (Pattern='!'): ['welcome to my home ', '']

--- Hausa Sentence ---
Original Sentence: barka da zuwa gida na
1. Whitespace Tokenization: ['barka', 'da', 'zuwa', 'gida', 'na']
2. Punctuation-Based Tokenization: ['barka da zuwa gida na']
3. Character Tokenization: ['b', 'a', 'r', 'k', 'a', ' ', 'd', 'a', ' ', 'z',

Task 2: Implementation of stemming methods without using libraries, for the English and Hausa languages

In [None]:
# Stemming Methods
def porter_stemmer(word):
    """Strip one common English suffix from *word* (simplified Porter-style).

    The suffixes ``ing``, ``ed``, ``es`` and ``s`` are tried in that order
    and the first one the word ends with is removed. Exactly the matched
    suffix is removed, so ``jumped`` becomes ``jump`` (stripping a fixed
    three characters would give ``jum``).

    A word that consists only of the suffix is returned unchanged, so the
    result is never an empty string for a non-empty word.
    """
    for suffix in ("ing", "ed", "es", "s"):
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            return stem if stem else word
    return word

def snowball_stemmer(word):
    """Strip one common English suffix from *word* (Snowball-like rules).

    The suffixes ``ing``, ``ed``, ``es`` and ``ly`` are tried in that order
    and the first one the word ends with is removed. Exactly the matched
    suffix is removed, so ``walked`` becomes ``walk`` (stripping a fixed
    three characters would give ``wal``).

    A word that consists only of the suffix is returned unchanged, so the
    result is never an empty string for a non-empty word.
    """
    for suffix in ("ing", "ed", "es", "ly"):
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            return stem if stem else word
    return word

def suffix_stripping_stemmer(word):
    """Remove the first matching suffix from *word*.

    Suffixes are tried in the order ``ing``, ``ed``, ``es``, ``ly``, ``s``;
    the word is returned unchanged when none of them matches.
    """
    for ending in ("ing", "ed", "es", "ly", "s"):
        if not word.endswith(ending):
            continue
        keep = len(word) - len(ending)
        return word[:keep]
    return word

def custom_rule_based_stemmer(word):
    """Stem *word* with three hand-written rules, applied in priority order.

    1. A trailing ``ies`` is replaced by ``y``.
    2. A trailing ``ing`` is removed.
    3. A trailing ``s`` is removed, unless the word is just ``s``.

    Words matching none of the rules are returned unchanged.
    """
    if word.endswith("ies"):
        return word[:-3] + "y"
    if word.endswith("ing"):
        return word[:-3]
    drops_plural_s = len(word) > 1 and word.endswith("s")
    return word[:-1] if drops_plural_s else word

# Lemmatization Methods
def rule_based_lemmatization(word):
    """Map a few known inflected forms of "run" and "jump" to their lemma.

    Any other word is returned unchanged.
    """
    # Each entry pairs a lemma with the surface forms that reduce to it.
    rules = (
        ("run", ("running", "ran", "runs")),
        ("jump", ("jumps", "jumped", "jumping")),
    )
    for lemma, forms in rules:
        if word in forms:
            return lemma
    return word

def dictionary_based_lemmatization(word):
    """Look *word* up in a small fixed lemma table.

    Words missing from the table are returned unchanged.
    """
    known_lemmas = dict(
        running="run",
        ran="run",
        jumps="jump",
        jumped="jump",
        children="child",
        better="good",
        worst="bad",
    )
    if word in known_lemmas:
        return known_lemmas[word]
    return word

# Process input sentences
def process_sentence(sentence, stemming_function, lemmatization_function):
    """Apply a stemmer and a lemmatizer to every word of *sentence*.

    The sentence is split on whitespace. Both functions are applied to the
    original words independently (the lemmatizer does not see stems).

    Returns:
        A ``(stemmed_words, lemmatized_words)`` tuple of lists.
    """
    tokens = sentence.split()
    stems = list(map(stemming_function, tokens))
    lemmas = list(map(lemmatization_function, tokens))
    return stems, lemmas

# Gather the two sentences from the user
print("Welcome to the Stemming and Lemmatization Program!")
sentence_en = input("Enter a sentence in English: ")
sentence_ha = input("Enter a sentence in Hausa: (Stemming only, lemmatization not supported): ")

# English: show the result of every stemmer in turn
print("\n--- English Stemming and Lemmatization ---")
print("Original Sentence:", sentence_en)

stemmers = (
    ("Porter Stemmer", porter_stemmer),
    ("Snowball Stemmer", snowball_stemmer),
    ("Suffix Stripping Stemmer", suffix_stripping_stemmer),
    ("Custom Rule-Based Stemmer", custom_rule_based_stemmer),
)
for name, stemmer in stemmers:
    # Only the stems are reported here; the lemmas are printed separately below.
    stemmed, lemmatized = process_sentence(sentence_en, stemmer, rule_based_lemmatization)
    print(f"\n{name}:")
    print("Stemmed Words:", stemmed)

# English: compare the two lemmatizers on the same words
print("\n--- Lemmatization ---")
lemmatized_rule = list(map(rule_based_lemmatization, sentence_en.split()))
lemmatized_dict = list(map(dictionary_based_lemmatization, sentence_en.split()))
print("Rule-Based Lemmatization:", lemmatized_rule)
print("Dictionary-Based Lemmatization:", lemmatized_dict)

# Hausa: stemming only, using the generic suffix stripper
print("\n--- Hausa Stemming ---")
print("Original Sentence:", sentence_ha)
stemmed_hausa = list(map(suffix_stripping_stemmer, sentence_ha.split()))
print("Stemmed Words:", stemmed_hausa)


Welcome to the Stemming and Lemmatization Program!
Enter a sentence in English: welcome to my world
Enter a sentence in Hausa: (Stemming only, lemmatization not supported): barka da zuwa duniya ta

--- English Stemming and Lemmatization ---
Original Sentence: welcome to my world

Porter Stemmer:
Stemmed Words: ['welcome', 'to', 'my', 'world']

Snowball Stemmer:
Stemmed Words: ['welcome', 'to', 'my', 'world']

Suffix Stripping Stemmer:
Stemmed Words: ['welcome', 'to', 'my', 'world']

Custom Rule-Based Stemmer:
Stemmed Words: ['welcome', 'to', 'my', 'world']

--- Lemmatization ---
Rule-Based Lemmatization: ['welcome', 'to', 'my', 'world']
Dictionary-Based Lemmatization: ['welcome', 'to', 'my', 'world']

--- Hausa Stemming ---
Original Sentence: barka da zuwa duniya ta
Stemmed Words: ['barka', 'da', 'zuwa', 'duniya', 'ta']
