In [20]:
# Step 1: Extract Text from the PDF
!pip install pdfplumber



In [21]:
import pdfplumber

# Load the correct file path
pdf_path = "/Urhobo_Dictionary_by_Ebireri_Okrokoto_Ur.pdf"  # Update this if the path differs in Colab

all_text = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:
            all_text += page_text + "\n"

# Show a small portion to confirm
print(all_text[:2000])

U R H O B O TO E N G L I S H
D I C T I O N A R Y
by
EBIRERI OKROKOTO
URHOBO LANGUAGE INSTITUTE
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
TONE MARKS (FOUR IN ALL) d , r , m ,
~
1. ( `) d for lowered voice : Has a short stroke pointing left.
2.(ˊ ) m for raised voice : Has a short stroke pointing right.
3. ( ) Slur for drawing the voice on the sound of the vowel
~
and is represented by a wave-like mark on such vowel.
4.( ) nothing is for flat tone that represent middle voice.
THE ALPHABETS
A B CH D DJ E Ẹ F G GB GH GHW H HW I J K KP L M N O ọ
P PH R RH S SH T U V VW W Y Z ( VOWELS= A E Ẹ I O ọ U )
1
A
URHOBO WORDS ENGLISH MEANING
Abaka…………………… Grasshopper
Abe……………………… A species of water yam with red
colored leaves and tuber
Abe……………………… Guilty verdict
Abo……………………….. Wrestling. Also means HANDS - plural form
for hand called Ob
Abotu……………………. Group effort
2
Abamw

In [22]:
# Updated pattern to capture urhobo → english pairs
pattern = r"^([A-Za-zÀ-ÿ’ẹọĒēū́̀\-()!]+(?:\s+[A-Za-zÀ-ÿ’ẹọĒēū́̀\-()!]+)*)\s*[.·•‧…—-]{3,}\s*(.+)$"

# Use re.MULTILINE so we check each line individually
matches = re.findall(pattern, all_text, re.MULTILINE)

# Check result count
print(f"Total matches found: {len(matches)}")

# Preview first few
for i in range(min(10, len(matches))):
    print(f"Urhobo: {matches[i][0]} → English: {matches[i][1]}")


Total matches found: 648
Urhobo: U R H O B O TO E N G L I S H
D I C T I O N A R Y
by
EBIRERI OKROKOTO
URHOBO LANGUAGE INSTITUTE
Urhobo: A
URHOBO WORDS ENGLISH MEANING
Abaka → English: Grasshopper
Urhobo: Abe → English: A species of water yam with red
Urhobo: colored leaves and tuber
Abe → English: Guilty verdict
Urhobo: Abo → English: Wrestling. Also means HANDS - plural form
Urhobo: Abotu → English: Group effort
Urhobo: Abamwa → English: Rag ; torn piece of cloth
Urhobo: Ada → English: Outing ; out from home : Kpẹ ada = gone
Urhobo: Adja → English: Bat
Urhobo: Adjudju → English: Hand-held fan usually made from animal skin


In [23]:
# STEP 2.5: Filter and Fix the Matches
cleaned_entries = []
skip_words = ["URHOBO", "ENGLISH", "DICTIONARY", "INSTITUTE", "ALPHABETS", "TONE", "MEANING"]

last_urhobo = ""
last_english = ""

for urhobo, english in matches:
    # Clean up whitespace
    urhobo = urhobo.strip()
    english = english.strip()

    # Skip junk headers
    if any(word in urhobo.upper() for word in skip_words):
        continue
    if urhobo == "A":
        continue
    # If the English part is incomplete (ends with comma or short), store and wait to complete
    if len(english) < 6 or english[-1] in [',', ';', '-']:
        last_urhobo = urhobo
        last_english = english
        continue

    # If last_english exists, join it
    if last_urhobo:
        english = last_english + " " + english
        urhobo = last_urhobo
        last_urhobo = ""
        last_english = ""

    # Add to cleaned list
    cleaned_entries.append((urhobo, english))

# Preview cleaned
print(f"Cleaned entries: {len(cleaned_entries)}")
for i in range(10):
    print(f"{cleaned_entries[i][0]} → {cleaned_entries[i][1]}")


Cleaned entries: 466
Abe → A species of water yam with red
colored leaves and tuber
Abe → Guilty verdict
Abo → Wrestling. Also means HANDS - plural form
Abotu → Group effort
Abamwa → Rag ; torn piece of cloth
Ada → Outing ; out from home : Kpẹ ada = gone
Adja → Bat Hand-held fan usually made from animal skin
Afiotọ → Giant African rabbit
Afọrhe → Brain Bridge
Agbara → Chair King fisher


In [24]:
# Improved Fix for Overmerged English Definitions
final_entries = []

for urhobo, english in cleaned_entries:
    # Split if it looks like two dictionary entries got merged (e.g., "Chair King fisher")
    parts = re.split(r'(?<=[a-z])\s+(?=[A-Z])', english)  # split where lowercase is followed by uppercase
    if len(parts) > 1:
        # Add multiple entries for one Urhobo word (safer than duplicating Urhobo word)
        for part in parts:
            part = part.strip(" ;,")
            if part:
                final_entries.append((urhobo, part))
    else:
        final_entries.append((urhobo, english.strip(" ;,")))

# Final preview
print(f"Final cleaned entries: {len(final_entries)}")
for i in range(10):
    print(f"{final_entries[i][0]} → {final_entries[i][1]}")


Final cleaned entries: 599
Abe → A species of water yam with red
colored leaves and tuber
Abe → Guilty verdict
Abo → Wrestling. Also means
Abo → HANDS - plural form
Abotu → Group effort
Abamwa → Rag ; torn piece of cloth
Ada → Outing ; out from home : Kpẹ ada = gone
Adja → Bat
Adja → Hand-held fan usually made from animal skin
Afiotọ → Giant


In [25]:
# Final cleanup: remove broken lines, split if there's an embedded newline
fixed_entries = []

for urhobo, english in final_entries:
    # If there's a newline in English with a new Urhobo word after it
    if '\n' in english:
        parts = english.split('\n')
        first_part = parts[0].strip()
        for part in parts[1:]:
            # If part starts with an Urhobo-like word, split it
            subparts = re.split(r'\s+', part.strip(), maxsplit=1)
            if len(subparts) == 2:
                urhobo_new, english_new = subparts
                fixed_entries.append((urhobo_new.strip(), english_new.strip()))
        fixed_entries.append((urhobo.strip(), first_part))
    else:
        fixed_entries.append((urhobo.strip(), english.strip()))


In [26]:
df = pd.DataFrame(fixed_entries, columns=["urhobo", "english"])
df.to_csv("urhobo_english_corpus.csv", index=False)

In [27]:
# Checking
df.head(10)  # shows the first 10 rows
df.sample(10, random_state=1)  # shows 10 random rows for spot checking


Unnamed: 0,urhobo,english
442,Rho,Pre-cook ; Not totally done ; Preservative
286,Iporoma,Button
379,Oma,Body
522,Urhiẹ,Place of abode
200,Furhie,Quench ; put out fire
23,Ahe,Night owl
414,Phoro,Erase
241,Ghoro,Roam ; Moving around without
66,Ato,Desert
165,Erhuru,Shore ; Beach ; River’s edge


In [28]:
# total number of rows and missing values:
print(f"Total rows: {len(df)}")
print("Missing values:\n", df.isnull().sum())


Total rows: 599
Missing values:
 urhobo     0
english    0
dtype: int64


In [29]:
# Check for any empty Urhobo or English columns
df[(df["urhobo"] == "") | (df["english"] == "")]

Unnamed: 0,urhobo,english


In [30]:
# Download the CSV
from google.colab import files
files.download("urhobo_english_corpus.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>