In [18]:
# Import Libraries
import pandas as pd
import unicodedata
import os

In [19]:
# Create the output directory for processed data; exist_ok=True makes this a
# no-op when the directory is already present.
os.makedirs(name="../data/processed", exist_ok=True)

In [20]:
# Read the raw word list, skipping the first 8 lines of the file (metadata rows)
df = pd.read_csv(
    filepath_or_buffer="../data/raw/raw_words.csv",
    skiprows=8,
)

In [21]:
# Replace whatever labels were parsed from the file with a fixed set of nine
# column names (the assignment fails if the frame does not have nine columns).
df.columns = [
    "word",
    "lemma",
    "variants",
    "pos",
    "en_translation",
    "tl_translation",
    "meaning",
    "example_sentence",
    "source_id",
]

In [22]:
# Remove the row labelled 0 (the header row that was read in as data),
# then renumber the remaining rows from 0.
df = df.drop(index=[0])
df = df.reset_index(drop=True)

In [23]:
# Trim leading/trailing whitespace from every string cell in the frame
def _strip_if_str(cell):
    """Return `cell` stripped of surrounding whitespace if it is a string, else unchanged."""
    if isinstance(cell, str):
        return cell.strip()
    return cell


df = df.apply(lambda series: series.map(_strip_if_str))

In [24]:
# Lowercase the lexical columns; "meaning" and "example_sentence" keep their casing
df = df.assign(**{
    name: df[name].str.lower()
    for name in ("word", "lemma", "variants", "pos", "en_translation", "tl_translation")
})

In [25]:
# Convert every string cell to Unicode NFC form so diacritics are represented consistently
def _to_nfc(cell):
    """Return `cell` NFC-normalized if it is a string, else unchanged."""
    if isinstance(cell, str):
        return unicodedata.normalize("NFC", cell)
    return cell


df = df.apply(lambda series: series.map(_to_nfc))

In [26]:
# Standardize POS tags
# Maps lowercase part-of-speech labels to short upper-case tags.
pos_map = {
    "verb":"VERB",
    "noun":"NOUN",
    "adjective":"ADJ",
    "adverb":"ADV",
    "pronoun":"PRON",
    "conj":"CONJ","conjunction":"CONJ",
    "determiner":"DET",
    "preposition":"ADP",
    "particle":"PART",
    "numeral":"NUM",
}
# Labels found in pos_map are replaced by their tag; any other string is simply
# upper-cased. Non-string cells (e.g. NaN for a missing POS) are passed through
# untouched -- calling .upper() on them would raise AttributeError.
df["pos"] = df["pos"].map(
    lambda x: pos_map.get(x, x.upper()) if isinstance(x, str) else x
)


In [27]:
# Replace every missing value in the frame with the literal string "None"
df = df.fillna(value="None")

In [28]:
# Read the table of sources and show which columns it provides
sources_df = pd.read_csv(filepath_or_buffer="../data/raw/sources.csv")

print(sources_df.columns)

Index(['source_id', 'details', 'date_collected', 'collector', 'link'], dtype='object')


In [29]:
# Left-join the source table onto the lexicon by source_id (this brings in the link),
# then discard the date_collected and collector columns.
df = (
    df.merge(sources_df, how="left", on="source_id")
      .drop(columns=["date_collected", "collector"])
)

In [30]:
# Create a nested 'source' column
# Each row gets a {"source_id": ..., "link": ...} dict. The records are built
# directly from the two columns instead of with a row-wise apply(axis=1): that
# avoids constructing a Series per row and also yields an (empty) list for an
# empty frame, where row-wise apply does not produce a Series of dicts.
df["source"] = df[["source_id", "link"]].to_dict(orient="records")

In [31]:
# Remove the flat source_id, link and details columns from the frame
df = df.drop(["source_id", "link", "details"], axis="columns")

In [32]:
# Write the cleaned lexicon to ../data/processed as a JSON array of records,
# indented by 2 spaces; force_ascii=False keeps non-ASCII characters unescaped.
df.to_json(
    "../data/processed/kapampangan_lexicon.json",
    orient="records",
    indent=2,
    force_ascii=False,
)

In [33]:
# Show the first ten rows of the cleaned frame
df.head(n=10)

Unnamed: 0,word,lemma,variants,pos,en_translation,tl_translation,meaning,example_sentence,source
0,abu,abu,,NOUN,ashes,abo,The solid remnants of fires,"Mitu na ing kayu, atilu ne mu la ring abu","{'source_id': 'dict_001', 'link': 'https://acd..."
1,abung,abung,,NOUN,hut; shelter,kubo,Small simple dwelling,Ing talapanulu atin yang abung king talon.,"{'source_id': 'dict_006', 'link': 'https://siw..."
2,achi,achi,ate,NOUN,older sister,ate,Female sibling older than oneself,Achi ku ing sinaup king assignment ku.,"{'source_id': 'dict_005', 'link': 'https://kap..."
3,adua,adua,aduwa,NUM,two,dalawa,Number representing “2”,Aduang anak ing maki-sapatos.,"{'source_id': 'dict_005', 'link': 'https://kap..."
4,adwang dalan,adwang dalan,,NUM,two hundred,dalawang daan,Number representing “200”,Adwang dalan la reng manok.,"{'source_id': 'dict_005', 'link': 'https://kap..."
5,agpang,agpang,,ADJ,fitting; appropriate,angkop,Something that matches or suits,Agpang ya ing sapatu kaku.,"{'source_id': 'dict_006', 'link': 'https://siw..."
6,albug,a'lbug,albug,NOUN,flood,baha,A large overflow of water submerging land,Ing albug malalam ya,"{'source_id': 'dict_002', 'link': 'https://dok..."
7,albugan,albug,,NOUN,west,kanluran,The direction in which the sun goes down in th...,Ding anggang tau king babo ning mabilug a yatu...,"{'source_id': 'dict_003', 'link': 'https://kap..."
8,aldo,aldo,aldu,NOUN,day,araw,A unit of time equal to 24 hours.,Malino ya ing aldo,"{'source_id': 'dict_011', 'link': 'https://www..."
9,alili,alili,,NOUN,snail,suso,A mollusk with a coiled shell,Maragul la reng alili king sapa.,"{'source_id': 'dict_006', 'link': 'https://siw..."
