# Tagging DKA and KA for PAWS

In [1]:
import calamancy
import pandas as pd

In [2]:
nlp = calamancy.load("tl_calamancy_md-0.2.0")
doc = nlp("Ako si Juan de la Cruz")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
tagger = calamancy.Tagger("tl_calamancy_md-0.2.0")



In [4]:
list(tagger("Pagdating ko sa Pilipinas ay pupunta akong Baguio."))

[('Pagdating', ('NOUN', 'NOUN')),
 ('ko', ('PRON', 'PRON__Case=Gen|Number=Sing|Person=1|PronType=Prs')),
 ('sa', ('ADP', 'ADP__Case=Loc')),
 ('Pilipinas', ('PROPN', 'PROPN')),
 ('ay', ('PART', 'PART')),
 ('pupunta', ('VERB', 'VERB__Aspect=Imp|Mood=Pot|Voice=Act')),
 ('akong', ('PRON', 'PRON_PART__Case=Nom|Number=Sing|Person=1|PronType=Prs')),
 ('Baguio', ('PROPN', 'PROPN')),
 ('.', ('PUNCT', 'PUNCT'))]

In [5]:
list(tagger("Madalas ay pumupunta siya rito."))

[('Madalas', ('VERB', 'VERB')),
 ('ay', ('PART', 'PART')),
 ('pumupunta', ('VERB', 'VERB__Aspect=Imp|Mood=Ind|Voice=Act')),
 ('siya', ('PRON', 'PRON__Case=Nom|Number=Sing|Person=3|PronType=Prs')),
 ('rito', ('PRON', 'PRON__Case=Loc|Deixis=Prox|PronType=Dem')),
 ('.', ('PUNCT', 'PUNCT'))]

In [6]:
list(tagger("Lahat ng bata ay magagaling."))

[('Lahat', ('ADJ', 'ADJ')),
 ('ng', ('ADP', 'ADP__Case=Gen')),
 ('bata', ('NOUN', 'NOUN')),
 ('ay', ('PART', 'PART')),
 ('magagaling', ('VERB', 'VERB')),
 ('.', ('PUNCT', 'PUNCT'))]

## Google Translations

In [7]:
df_paws = pd.read_csv("../datasets/translated/google/google_translated_paws.csv")
df_paws.head()

Unnamed: 0,id,sentence1,sentence2,label
0,29568,Ang bunganga ng Batten Kill ay nasa East Dorse...,Ang bunganga ng Batten Kill ay nasa East Dorse...,1
1,45829,Ang La tempestad ( International translation :...,Ang La Tempestad (Internasyonal na Salin: The ...,1
2,46990,"Mula noong 2006, nang mapasama si Josephine Al...",Si Cerljen din ang unang delegado mula sa Swed...,1
3,13893,Pinakasalan niya si Lady Florence Jane Taylor ...,Pinakasalan niya si Lady Florence Jane Taylour...,1
4,41986,Ang Elati ay isang nayon sa Kozani Regional Un...,Ang Elati ay isang nayon sa rehiyonal na yunit...,1


In [8]:
def get_sentence_form(text: str, tagger=tagger) -> int:
    
    data = list(tagger(text))
    first_index = next((i for i, (word, (pos, _)) in enumerate(data) if word == "ay" and pos == "PART"), None)

    # Logic to determine KA vs DKA
    if first_index is None:
        
        # SCENARIO A: No "ay" found
        print("Structure: Karaniwang Ayos (KA)")
        print("Reason: No inversion marker 'ay' detected.")
        return 1
        
    else:
        # SCENARIO B: "ay" found
        # We must ensure 'ay' isn't the very first word (which would be an interjection like "Ay! nauntog ako")
        if first_index > 0:
            print("Structure: Di-Karaniwang Ayos (DKA)")
            return 0
            
        else:
            print("Structure: Ambiguous (Likely KA with Interjection)")
            print("Reason: Found 'ay' but it was at the start of the sentence.")
            return 2

In [9]:
df_paws["sentence_1_form"] = df_paws["sentence1"].apply(get_sentence_form)

Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure:

In [10]:
df_paws["sentence_2_form"] = df_paws["sentence2"].apply(get_sentence_form)

Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Di-Karaniwang Ayos (DKA)
Structure: Karaniwang Ayos (KA)
Reason: No inversion marker 'ay' detected.
Structure: Di-Karaniwang Ayos (DKA)
Structu

In [11]:
df_paws.to_csv(r"../datasets/labeled/google/google_labeled_paws.csv", index=False)