In [1]:
import os
import re

In [2]:
def list_files_in_directory(directory):
    """Return the names of the regular files directly inside ``directory``.

    Sub-directories and other non-file entries are left out. Names are
    returned without the directory prefix, in the order ``os.listdir``
    yields them.
    """
    return [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]

def extract_number(filename):
    """Return the first integer found in ``filename``, or ``None`` if there is none.

    A ``-`` directly in front of the digits is treated as a minus sign.
    """
    found = re.search(r'-?\d+', filename)
    if found is None:
        return None
    return int(found.group())

def find_reg_outlines(files, directory):
    """Locate the outline files of a regulation among ``files``.

    Files are ranked by the first integer in their name (see
    ``extract_number``); files without any number are ignored.

    Args:
        files: file names (not paths) found in ``directory``.
        directory: directory the names are joined onto.

    Returns:
        dict with the keys ``'opening'``, ``'opening_1'``, ``'opening_2'``
        (the three lowest-numbered files, in ascending order),
        ``'closing'`` (the highest-numbered file) and ``'body_outline'``
        (always ``'_.txt'`` inside ``directory``), each mapped to a path.

    Raises:
        ValueError: if fewer than three file names contain a number.
    """
    # Run the regex once per file instead of twice.
    numbered_files = []
    for file in files:
        number = extract_number(file)
        if number is not None:
            numbered_files.append((number, file))

    if len(numbered_files) < 3:
        raise ValueError(
            f"expected at least 3 numbered files in {directory!r}, "
            f"found {len(numbered_files)}"
        )

    def by_number(pair):
        return pair[0]

    # A stable sort keeps listing order among equal numbers, which matches
    # picking min() and removing it three times in a row.
    ordered = sorted(numbered_files, key=by_number)
    opening_file, opening_file_1, opening_file_2 = (name for _, name in ordered[:3])
    # max() returns the first of several equal maxima, so it is kept as-is
    # rather than taking ordered[-1].
    closing_file = max(numbered_files, key=by_number)[1]

    return {
        'opening': os.path.join(directory, opening_file),
        'opening_1': os.path.join(directory, opening_file_1),
        'opening_2': os.path.join(directory, opening_file_2),
        'closing': os.path.join(directory, closing_file),
        'body_outline': os.path.join(directory, '_.txt'),
    }

## Text

In [24]:
# Example mapping: dataset path (relative to new_split_txt/)  -->  regulatory ID
# ln/2019/pp4-2019bt      -->  PP_2019_4
# bn/2019/bn1047-2019     -->  Permen_Agama_2019_18
# perda/2019/perwal72019  -->  Perwali_Cirebon_2019_7

In [3]:
# Directory with the text files of one regulation; `d` is reused by later cells.
d = './dataset-final/new_split_txt/ln/2019/pp4-2019bt'
# Names of the regular files in that directory.
l = list_files_in_directory(d)
# Paths of the opening/closing/body-outline files, keyed by role.
dct = find_reg_outlines(l, d)

# Read the body outline file ('_.txt') in one go; `lines` is a single string here.
with open(dct['body_outline'], "r", encoding="utf8") as file:
    lines = file.read()
    
print(lines)

BAB I
KETENTUAN UMUM 

Pasal 1
BAB II
KEDUDUKAN, TUGAS, DAN FUNGSI 

Pasal 2
Pasal 3
BAB III
ORGANISASI DAN KEANGGOTAAN 

Bagian Kesatu
Keanggotaan 

Pasal 4
Pasal 5
Pasal 6
Pasal 7
Bagian Kedua
Sekretariat 

Pasal 8
BAB IV
PENGANGKATAN DAN PEMBERHENTIAN ANGGOTA BPKN 

Pasal 9
Pasal 10
Pasal 11
Pasal 12
Pasal 13
Pasal 14
BAB V
PENDANAAN 

Pasal 15
Pasal 16
BAB VI
KETENTUAN PENUTUP 

Pasal 17
Pasal 18



## Turtle

In [8]:
import time

from rdflib import Graph

def extract_turtle(regulatory_id, directory):
    """Print a filtered sub-graph of one regulation from its Turtle file.

    The Turtle path is derived from ``directory`` by replacing
    ``'new_split_txt'`` with ``'new_5_turtle_files'`` and appending ``'.ttl'``.
    The file is parsed with rdflib and a SPARQL CONSTRUCT query is run that:

    * starts at ``lexid:{regulatory_id}`` and follows, zero or more times,
      any of the predicates listed in the property path to reach ``?z``;
    * keeps every triple ``?z ?p ?o`` whose predicate is not in the
      ``NOT IN`` list of the first FILTER;
    * optionally adds ``?o rdf:type ?o2`` triples whose class is not in the
      ``NOT IN`` list of the second FILTER.

    Each resulting triple is printed as ``subject predicate object .`` using
    the graph's namespace prefixes.

    Args:
        regulatory_id: local name of the regulation in the ``lexid:``
            namespace, e.g. ``'PP_2019_4'``.
        directory: path of the regulation's text directory (contains
            ``'new_split_txt'``).

    Returns:
        None. Output goes to stdout; any exception is caught and printed.
    """
    
    # Map the text directory to its Turtle file: swap the dataset folder name, add the extension.
    turtle_file_path = directory.replace('new_split_txt', 'new_5_turtle_files')
    turtle_file_path = turtle_file_path + '.ttl'
    
    try:
        g = Graph()
        g.parse(turtle_file_path, format='ttl')

        # Doubled braces are literal braces in the f-string; only {regulatory_id} is interpolated.
        # NOTE(review): regulatory_id is inserted into the query unescaped — confirm callers pass trusted ids.
        # NOTE(review): lexid-s:hasCreator appears twice in the first FILTER list.
        query = f"""
        prefix xsd: <http://www.w3.org/2001/XMLSchema#>
        prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix dbo: <http://dbpedia.org/ontology/>
        prefix dct: <http://purl.org/dc/terms/>
        prefix owl: <http://www.w3.org/2002/07/owl#>
        prefix wd: <https://www.wikidata.org/wiki/>
        prefix lexid-s: <https://w3id.org/lex-id/schema/>
        prefix lexid: <https://w3id.org/lex-id/data/>

        construct {{
            ?z ?p ?o .
            ?o ?p1 ?o2 .
        }}
        where {{
            {{
                select distinct ?z ?p ?o
                where {{
                    lexid:{regulatory_id} (
                        dct:description 
                        | lexid-s:name  
                        | rdfs:label 
                        | rdf:type
                        | lexid-s:hasContent 
                        | lexid-s:isContentOf 
                        | lexid-s:hasPart 
                        | lexid-s:isPartOf
                    )* ?z .
                    ?z ?p ?o .
                    FILTER (?p NOT IN (rdf:type
                                        , owl:sameAs
                                        , lexid-s:adds
                                        , lexid-s:hasAdditionContent
                                        , lexid-s:hasAdditionTarget
                                        , lexid-s:deletes
                                        , lexid-s:modifies
                                        , lexid-s:hasModificationContent
                                        , lexid-s:hasModificationTarget
                                        , lexid-s:hasAct
                                        , lexid-s:hasActType
                                        , lexid-s:hasElement
                                        , lexid-s:hasCondition
                                        , lexid-s:hasModality
                                        , lexid-s:hasObject
                                        , lexid-s:hasQualifier
                                        , lexid-s:hasQualifierType
                                        , lexid-s:hasQualifierValue
                                        , lexid-s:hasRule
                                        , lexid-s:hasSubject
                                        , lexid-s:isRuleOf
                                        , lexid-s:refersTo
                                        , lexid-s:amendedBy
                                        , lexid-s:amends
                                        , lexid-s:implementedBy
                                        , lexid-s:implements
                                        , lexid-s:repealedBy
                                        , lexid-s:repeals
                                        , lexid-s:undefined
                                        , lexid-s:hasEnactionDate 
                                        , lexid-s:hasEnactionLocation 
                                        , lexid-s:hasEnactionOffice 
                                        , lexid-s:hasEnactionOfficial 
                                        , lexid-s:hasPromulgationDate 
                                        , lexid-s:hasPromulgationLocation 
                                        , lexid-s:hasPromulgationOffice 
                                        , lexid-s:hasPromulgationOfficial 
                                        , lexid-s:hasPromulgationPlace 
                                        , lexid-s:hasCreator 
                                        , lexid-s:hasDictum 
                                        , lexid-s:hasCreator 
                                        , lexid-s:regulationNumber 
                                        , lexid-s:regulationYear
                                        , lexid-s:considers 
                                        , lexid-s:hasLegalBasis 
                                        , lexid-s:LegalBasisOf 
                                        ))
                }}
            }}
            OPTIONAL {{
                ?o ?p1 ?o2 .
                FILTER (?p1 IN (rdf:type) && ?o2 NOT IN (owl:Thing
                                                        , lexid-s:LawAmandment
                                                        , lexid-s:Act
                                                        , lexid-s:AgencyRegulation
                                                        , lexid-s:AmendmentToTheConstitution
                                                        , lexid-s:CityOrdinance
                                                        , lexid-s:Constitution
                                                        , lexid-s:GovernmentRegulation
                                                        , lexid-s:GovernmentRegulationInLieuOfAct
                                                        , lexid-s:GovernorRegulation
                                                        , lexid-s:JointRegulation
                                                        , lexid-s:MayorRegulation
                                                        , lexid-s:MinisterialDecree
                                                        , lexid-s:MinisterialRegulation
                                                        , lexid-s:PeoplesConsultativeAssemblyResolution
                                                        , lexid-s:PresidentialDecree
                                                        , lexid-s:PresidentialRegulation
                                                        , lexid-s:ProvincialOrdinance
                                                        , lexid-s:RegencyOrdinance
                                                        , lexid-s:RegentRegulation
                                                        , lexid-s:LegalDocument
                                                        , lexid-s:LegalDocumentContent
                                                        , lexid-s:RuleExpression))
            }}
        }}
        """

        results = g.query(query)

        # Each result is indexed as (subject, predicate, object) and rendered in N3 with the graph's prefixes.
        for result in results:
            subject = result[0].n3(g.namespace_manager)
            predicate = result[1].n3(g.namespace_manager)
            obj = result[2].n3(g.namespace_manager)
            res = F"{subject} {predicate} {obj} .\n"
            print(res)
        
    # NOTE(review): every exception raised above is only printed, so callers cannot tell a failure from an empty result.
    except Exception as e:
        print(e)

In [9]:
# Print the filtered triples of regulation PP_2019_4; `d` is the text directory set in an earlier cell.
extract_turtle('PP_2019_4', d)

lexid:PP_2019_4_Section_14_3 lexid-s:name "(3)"^^xsd:string .

lexid:PP_2019_4_Section_9_1 lexid-s:isPartOf lexid:PP_2019_4_Article_9 .

lexid:PP_2019_4_Section_3_2_Letter_B rdf:type lexid-s:Item .

lexid:PP_2019_4_Section_8_3 dct:description """Fungsi, tugas, dan tata kerja sekretariat sebagaimana 
dimaksud pada ayat (1) diatur dalam keputusan Ketua 
BPKN."""^^xsd:string .

lexid:PP_2019_4_Chapter_II lexid-s:hasPart lexid:PP_2019_4_Article_3 .

lexid:PP_2019_4_Section_8_3 lexid-s:name "(3)"^^xsd:string .

lexid:PP_2019_4_Article_6_Letter_D dct:description "tidak pernah dihukum karena kejahatan."^^xsd:string .

lexid:PP_2019_4_Article_6 lexid-s:isPartOf lexid:PP_2019_4_Part_III_KESATU .

lexid:PP_2019_4_Section_4_1_Letter_A lexid-s:isPartOf lexid:PP_2019_4_Section_4_1 .

lexid:PP_2019_4_Article_1_Number_0 dct:description "Dalam Peraturan Pemerintah ini yang dimaksud dengan V1, V2, V3, V4, V5, dan V6"^^xsd:string .

lexid:PP_2019_4_Article_6_Letter_F rdf:type lexid-s:Item .

lexid:PP_20

## Intro

In [10]:
# Shell escape: show the GPUs on this machine and their current memory use and load.
!nvidia-smi

Thu May 23 07:52:04 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:07:00.0 Off |                    0 |
| N/A   44C    P0            185W /  400W |   17379MiB /  40960MiB |     64%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          On  |   00

In [9]:
import os
# Expose only the GPU with index 3 to CUDA in this process.
# NOTE(review): presumably this must run before torch initialises CUDA to take effect — confirm cell order.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [4]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it so it stays out of the saved file.
# NOTE(review): the token that used to be hard-coded here had write permission
# and should be revoked.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed torch's random number generator with a fixed value.
torch.random.manual_seed(0)

# Hugging Face model id used for both the model and the tokenizer below.
MODEL_NAME = "google/codegemma-7b-it"

# Load the causal language model onto the GPU.
# NOTE(review): `.half()` casts to float16 after `torch_dtype="auto"` has already chosen a dtype — confirm the extra cast is intended.
# NOTE(review): `trust_remote_code=True` lets code from the model repository run locally — confirm this checkpoint needs it.
model_f7 = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    device_map="cuda", 
    torch_dtype="auto",
    trust_remote_code=True, 
).half()

tokenizer_f7 = AutoTokenizer.from_pretrained(MODEL_NAME)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [25]:
# Text-generation pipeline (PyTorch backend) around the model and tokenizer loaded above.
pipe_f7 = pipeline(
    "text-generation",
    model=model_f7,
    tokenizer=tokenizer_f7,
    framework="pt",
)

# Keyword arguments for generation calls: up to 5000 new tokens, return only the
# generated continuation, sampling disabled.
# NOTE(review): with "do_sample": False the "temperature" entry is presumably ignored — confirm and drop it if so.
generation_args = {
    "max_new_tokens": 5000,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
#     "repetition_penalty": 1.7,
#     "bos_token_id": 11,
#     "eos_token_id": 11,
#     "pad_token_id": 11,
#     "use_cache": False,
}

## Body

# GPTQ

In [7]:
# Read the Turtle file line by line. The encoding is explicit so the result
# does not depend on the platform default, matching the UTF-8 read of the body
# outline earlier in the notebook.
with open('./dataset-final/new_2_turtle_files/ln/2019/pp4-2019bt.ttl', 'r', encoding='utf8') as f:
    lines = f.readlines()

# Drop the first 10 lines.
# NOTE(review): presumably the @prefix header of the Turtle file — confirm the count.
lines = lines[10:]

In [8]:
import torch

from transformers import AutoTokenizer, TextGenerationPipeline, AutoModelForCausalLM, pipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

CUDA extension not installed.
CUDA extension not installed.
