In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Ekstraksi Metadata

In [None]:
from pathlib import Path
import re
import pandas as pd

# Directory configuration: folder holding the input .txt files and the CSV file to write
input_dir = Path('/content/drive/MyDrive/Tugasbesar/data_clean/raw')
output_path = Path('/content/drive/MyDrive/Tugasbesar/data/processed/metadata_only.csv')
# Create the output folder (including missing parents); no error if it already exists
output_path.parent.mkdir(parents=True, exist_ok=True)

# Metadata extraction helpers
def _article_sort_key(article):
    """Order article labels by their numeric part, then by any letter suffix."""
    digits = re.match(r'\d+', article).group()
    return int(digits), article[len(digits):].lower(), article


def get_document_metadata(content):
    """Extract basic metadata from the raw text of one document.

    Args:
        content: Full document text.

    Returns:
        A 5-tuple of strings ``(case_number, document_date, case_type,
        unique_articles, parties)``. Any field that cannot be found is ``''``.
    """
    # Case number: the text that follows a standalone "nomor" / "no" / "No."
    # up to the next newline, semicolon or comma. The leading \b and the
    # negative lookahead keep "no" from matching inside ordinary words such as
    # "menolak" or "notaris", which would capture unrelated text instead.
    number_match = re.search(
        r'\b(?:nomor|no)(?![a-z])\.?[\s:]*([^\n;,]*)', content, re.I
    )
    case_number = number_match.group(1).strip() if number_match else ''

    # Date: first "<1-2 digits> <word> <4 digits>" sequence in the text
    date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', content)
    document_date = date_match.group(1) if date_match else ''

    # Case type: "pidana" takes precedence over "perdata" when both occur
    lowered = content.lower()
    if 'pidana' in lowered:
        case_type = 'Pidana'
    elif 'perdata' in lowered:
        case_type = 'Perdata'
    else:
        case_type = ''

    # Referenced articles, de-duplicated and ordered numerically so that
    # "2" comes before "112" (plain string sorting would reverse them)
    articles = re.findall(r'pasal[\s:]+(\d+[a-z]*)', content, re.I)
    unique_articles = ', '.join(sorted(set(articles), key=_article_sort_key))

    # Parties: "antara <A> melawan <B>" on a single line, ended by '.', ',' or newline
    parties = ''
    party_match = re.search(r'antara\s+(.*?)\s+melawan\s+(.*?)[\.,\n]', content, re.I)
    if party_match:
        parties = f"{party_match.group(1).strip()} vs {party_match.group(2).strip()}"

    return case_number, document_date, case_type, unique_articles, parties

# Metadata extraction run: one record per .txt file, numbered from 1 in sorted filename order
case_records = []

for idx, file in enumerate(sorted(input_dir.glob('*.txt')), 1):
    document_text = file.read_text(encoding='utf-8')

    metadata = get_document_metadata(document_text)

    case_records.append({
        'case_id': idx,
        'case_number': metadata[0],
        'date': metadata[1],
        'case_type': metadata[2],
        'articles': metadata[3],
        'parties': metadata[4],
        'full_text': document_text
    })

# Save the extraction result. The column list is given explicitly so the CSV
# still gets its header row when no input files were found; otherwise the next
# stage cannot find the 'full_text' column it reads.
metadata_df = pd.DataFrame(
    case_records,
    columns=[
        'case_id', 'case_number', 'date', 'case_type',
        'articles', 'parties', 'full_text',
    ],
)
metadata_df.to_csv(output_path, index=False)
print(f'✔ Metadata berhasil disimpan di: {output_path}')

✔ Metadata berhasil disimpan di: /content/drive/MyDrive/Tugasbesar/data/processed/metadata_only.csv


# Ekstraksi Konten Kunci

In [None]:
import pandas as pd
import re
from pathlib import Path

# File configuration: the metadata CSV is the input, the cases CSV is the output
input_file = Path('/content/drive/MyDrive/Tugasbesar/data/processed/metadata_only.csv')
output_file = Path('/content/drive/MyDrive/Tugasbesar/data/processed/cases.csv')
output_file.parent.mkdir(parents=True, exist_ok=True)

# Load the dataset
cases_data = pd.read_csv(input_file)

# Case-facts extraction
def get_case_facts(content):
    """Return the first sentence-like fragment that states the case facts.

    The text after "menimbang bahwa" is preferred; if that is not found, the
    text after the first plain "bahwa" is used. A fragment is 50-1000
    characters long and ends at the first following period (not included).

    Args:
        content: Full document text.

    Returns:
        The extracted fragment with surrounding whitespace removed and line
        breaks replaced by single spaces, or ``''`` when nothing matches.
    """
    patterns = (
        r'menimbang\s+bahwa\s+(.{50,1000}?)\.',
        r'bahwa\s+(.{50,1000}?)\.',
    )
    for pattern in patterns:
        # re.S lets '.' cross line breaks; without it a sentence that is
        # wrapped over several lines can never satisfy the 50-character minimum.
        match = re.search(pattern, content, re.I | re.S)
        if match:
            # Join wrapped lines so the value stays on one line in the CSV
            return re.sub(r'\s*\n\s*', ' ', match.group(1)).strip()
    return ''

# Legal-basis extraction
def get_legal_arguments(content):
    """Return the first fragment describing the decision or its legal basis.

    Patterns are tried in this order and the first hit wins:
      1. text after "memutuskan" (50-1000 characters),
      2. text after "dasarkan pasal" (20-500 characters),
      3. text after "menyatakan" (50-1000 characters).
    Each fragment ends at the first following period (not included).

    Args:
        content: Full document text.

    Returns:
        The extracted fragment with surrounding whitespace removed and line
        breaks replaced by single spaces, or ``''`` when nothing matches.
    """
    patterns = (
        r'memutuskan\s+(.{50,1000}?)\.',
        r'dasarkan\s+pasal\s+(.{20,500}?)\.',
        r'menyatakan\s+(.{50,1000}?)\.',
    )
    for pattern in patterns:
        # re.S lets '.' cross line breaks; without it a fragment that is
        # wrapped over several lines can never reach the minimum length.
        match = re.search(pattern, content, re.I | re.S)
        if match:
            # Join wrapped lines so the value stays on one line in the CSV
            return re.sub(r'\s*\n\s*', ' ', match.group(1)).strip()
    return ''

# Add the new columns: each one is derived from the full text (missing text counts as '')
for column_name, extractor in (
    ('case_summary', get_case_facts),
    ('legal_basis', get_legal_arguments),
):
    cases_data[column_name] = cases_data['full_text'].fillna('').apply(extractor)

# Save the result
cases_data.to_csv(output_file, index=False)
print(f'✔ Ekstraksi konten penting selesai. File tersimpan di:\n{output_file}')


✔ Ekstraksi konten penting selesai. File tersimpan di:
/content/drive/MyDrive/Tugasbesar/data/processed/cases.csv


# Feature Engineering

In [None]:
import pandas as pd
import re
from pathlib import Path

# Path configuration: the cases CSV is the input, the feature CSV is the output
source_file = Path('/content/drive/MyDrive/Tugasbesar/data/processed/cases.csv')
result_file = Path('/content/drive/MyDrive/Tugasbesar/data/processed/cases_features.csv')
result_file.parent.mkdir(parents=True, exist_ok=True)

# Load the data
legal_cases = pd.read_csv(source_file)

# Feature 1: document size, counted as whitespace-separated tokens (missing text counts as 0)
legal_cases['word_count'] = legal_cases['full_text'].fillna('').apply(lambda x: len(x.split()))

# Feature 2: important keywords
important_terms = [
    'wanprestasi',
    'gugatan',
    'penggugat',
    'tergugat',
    'putusan',
    'perjanjian',
    'pidana',
    'narkotika',
    'cerai',
]


def find_important_terms(text):
    """Return the important terms present in ``text`` as one '|'-joined string.

    The text is lowercased and each term is checked as a plain substring.
    Hits keep the order of ``important_terms``; no hit gives ``''``.
    """
    lowered = text.lower()
    return '|'.join(filter(lowered.__contains__, important_terms))

# Run the keyword finder on every document; missing text is treated as ''
legal_cases['key_terms'] = legal_cases['full_text'].fillna('').apply(find_important_terms)

# Feature 3: question/answer pattern words
def count_qa_patterns(text):
    """Count whole-word hits of the question and connective words in ``text``.

    The text is lowercased first; matches are non-overlapping, so
    "oleh karena" counts once.
    """
    lowered = text.lower()
    keyword_regex = r'\b(apa|siapa|mengapa|bagaimana|karena|maka|oleh karena)\b'
    return sum(1 for _ in re.finditer(keyword_regex, lowered))

# Run the pattern counter on every document; missing text is treated as ''
legal_cases['qa_patterns'] = legal_cases['full_text'].fillna('').apply(count_qa_patterns)

# Save the result
legal_cases.to_csv(result_file, index=False)
print(f'✔ Proses pembuatan fitur selesai.\nFile hasil disimpan di: {result_file}')

✔ Proses pembuatan fitur selesai.
File hasil disimpan di: /content/drive/MyDrive/Tugasbesar/data/processed/cases_features.csv
