In [None]:
#### ================== POSTPONE ================== ####


import re
import pdfplumber
import docx2txt

In [2]:
def read_pdf(file) -> str:
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        file: Anything accepted by ``pdfplumber.open`` (the value is passed
            through unchanged).

    Returns:
        The page texts concatenated in page order, each followed by ``"\\n"``.
        A page for which ``extract_text()`` yields ``None`` contributes only
        its newline instead of raising ``TypeError``.
    """
    with pdfplumber.open(file) as pdf:
        # `or ""` guards against extract_text() returning None, which would
        # otherwise break the string concatenation.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

def read_docx(file) -> str:
    """Return whatever ``docx2txt.process`` extracts from ``file``."""
    extracted = docx2txt.process(file)
    return extracted

def extract_info(text: str) -> dict:
    """Pull a few headline fields out of raw CV text with simple heuristics.

    Args:
        text: Plain text of the CV.

    Returns:
        A dict with the keys ``name``, ``email``, ``phone``, ``education`` and
        ``experience``. Fields that are not detected stay as empty strings.
        ``name`` is the first three whitespace-separated words of the first
        non-blank line; ``education`` / ``experience`` hold a fixed marker
        string when a keyword is present in the text.
    """
    info = {
        "name": "",
        "email": "",
        "phone": "",
        "education": "",
        "experience": "",
    }

    email_match = re.search(r"[\w\.-]+@[\w\.-]+", text)
    phone_match = re.search(r"(\+62|08)[0-9\s\-]{8,}", text)

    if email_match:
        info["email"] = email_match.group(0)

    if phone_match:
        # The pattern accepts whitespace, so the match can swallow the line
        # break that follows the number; trim it off.
        info["phone"] = phone_match.group(0).strip()

    for line in text.strip().split("\n"):
        words = line.split()
        if words:
            # Slice the word list, not the joined string: slicing the string
            # would keep only the first three characters.
            info["name"] = " ".join(words[:3])
            break

    if "Universitas" in text or "SMA" in text:
        info["education"] = "Terdeksi pendidikan"

    if "PT" in text or "freelance" in text.lower():
        info["experience"] = "Terdeksi pengalaman kerja"

    return info

In [86]:
import ipywidgets as widget
from IPython.display import display, HTML
import pdfplumber
import io

# Single-file PDF picker; a later cell reads the chosen file from upload.value.
upload = widget.FileUpload(accept='.pdf', multiple=False)
display(upload)

FileUpload(value=(), accept='.pdf', description='Upload')

In [83]:
def read_pdf_from_upload(uploaded_file):
    """Extract the text of an uploaded PDF, one newline after each page.

    Args:
        uploaded_file: Mapping with a ``'content'`` entry holding the raw PDF
            bytes (the entry is wrapped in ``io.BytesIO`` before parsing).

    Returns:
        The page texts concatenated in page order, each followed by ``"\\n"``.
        A page for which ``extract_text()`` yields ``None`` contributes only
        its newline instead of raising ``TypeError``.
    """
    file_content = uploaded_file['content']
    with pdfplumber.open(io.BytesIO(file_content)) as pdf:
        # `or ""` guards against extract_text() returning None.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

def full_text(text):
    """Render ``text`` in a scrollable, monospace HTML box in the notebook.

    The text is HTML-escaped first, so characters such as ``<`` and ``&``
    coming from the document are shown literally rather than being
    interpreted as markup.

    Args:
        text: The text to show; it is converted with ``str()`` before escaping.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    safe_text = escape(str(text))
    display(HTML(f"""
        <div style='white-space: pre-wrap; font-family: monospace; 
                    max-height: 600px; overflow-y: auto; 
                    border: 1px solid #ddd; padding: 10px;'>
            {safe_text}
        </div>
    """))

# Nothing to parse until a file has been chosen in the upload widget.
if not upload.value:
    print("Silakan upload file PDF terlebih dahulu.")
else:
    uploaded_file = upload.value[0]
    pdf_text = read_pdf_from_upload(uploaded_file)
    full_text(pdf_text)

In [84]:
import csv
import re
from pprint import pprint

def loadCityNamesFromCSV(filepath):
    """Read city names from the second column of a UTF-8 CSV file.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A list of the stripped second-column values, in file order. Rows with
        fewer than two columns are skipped, and so are rows whose second
        column is blank: an empty name would later match every line, because
        the empty string is a substring of any string.
    """
    cities = []
    with open(filepath, newline='', encoding='utf-8') as csvFile:
        for row in csv.reader(csvFile):
            if len(row) < 2:
                continue
            city = row[1].strip()
            if city:
                cities.append(city)
    return cities

def extract_address_from_lines(lines, cities):
    """Find the first plausible Indonesian street address in CV lines.

    A line is a candidate when it contains a street keyword, or a number/block
    keyword together with a known city. The candidate may be extended with the
    previous line (if it holds a number/block keyword) and the next line (if it
    names a city). The joined text is normalised and accepted only when it
    contains both a street keyword and a city; otherwise scanning continues
    with the following lines.

    Args:
        lines: The CV text split into lines.
        cities: City names to look for (matched case-insensitively as
            substrings). Blank names are ignored.

    Returns:
        The normalised address string, or ``""`` when no candidate passes the
        final street-and-city validation.
    """
    # Leading \b stops keywords from matching inside longer words
    # (e.g. "Gang" inside "Magang", "RT" inside "Jakarta").
    street_pattern = re.compile(r'\b(?:Jl\.?|Jalan|Perum|Perumahan|Komplek|Komp\.?|Gang|GG\.?)\s+[A-Za-z0-9\s.]+', re.IGNORECASE)
    number_pattern = re.compile(r'\b(?:No\.?|Blok|RT\s?\d*|RW\s?\d*|Kav\.?)\s?[A-Za-z0-9/]+', re.IGNORECASE)

    # Lower-case once up front; drop blank names because "" is a substring of
    # every line and would make every line look like it names a city.
    city_names = [c.strip().lower() for c in cities if c and c.strip()]

    def mentions_city(fragment):
        lowered = fragment.lower()
        return any(name in lowered for name in city_names)

    def squash(fragment):
        return re.sub(r'\s+', ' ', fragment.strip())

    for i, line in enumerate(lines):
        line_clean = squash(line)

        has_street = street_pattern.search(line_clean)
        has_number = number_pattern.search(line_clean)
        if not (has_street or (has_number and mentions_city(line_clean))):
            continue

        # Up to three lines: previous (number/block), current, next (city).
        snippet = []
        if i > 0 and number_pattern.search(lines[i-1]):
            snippet.append(lines[i-1])
        snippet.append(line)
        if i < len(lines)-1 and mentions_city(lines[i+1]):
            snippet.append(lines[i+1])

        # Keep only the lines that really carry an address component.
        filtered = []
        for raw in snippet:
            cleaned = squash(raw)
            if (street_pattern.search(cleaned) or
                    number_pattern.search(cleaned) or
                    mentions_city(cleaned)):
                filtered.append(cleaned)
        if not filtered:
            continue

        combined = ' '.join(filtered).strip()

        # Normalise the spelling of common address keywords.
        combined = re.sub(r'(?i)\bJl\b\.?', 'Jl.', combined)
        combined = re.sub(r'(?i)\bJalan\b', 'Jalan', combined)
        combined = re.sub(r'(?i)\bNo\b\.?', 'No.', combined)
        combined = re.sub(r'(?i)\bBlok\b', 'Blok', combined)

        # Strip characters that do not belong in an address, then tidy up
        # repeated whitespace and doubled commas.
        combined = re.sub(r'[^\w\s.,/-]', '', combined)
        combined = re.sub(r'\s+', ' ', combined)
        combined = re.sub(r',\s*,', ',', combined)

        # Validate inside the loop so a rejected candidate does not end the
        # search: a later line may still hold the real address.
        if street_pattern.search(combined) and mentions_city(combined):
            return combined

    return ""

def extractCVData(pdf_text, city_csv_path="city.csv"):
    """Parse raw CV text into a dict of contact details and sections.

    Args:
        pdf_text: Plain text of the CV; ``None`` is treated as empty text.
        city_csv_path: CSV file handed to ``loadCityNamesFromCSV`` for the
            address lookup. Defaults to ``"city.csv"``.

    Returns:
        A dict with the keys ``name`` (first non-blank line), ``position``
        (second non-blank line), ``contact`` (optional ``address``, ``phone``,
        ``email``), ``summary``, ``experience``, ``education`` and ``skills``.
        ``name`` / ``position`` are ``""`` when the text has too few lines.
    """
    lines = [line.strip() for line in (pdf_text or "").split('\n') if line.strip()]

    result = {
        # Guarded so empty or one-line input does not raise IndexError.
        "name": lines[0] if lines else "",
        "position": lines[1] if len(lines) > 1 else "",
        "contact": {},
        "summary": "",
        "experience": [],
        "education": [],
        "skills": [],
    }

    # Load the known city names used by the address heuristic.
    cities = loadCityNamesFromCSV(city_csv_path)

    address = extract_address_from_lines(lines, cities)
    if address:
        result['contact']['address'] = address

    # Contact patterns, compiled once for the whole scan.
    phone_regex = re.compile(r'08\d{2}[\s-]?\d{4}[\s-]?\d{4}')
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+')

    section = None
    buffer = []

    for line in lines:
        # Contact details can appear on any line; the last match wins.
        phone_match = phone_regex.search(line)
        if phone_match:
            # Remove every separator the pattern accepts (whitespace and
            # hyphens), not just spaces, so the stored number is digits only.
            result['contact']['phone'] = re.sub(r'[\s-]', '', phone_match.group())
        email_match = email_regex.search(line)
        if email_match:
            result['contact']['email'] = email_match.group()

        # Section headings switch the current section and are not stored.
        if 'Pengalaman Kerja' in line:
            section = 'experience'
            continue
        elif 'Pendidikan' in line:
            section = 'education'
            continue
        elif 'Skill' in line:
            section = 'skills'
            continue

        # File the line under the current section.
        if section == 'experience':
            # A line with a four-digit number starts a new entry; other lines
            # are appended to the most recent entry.
            if re.search(r'\d{4}', line):
                buffer.append(line)
            elif buffer:
                buffer[-1] += ' ' + line
        elif section == 'education':
            result['education'].append(line)
        elif section == 'skills':
            result['skills'].append(line)
        elif section is None and not result['summary']:
            # NOTE(review): because of the `not result['summary']` guard only
            # the first pre-section line (the same line used as the name) is
            # ever stored here, with a leading space. Left unchanged; confirm
            # the intended summary rule.
            result['summary'] += ' ' + line

    result['experience'] = buffer
    return result

In [85]:
# Parse the text extracted from the uploaded PDF (pdf_text is set by the
# upload cell) and pretty-print the resulting dict.
cv_data = extractCVData(pdf_text)
pprint(cv_data)

{'contact': {'email': 'windasinaga89@gmail.com', 'phone': '081287263554'},
 'education': ['Mar 2022 - Jun 2023',
               'Customer Support',
               '2010-2013 Membalas chat via WhatsApp,chat e-commerce &',
               'Bina Sarana Informatika Menjawab pertanyaan saat offline di '
               'event',
               'Menjelaskan dan menawarkan program/produk',
               'Fakultas Ilmu Bahasa',
               'perusahaan pada customer',
               'Inggris',
               'Mengirim undangan secara broadcast dengan',
               'tamplate yg sudah disediakan'],
 'experience': [],
 'name': 'Asri Winda Lestari S.',
 'position': 'Customer service/Admin Gudang',
 'skills': ['Warehouse Admin',
            'Ms Word',
            'Bertanggung jawab atas keluar dan masuk barang',
            'Excel poroduksi',
            'Melakukan stokopname berkala dan membuat laporan',
            'Power Point',
            'stokopname',
            'Canva Melaporkan ketersed