In [1]:
import pandas as pd

In [2]:
# Load the JSON file into a DataFrame (presumably scraped job postings, judging by the
# file name and the job_* columns used further down — confirm against the scraper).
data = pd.read_json('scrapped_dealls.json')
# Copy the DataFrame index into a regular column so every row carries an explicit id.
data['jobs_id'] = data.index

In [5]:
# Export only the description and qualification columns to CSV. index=False is not
# passed, so pandas also writes the DataFrame index as the first column of the file.
# NOTE(review): in file order this cell sits before the cleaning cell, so it appears to
# save the columns as loaded — confirm against the intended execution order.
data[['job_desc_list', 'job_qualification_list']].to_csv('job_desc_qualification.csv')

In [3]:
def _strip_leading_non_letters(text):
    """Remove WORD JOINER characters and everything before the first letter of ``text``."""
    # U+2060 (WORD JOINER) is invisible and is removed everywhere, not only at the start.
    text = text.replace('\u2060', '')
    # Locate the first alphabetic character in a single pass; when there is none the
    # index equals len(text) and the slice below yields an empty string.
    first_letter = next(
        (position for position, char in enumerate(text) if char.isalpha()),
        len(text),
    )
    return text[first_letter:]


def clean_job_text(text_list):
    """
    Clean job descriptions / qualifications so that every text starts with a letter.

    For each string, all U+2060 WORD JOINER characters are removed and then every
    leading character that is not a letter (per ``str.isalpha``) is dropped. This
    covers numbered bullets ("1. ", "10. ", "1、"), leading colons and periods,
    bullet symbols and emoji, and leading whitespace.

    Args:
        text_list: A list of items, a single string, or any other value
            (for example NaN/None for a missing cell).

    Returns:
        - list input: a new list of the same length and order; string items are
          cleaned, non-string items are passed through unchanged. A string with no
          letters at all becomes ``""`` and is kept in the list.
        - str input: the cleaned string.
        - anything else: the input value itself, unchanged.

    NOTE(review): because everything before the first letter is dropped, a text such
    as "3+ years of experience" becomes "years of experience". That is the contract
    stated for this function, but confirm it is acceptable for the qualification data.
    """
    # A bare string is cleaned directly, as the documented contract promises.
    if isinstance(text_list, str):
        return _strip_leading_non_letters(text_list)

    # Missing values (NaN/None) and any other non-list input are returned as is.
    if not isinstance(text_list, list):
        return text_list

    return [
        _strip_leading_non_letters(item) if isinstance(item, str) else item
        for item in text_list
    ]


In [5]:
# Clean both free-text list columns in place, one column at a time.
for text_column in ('job_desc_list', 'job_qualification_list'):
    data[text_column] = data[text_column].apply(clean_job_text)

In [6]:
def count_chars(text_list):
    """
    Count the characters in a list of strings or in a single string.

    Args:
        text_list: A list of items, a single string, or any other value.

    Returns:
        - list input: the length of its string items joined with single spaces, so
          each separator between two strings counts as one character. Non-string
          items (for example None or NaN) are skipped rather than raising
          ``TypeError`` in ``str.join``.
        - str input: ``len`` of the string.
        - anything else: 0.
    """
    if isinstance(text_list, str):
        return len(text_list)
    if isinstance(text_list, list):
        # Keep only real strings: the list may legitimately carry non-string items.
        string_items = [item for item in text_list if isinstance(item, str)]
        return len(' '.join(string_items))
    return 0

# Create the new columns: one character-count column per text list column.
for source_column, length_column in (
    ('job_desc_list', 'job_desc_list_len'),
    ('job_qualification_list', 'job_qualification_list_len'),
):
    data[length_column] = data[source_column].apply(count_chars)

In [11]:
# Export the id column, both text columns and their character-count columns to CSV.
# index=False is not passed, so pandas also writes the DataFrame index as the first
# column, alongside the explicit jobs_id column.
data[['jobs_id', 'job_desc_list_len', 'job_desc_list', 'job_qualification_list_len', 'job_qualification_list']].to_csv('clean_data.csv')