In [1]:
import re
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import Levenshtein
from pyjarowinkler import distance
from collections import defaultdict, Counter


# Dataset Import

In [2]:
# Identifier-like columns are read as strings rather than parsed as numbers
dtype_params = dict.fromkeys(
    ['NO_AGGR', 'NO_NPWP', 'NO_KTP_KITAS', 'NO_KTP_COBORR'],
    str,
)
# Columns handed to pandas for date parsing while reading
parse_dates_params = ['DT_GOLIVE_VALID', 'TGL_LAHIR', 'TGL_LAHIR_COBORR']

# The three CSV sheet exports that together make up the dataset
file_paths = [
    r'D:\Kuliah\SEMESTER 8\Skripsi\Dataset\Sheet1_SID_DATA_COBORR_PASSANGER_1124.csv',
    r'D:\Kuliah\SEMESTER 8\Skripsi\Dataset\Sheet2_SID_DATA_COBORR_PASSANGER_1124.csv',
    r'D:\Kuliah\SEMESTER 8\Skripsi\Dataset\Sheet3_SID_DATA_COBORR_PASSANGER_1124.csv'
]

# Read every sheet with the same options, then stack them under a fresh 0..n-1 index
frames = [
    pd.read_csv(path, dtype=dtype_params, parse_dates=parse_dates_params)
    for path in file_paths
]
df = pd.concat(frames, ignore_index=True)
del frames  # the per-sheet frames are not needed once concatenated

  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],
  [pd.read_csv(file, dtype=dtype_params, parse_dates=parse_dates_params) for file in file_paths],


In [3]:
# Keep only the rows whose flag_PC equals 'P' and renumber them from 0
grouped_df = df.loc[df['flag_PC'].eq('P')].reset_index(drop=True)

# Work on a separate copy so the filtered frame itself stays untouched
dataset = grouped_df.copy()

# Preprocessing & Cleansing

In [4]:
# DATA CLEANSING
def clean_and_validation(df):
    """Add cleansed name, birthplace, ID-number and address columns to ``df``.

    Reads the columns NAMA_IBU_KANDUNG, TEMPAT_LAHIR, TGL_LAHIR, NAME_GOLIVE,
    NO_KTP_KITAS, NO_NPWP and ALMT_RUMAH.

    Writes the columns cleaned_NAMA_IBU_KANDUNG, cleaned_TEMPAT_LAHIR,
    cleaned_name, cleaned_no_ktp, cleaned_no_npwp, cleaned_alamat and
    deep_clean_address, and overwrites TGL_LAHIR with its datetime-coerced
    version.

    The frame is modified in place; the same object is returned.
    """
    # --- Mother's maiden name: upper-case, missing values become '' ---
    df['cleaned_NAMA_IBU_KANDUNG'] = df['NAMA_IBU_KANDUNG'].apply(lambda x: str(x).upper() if pd.notna(x) else '')

    # --- Place of birth: upper-case, missing values become '' ---
    df['cleaned_TEMPAT_LAHIR'] = df['TEMPAT_LAHIR'].apply(lambda x: str(x).upper() if pd.notna(x) else '')
    # Blank out the placeholder values 'CONVERTED', 'OTHERS', 'OTHER' (exact match only)
    df['cleaned_TEMPAT_LAHIR'] = df['cleaned_TEMPAT_LAHIR'].apply(lambda x: '' if x in ['CONVERTED', 'OTHERS', 'OTHER'] else x)
    # Remove every space character
    df['cleaned_TEMPAT_LAHIR'] = df['cleaned_TEMPAT_LAHIR'].str.replace(" ", "", regex=False)
    # Blank out values that are a single letter or one character repeated (e.g. 'XXXX')
    df['cleaned_TEMPAT_LAHIR'] = df['cleaned_TEMPAT_LAHIR'].apply(lambda x: '' if re.match(r'^[A-Za-z]$', x) or re.match(r'^(.)\1*$', x) else x)

    # Unparseable birth dates become NaT instead of raising
    df['TGL_LAHIR'] = pd.to_datetime(df['TGL_LAHIR'], errors='coerce')

    # 1. Remove prefixes
    def cleanse_name(name):
        """Strip leading honorific/degree prefixes and underscores; non-strings give ''."""
        if not isinstance(name, str):
            return ''

        # List of known prefixes (expand as needed)
        prefixes = ["IR.", "IR", "DR.", "DR", "PROF.", "PROF", "DRS.", "DRS", "DRA.", "DRA", "DRG", "DRG.", "HJ", "HJ.", "ALM", "ALM.", "ALMH", "ALMH."]

        # One or more prefixes at the start of the string, followed by at least one
        # whitespace / non-word / underscore character (a prefix that ends the
        # string is therefore NOT matched).
        # NOTE(review): the alternation is not wrapped in its own group, so the
        # leading \b binds only to the first alternative and the trailing \.? only
        # to the last one. The pattern is also case-sensitive.
        prefix_pattern = r'^((?:\b' + '|'.join(map(re.escape, prefixes)) + r'\.?)\s*)+[\s\W_]+'

        # Drop the matched prefix run together with its trailing separator(s)
        name = re.sub(prefix_pattern, '', name).strip()

        # Remove underscores as symbols
        name = name.replace('_', '')

        return name

    # Initialize cleaned_name column
    # NOTE(review): NAME_GOLIVE is used as-is (not upper-cased) while the prefix
    # pattern is case-sensitive; the mother's name was upper-cased above.
    df['cleaned_name'] = df['NAME_GOLIVE'].apply(cleanse_name)
    df['cleaned_NAMA_IBU_KANDUNG'] = df['cleaned_NAMA_IBU_KANDUNG'].apply(cleanse_name)

    # 2. Clean Title
    def create_pattern(title_list):
        """Compile one case-insensitive regex matching any title in ``title_list``.

        Each title is a tuple of parts; the parts may be separated by optional
        whitespace and the whole title must sit between word boundaries.
        """
        patterns = []
        for title in title_list:
            pattern = r'\b' + r'\s*'.join([re.escape(part) for part in title]) + r'\b'
            patterns.append(pattern)
        return re.compile('|'.join(patterns), re.IGNORECASE)

    # Title lists, grouped by number of parts
    title_4 = [
        ("S", "I", "K", "K"), ("S", "K", "P", "M")
    ]
    title_3 = [
        ("S", "I", "A"), ("S", "TR", "AK"), ("S", "E", "AS"), ("S", "A", "B"),
        ("S", "PD", "B"), ("S", "TR", "BNS"), ("S", "BIS", "DIG"), ("S", "K", "G"),
        ("S", "FIL", "H"), ("S", "H", "H"), ("S", "K", "H"), ("S", "PD", "H"),
        ("S", "SOS", "H"), ("S", "TR", "HAN"), ("S", "E", "I"), ("S", "FIL", "I"),
        ("S", "H", "I"), ("S", "KOM", "I"), ("S", "PD", "I"), ("S", "SOS", "I"),
        ("S", "HUB", "INT"), ("S", "TR", "IP"), ("S", "I", "K"), ("S", "TR", "K"),
        ("S", "TR", "KEB"), ("S", "I", "KOM"), ("S", "TR", "KOM"), ("S", "K", "L"),
        ("S", "K", "M"), ("S", "A", "N"), ("S", "A", "P"), ("S", "I", "P"),
        ("S", "T", "P"), ("S", "ST", "PI"), ("S", "TR", "PI"), ("S", "I", "PTK"),
        ("S", "PD", "SD"), ("S", "PD", "SI"), ("S", "TR", "SOS"), ("S", "E", "SY"),
        ("S", "TR", "T"), ("S", "SI", "TH"), ("M", "B", "A")
    ]
    # NOTE(review): ("S", "E") appears twice in this list; the duplicate is harmless.
    title_2 = [
        ("S", "ADM"), ("S", "AG"), ("S", "AK"), ("S", "ANT"), ("S", "ARS"),
        ("S", "DES"), ("S", "DS"), ("S", "E"), ("S", "FARM"), ("S", "FIL"),
        ("S", "FT"), ("S", "GZ"), ("S", "H"), ("S", "HAN"), ("S", "HUM"),
        ("S", "HUT"), ("S", "IIP"), ("S", "IK"), ("S", "IN"), ("S", "IP"),
        ("S", "KEB"), ("S", "KED"), ("S", "KEL"), ("S", "KEP"), ("S", "KG"),
        ("S", "KOM"), ("S", "LI"), ("S", "M"), ("S", "MB"), ("S", "P"), ("I", "R"),
        ("S", "PAR"), ("S", "PD"), ("S", "PI"), ("S", "PN"), ("S", "PSI"),
        ("S", "PT"), ("S", "PTK"), ("S", "PWK"), ("S", "S"), ("S", "SI"),
        ("S", "SN"), ("S", "SOS"), ("S", "ST"), ("S", "STAT"), ("S", "STP"),
        ("S", "SY"), ("S", "T"), ("S", "TH"), ("S", "TI"), ("M", "T"), ("S", "E"), ("PH", "D"), ("S", "AKTR"),  ("M", "E")
    ]

    # Compile all patterns
    pattern_4char = create_pattern(title_4)
    pattern_3char = create_pattern(title_3)
    pattern_2char = create_pattern(title_2)

    def clean_name(name):
        """Remove punctuation, digits and academic titles from ``name``; non-strings give ''."""
        if not isinstance(name, str):  # Ensure input is a string
            return ''

        # Remove everything that is neither a word character nor whitespace,
        # so a dotted title such as "S.E" collapses to "SE" before title matching
        name = re.sub(r'[^\w\s]', '', name)


        # Remove numeric characters
        name = ''.join([char for char in name if not char.isdigit()])

        # Remove titles longest-first, repeating until a full pass changes nothing
        while True:
            old_name = name
            name = pattern_4char.sub('', name)
            name = pattern_3char.sub('', name)
            name = pattern_2char.sub('', name)
            if name == old_name:  # Stop if no further changes
                break

        return name.strip()

    # Apply name cleaning
    df['cleaned_name'] = df['cleaned_name'].apply(clean_name)
    df['cleaned_NAMA_IBU_KANDUNG'] = df['cleaned_NAMA_IBU_KANDUNG'].apply(clean_name)

    # Drop a leading single-character token, but only when at least one more token
    # follows (the length check also keeps empty strings from raising IndexError)
    df['cleaned_name'] = df['cleaned_name'].apply(lambda x: ' '.join(x.split()[1:]) if len(x.split()) > 1 and len(x.split()[0]) == 1 else x)
    df['cleaned_NAMA_IBU_KANDUNG'] = df['cleaned_NAMA_IBU_KANDUNG'].apply(lambda x: ' '.join(x.split()[1:]) if len(x.split()) > 1 and len(x.split()[0]) == 1 else x)

    # 3. Clean NO_KTP_KITAS column: cast to str and keep digits only
    df['cleaned_no_ktp'] = df['NO_KTP_KITAS'].astype(str).str.replace(r'\D', '', regex=True)

    # Function to clean the KTP values
    def clean_no_ktp(ktp):
        """Return None for a one-character or single-repeated-character value, else ``ktp``.

        An empty string passes through unchanged (its character set has size 0).
        """
        # If length is 1 or the value consists of repeating characters (e.g., "1111111111111111")
        if len(ktp) == 1 or len(set(ktp)) == 1:
            return None  # Return None for invalid cases
        return ktp

    # Apply the cleaning function to the digits-only KTP column
    df['cleaned_no_ktp'] = df['cleaned_no_ktp'].apply(clean_no_ktp)

    # 4. Clean NO_NPWP column: keep only the digit characters of str(value)
    df['cleaned_no_npwp'] = df['NO_NPWP'].apply(lambda x: ''.join(filter(str.isdigit, str(x))))


    # 5. Clean Address
    # (pattern, replacement) pairs mapping spelled-out / dotted street-type words
    # to one canonical abbreviation followed by a space
    substitutions = [
        (r'\b(JLN\s|JLN\.|JALAN\s|JALAN\.|JL\.|JLH\s|JLH\.)\s?', 'JL '),
        (r'\b(GANG\s|GANG\.|GG\.)\s?', 'GG '),
        (r'\b(PONDOK\s|PONDOK\.|PD\.)\s?', 'PD '),
        (r'\b(PERUMAHAN\s|PERUMAHAN\.|PERUM\s|PERUM\.|PRM\.|PERUMNAS\s|PERUMNAS\.)\s?', 'PRM '),
        (r'\b(DESA\s|DESA\.|DUSUN\.|DUSUN\s|DS\.|DSN\.|DSN\s)\s?', 'DS '),
        (r'\b(KP\.|KPG\.|KPG\s|KAMPUNG\s|KAMPUNG\.|KAMP\s|KAMP\.|KMP\s|KMP\.)\s?', 'KP '),
        (r'\b(APARTEMEN\.|APARTEMEN\s|APART\.|APART\s|APARTMENT\.|APARTMENT\s|APARTEMENT\.|APARTEMENT\s|AP\s|AP\.|APT\.)\s?', 'APT '),
        (r'\b(KOMPLEKS\s|KOMPLEKS\.|KOMPLEK\s|KOMPLEK\.|KOMPL\s|KOMPL\.|KOMP\s|KOMP\.|KOM\.)\s?', 'KOM '),
        (r'\b(LINGKUNGAN\s|LINGKUNGAN\.|LINGK\s|LINGK\.|LKG\.)\s?', 'LKG '),
        (r'\b(TAMAN\s|TAMAN\.|TMN\s|TMN\.|TM\.)\s?', 'TM '),
        (r'\b(BLOK\.|BLOK\s|BLK\.|BLK\s|BL\.)\s?', 'BL '),
        (r'\b(VILLA\s|VILLA\.|VILA\s|VILA\.|VIL\.)\s?', 'VIL '),
        (r'\b(GRIYA\s|GRIYA\.|GRY\.)\s?', 'GRY '),
        (r'\b(ASRAMA\s|ASRAMA\.|ASR\.)\s?', 'ASR '),
        (r'\b(TANJUNG\s|TANJUNG\.|TJ\.)\s?', 'TJ ')
    ]

    # Long street-type words that may be glued to the following word
    long_abbreviations = ["APARTEMEN", "APARTEMENT", "APARTMENT", "TANJUNG", "LINGKUNGAN", "ASRAMA",
                        "KOMPLEKS", "PERUMAHAN", "KAMPUNG", "PERUMNAS", "PONDOK"]

    # Function to add space between compound words
    def add_space_between_compound_words(address):
        """Insert a space between a known long word and an upper-case letter glued to it.

        The match is case-sensitive (no regex flags are passed).
        """
        for abbr in long_abbreviations:
            address = re.sub(rf'({abbr})([A-Z])', r'\1 \2', address)
        return address

    # Function to apply all substitutions globally
    def update_address(address):
        """Split glued compound words, then canonicalize street-type words; non-strings give ''."""
        if not isinstance(address, str):
            return ''  # Handle non-string inputs gracefully

        # Correct compound abbreviations
        address = add_space_between_compound_words(address)

        # Apply all substitution patterns, case-insensitively
        for pattern, replacement in substitutions:
            address = re.sub(pattern, replacement, address, flags=re.IGNORECASE)

        return address

    df['cleaned_alamat'] = df['ALMT_RUMAH'].apply(update_address)

    # Canonical abbreviations produced above; stripped again for the "deep" variant
    abbreviations_to_remove = [
        "JL", "GG", "PD", "PRM", "DS", "KP", "APT", "KOM", "LKG", "TM", "BL", "VIL", "GRY", "ASR", "TJ"
    ]

    # Regex pattern for matching Roman numerals
    # NOTE(review): every component is optional, so the pattern can match the empty
    # string, and it matches any whole word spelled only with valid Roman-numeral
    # letters (upper-case only, since no IGNORECASE flag is used when it is applied).
    roman_numeral_pattern = r'\b(M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}))\b'

    # Regex pattern to remove "NO" or "NOMOR" followed by optional whitespace and digits
    # NOTE(review): it is applied after all digits were already removed, so the \d*
    # part never has anything to match; the pattern is also case-sensitive.
    no_nomor_pattern = r'\b(NO|NOMOR)\s?\d*\b'

    # Function to remove abbreviations, numbers, Roman numerals, and "NO" or "NOMOR" from address
    def deep_clean_address(address):
        """Reduce an address to its plain words; non-strings give ''.

        Order matters: abbreviations, digits, Roman numerals, NO/NOMOR, remaining
        symbols, then whitespace normalization.
        """
        if not isinstance(address, str):
            return ''  # Handle non-string inputs gracefully

        # Remove abbreviations (whole words, case-insensitive)
        for abbr in abbreviations_to_remove:
            address = re.sub(rf'\b{abbr}\b\s?', '', address, flags=re.IGNORECASE)

        # Remove all numbers
        address = re.sub(r'\d+', '', address)

        # Remove Roman numerals using regex
        address = re.sub(roman_numeral_pattern, '', address)

        # Remove "NO" or "NOMOR"
        address = re.sub(no_nomor_pattern, '', address)

        # Drop anything that is not an ASCII letter, digit or whitespace
        address = re.sub(r'[^a-zA-Z0-9\s]', '', address)

        # Collapse whitespace runs to one space and trim the ends
        address = re.sub(r'\s+', ' ', address).strip()

        return address
    df['deep_clean_address'] = df['cleaned_alamat'].apply(deep_clean_address)

    # Final fallback: if cleaning left nothing, fall back to the raw source value
    df['cleaned_name'] = df.apply(lambda row: row['cleaned_name'] if row['cleaned_name'] != '' else row['NAME_GOLIVE'], axis=1)
    df['cleaned_NAMA_IBU_KANDUNG'] = df.apply(lambda row: row['cleaned_NAMA_IBU_KANDUNG'] if row['cleaned_NAMA_IBU_KANDUNG'] != '' else row['NAMA_IBU_KANDUNG'], axis=1)

    # Blank out values that are exactly one of these generic words
    df['cleaned_NAMA_IBU_KANDUNG'] = df['cleaned_NAMA_IBU_KANDUNG'].apply(lambda x: '' if x in ['BUNDA', 'UMI', 'IBU', 'BU', 'NY', 'NYONYA'] else x)

    # Blank out (not drop) mother's names that are a single letter or one repeated character
    df['cleaned_NAMA_IBU_KANDUNG'] = df['cleaned_NAMA_IBU_KANDUNG'].apply(
    lambda x: '' if isinstance(x, str) and (re.match(r'^[A-Za-z]$', x) or re.match(r'^(.)\1*$', x)) else x)

    def remove_extra_spaces(text):
        """Collapse whitespace runs to single spaces and trim; non-strings give ''."""
        # Check if the input is a string or not
        if not isinstance(text, str):
            return ''  # Return an empty string for non-strings (e.g., NaN, float)
        return re.sub(r'\s+', ' ', text).strip()

    # Also turns any non-string left by the fallback above (e.g. NaN) into ''
    df['cleaned_name'] = df['cleaned_name'].apply(remove_extra_spaces)
    df['cleaned_NAMA_IBU_KANDUNG'] = df['cleaned_NAMA_IBU_KANDUNG'].apply(remove_extra_spaces)
    df['cleaned_alamat'] = df['cleaned_alamat'].apply(remove_extra_spaces)

    return df

# Apply the clean_and_validation function
# Run the cleansing pipeline on the working copy
dataset = clean_and_validation(dataset)
# Snapshot taken while blank values are still '' (before the NaN conversion below)
backup_dataset = dataset.copy(deep=True)
# Turn every empty string into NaN, in place
dataset.replace(to_replace='', value=np.nan, inplace=True)

## Sampling

In [5]:
# Pre-drawn samples stored as Excel workbooks: the 100-row one first, then the 100,000-row one
sampled_data_100 = pd.read_excel(io=r'D:\Kuliah\SEMESTER 8\Skripsi\Dataset\sampled_data_100.xlsx')
sampled_data_100000 = pd.read_excel(io=r'D:\Kuliah\SEMESTER 8\Skripsi\Dataset\sampled_data_100000.xlsx')

# BIGRAM Blocking

In [6]:
# Function to generate bigrams of 2 characters from a name
def get_bigrams(name):
    """Return the set of 2-character substrings of ``name`` with spaces removed.

    Parameters
    ----------
    name : str or any
        The value to split. Anything that is not a ``str`` (for example the NaN
        that replaces empty names in the dataset) yields an empty set instead
        of raising ``AttributeError``.

    Returns
    -------
    set of str
        Every pair of adjacent characters; empty when fewer than two characters
        remain after removing spaces.
    """
    if not isinstance(name, str):
        return set()
    compact = name.replace(" ", "")  # spaces never take part in a bigram
    return {first + second for first, second in zip(compact, compact[1:])}

# Start measuring time
# Start measuring time
start_time = time.time()

# Step 1: bigrams for every distinct cleaned name
unique_cleaned_names = dataset["cleaned_name"].unique()

bigrams_cache = {}
for name in tqdm(unique_cleaned_names, desc="Caching bigrams"):
    bigrams_cache[name] = get_bigrams(name)

# Step 2: inverted index (bigram -> names containing it) and per-bigram name counts
bigram_groups = defaultdict(set)
bigram_counts = Counter()

for name, bigrams in tqdm(bigrams_cache.items(), desc="Processing bigrams"):
    bigram_counts.update(bigrams)  # each bigram of this name counts once
    for bigram in bigrams:
        bigram_groups[bigram].add(name)

# Step 3: a bigram occurring in more than 25% of the distinct names is treated as
# too common; the cut-off is only computed here and applied when candidates are looked up
total_names = len(unique_cleaned_names)
bigram_frequency_threshold = 0.25 * total_names

# End measuring time
end_time = time.time()

# Print the elapsed time
print(f"Time taken: {end_time - start_time:.2f} seconds")

Caching bigrams: 100%|██████████| 975949/975949 [00:17<00:00, 55851.30it/s]
Processing bigrams: 100%|██████████| 975949/975949 [00:18<00:00, 52328.59it/s]

Time taken: 37.03 seconds





## NAME ONLY

### Jaccard

In [7]:
def compare_bigrams_and_jaccard(name1, name2):
    """Jaccard similarity between the sets of distinct characters of two names.

    Despite its name, this function does not use bigrams: spaces are removed
    and each name is reduced to its set of unique characters, so character
    order and repetition do not affect the score.

    Parameters
    ----------
    name1, name2 : str
        The names to compare.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in the range [0, 1]; ``0.0`` when both names
        contain no characters other than spaces.
    """
    # Sorting the characters first (as was done before) cannot change a set,
    # so the sets are built directly.
    chars1 = set(name1.replace(" ", ""))
    chars2 = set(name2.replace(" ", ""))

    union_size = len(chars1 | chars2)
    if union_size == 0:
        return 0.0
    return len(chars1 & chars2) / union_size

# Function to calculate Jaccard similarities for names based on selected bigrams
def calculate_jaccard_distances(name, similarity_list):
    """Score ``name`` against the names that share one of its rarest bigrams.

    Candidates are taken from the ``bigram_groups`` index using the (up to)
    three least frequent bigrams of ``name`` whose count does not exceed
    ``bigram_frequency_threshold``. Each candidate is scored with
    ``compare_bigrams_and_jaccard``; only scores above 0.75 are kept.

    Parameters
    ----------
    name : str
        Query name; must already be a key of ``bigrams_cache``.
    similarity_list : mapping of name -> list
        Every kept score is appended to ``similarity_list[name]``.

    Returns
    -------
    tuple
        ``(name, distances)`` where ``distances`` maps candidate name -> score.
    """
    # Indexed bigrams of the query that are not too common, paired with their count
    eligible = [
        (bg, bigram_counts[bg])
        for bg in bigrams_cache[name]
        if bg in bigram_groups and bigram_counts[bg] <= bigram_frequency_threshold
    ]
    # Rarest first; the sort is stable, so ties keep their original order
    eligible.sort(key=lambda pair: pair[1])
    rarest = [bg for bg, _count in eligible[:3]]

    # Union of the names indexed under those bigrams, without the query itself
    candidates = set()
    for bg in rarest:
        candidates.update(bigram_groups[bg])
    candidates.discard(name)

    distances = {}
    for candidate in candidates:
        score = compare_bigrams_and_jaccard(name, candidate)
        if score > 0.75:
            distances[candidate] = score
            similarity_list[name].append(score)
    return name, distances

def add_new_name_to_results_dict(new_name, similarity_list):
    """Build the block of names similar to ``new_name``.

    The bigrams of ``new_name`` are stored in ``bigrams_cache`` (so that the
    name can be looked up even if it was not part of the indexed dataset),
    then ``calculate_jaccard_distances`` finds the similar names.

    Parameters
    ----------
    new_name : str
        The query name.
    similarity_list : mapping of name -> list
        Passed through to ``calculate_jaccard_distances``, which appends the
        kept similarity scores to it.

    Returns
    -------
    list
        The similar names followed by ``new_name`` itself as the last element.
    """
    bigrams_cache[new_name] = get_bigrams(new_name)

    _, distances = calculate_jaccard_distances(new_name, similarity_list)

    # The keys of `distances` are the similar names; no intermediate DataFrame
    # is needed to list them.
    results_list = list(distances)
    results_list.append(new_name)
    return results_list

In [None]:
def compare_bigrams_and_jaccard(name1, name2):
    """Jaccard similarity between the sets of distinct characters of two names.

    Despite its name, this function does not use bigrams: spaces are removed
    and each name is reduced to its set of unique characters, so character
    order and repetition do not affect the score.

    Parameters
    ----------
    name1, name2 : str
        The names to compare.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in the range [0, 1]; ``0.0`` when both names
        contain no characters other than spaces.
    """
    # Sorting the characters first (as was done before) cannot change a set,
    # so the sets are built directly.
    chars1 = set(name1.replace(" ", ""))
    chars2 = set(name2.replace(" ", ""))

    union_size = len(chars1 | chars2)
    if union_size == 0:
        return 0.0
    return len(chars1 & chars2) / union_size

In [8]:
start_time = time.time()

# Block (list of similar names) per sampled name, plus the kept scores per name
final_blocks = defaultdict(list)
similarity_list = defaultdict(list)

# Only the cleaned_name column is needed, so iterate over it directly
for query_name in sampled_data_100000['cleaned_name']:
    final_blocks[query_name] = add_new_name_to_results_dict(query_name, similarity_list)

end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")

Time taken: 44075.19 seconds


## NAME & ADDRESS

In [7]:
# Function to generate bigrams
def get_bigrams(text):
    """Return the set of adjacent character pairs of ``text``, ignoring spaces.

    Non-string input (e.g. NaN) gives an empty set, as does text with fewer
    than two characters once spaces are removed.
    """
    if isinstance(text, str):
        squeezed = text.replace(" ", "")
        return {squeezed[i:i + 2] for i in range(len(squeezed) - 1)}
    return set()

# Start measuring time
# Start measuring time
start_time = time.time()

# Step 1: bigrams for every distinct cleaned name and every distinct deep-cleaned address
unique_cleaned_names = dataset["cleaned_name"].unique()
unique_addresses = dataset["deep_clean_address"].unique()

bigrams_cache_name = {}
for name in tqdm(unique_cleaned_names, desc="Caching name bigrams"):
    bigrams_cache_name[name] = get_bigrams(name)

bigrams_cache_address = {}
for address in tqdm(unique_addresses, desc="Caching address bigrams"):
    bigrams_cache_address[address] = get_bigrams(address)

# Step 2: inverted indexes (bigram -> values containing it) and per-bigram counts
bigram_groups_name = defaultdict(set)
bigram_counts_name = Counter()
bigram_groups_address = defaultdict(set)
bigram_counts_address = Counter()

# Names
for name, bigrams in bigrams_cache_name.items():
    bigram_counts_name.update(bigrams)
    for bigram in bigrams:
        bigram_groups_name[bigram].add(name)

# Addresses
for address, bigrams in bigrams_cache_address.items():
    bigram_counts_address.update(bigrams)
    for bigram in bigrams:
        bigram_groups_address[bigram].add(address)

# Step 3: frequency cut-off, set at 25% of the number of distinct names
total_names = len(unique_cleaned_names)
bigram_frequency_threshold = 0.25 * total_names

# End measuring time
end_time = time.time()

# Print the elapsed time
print(f"Time taken: {end_time - start_time:.2f} seconds")

Caching name bigrams: 100%|██████████| 975949/975949 [00:25<00:00, 38600.44it/s]
Caching address bigrams: 100%|██████████| 424771/424771 [00:12<00:00, 34927.69it/s]


Time taken: 61.31 seconds


In [8]:
# deep-cleaned address -> list of cleaned names recorded at that address
address_dict = defaultdict(list)

for name, address in zip(dataset['cleaned_name'], dataset['deep_clean_address']):
    # Only rows where both values are non-blank strings are indexed
    if (isinstance(name, str) and name.strip()
            and isinstance(address, str) and address.strip()):
        address_dict[address].append(name)

### Jaccard

In [9]:
def compare_bigrams_and_jaccard(name1, name2):
    """Jaccard similarity between the sets of distinct characters of two names.

    Despite its name, this function does not use bigrams: spaces are removed
    and each name is reduced to its set of unique characters, so character
    order and repetition do not affect the score.

    Parameters
    ----------
    name1, name2 : str
        The names to compare.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in the range [0, 1]; ``0.0`` when both names
        contain no characters other than spaces.
    """
    # Sorting the characters first (as was done before) cannot change a set,
    # so the sets are built directly.
    chars1 = set(name1.replace(" ", ""))
    chars2 = set(name2.replace(" ", ""))

    union_size = len(chars1 | chars2)
    if union_size == 0:
        return 0.0
    return len(chars1 & chars2) / union_size
    
# Function to find similar names based on address bigrams
def calculate_jaccard_distances(name, address, similarity_list):
    """Score ``name`` against names that share rare bigrams in both name and address.

    A candidate is a ``(name, address)`` pair whose address shares one of the
    three rarest bigrams of ``address`` and whose name shares one of the three
    rarest bigrams of ``name``. Candidates are scored on the name only, using
    ``compare_bigrams_and_jaccard``; scores above 0.75 are kept.

    Parameters
    ----------
    name, address
        Query values; must already be keys of ``bigrams_cache_name`` and
        ``bigrams_cache_address`` respectively.
    similarity_list : mapping of name -> list
        Every kept score is appended to ``similarity_list[name]``.

    Returns
    -------
    tuple
        ``(name, distances)`` where ``distances`` maps
        ``(other_name, other_address)`` -> score.
    """
    def rarest_three(bigrams, counts):
        # Least frequent first; bigrams missing from the counter count as 0
        return sorted(bigrams, key=lambda bg: counts[bg])[:3]

    name_keys = rarest_three(bigrams_cache_name[name], bigram_counts_name)
    address_keys = rarest_three(bigrams_cache_address[address], bigram_counts_address)

    # Every indexed name sharing a rare name bigram
    list_of_names = set()
    for bg in name_keys:
        list_of_names |= bigram_groups_name[bg]

    # Every indexed address sharing a rare address bigram
    matching_addresses = set()
    for bg in address_keys:
        matching_addresses |= bigram_groups_address[bg]

    # Keep the names recorded at those addresses that also passed the name filter
    candidate_pairs = set()
    for addr in matching_addresses:
        candidate_pairs.update(
            (resident, addr)
            for resident in address_dict[addr]
            if resident in list_of_names
        )

    # The query itself is not its own match
    candidate_pairs.discard((name, address))

    distances = {}
    for other_name, other_address in candidate_pairs:
        score = compare_bigrams_and_jaccard(name, other_name)
        if score > 0.75:
            distances[(other_name, other_address)] = score
            similarity_list[name].append(score)
    return name, distances

def add_new_name_to_results_dict(new_name, new_address, similarity_list):
    """Build the block of ``(name, address)`` pairs similar to the query pair.

    The bigrams of both query values are stored in ``bigrams_cache_name`` and
    ``bigrams_cache_address`` (so values missing from the indexed dataset can
    still be looked up), then ``calculate_jaccard_distances`` finds the
    similar pairs.

    Parameters
    ----------
    new_name, new_address
        The query name and address.
    similarity_list : mapping of name -> list
        Passed through to ``calculate_jaccard_distances``, which appends the
        kept similarity scores to it.

    Returns
    -------
    list of tuple
        The similar ``(name, address)`` pairs followed by
        ``(new_name, new_address)`` as the last element.
    """
    bigrams_cache_name[new_name] = get_bigrams(new_name)
    bigrams_cache_address[new_address] = get_bigrams(new_address)

    _, distances = calculate_jaccard_distances(new_name, new_address, similarity_list)

    # The keys of `distances` are the similar pairs; no intermediate DataFrame
    # is needed to list them.
    results_list = list(distances)
    results_list.append((new_name, new_address))
    return results_list

In [10]:
start_time = time.time()

# Block per sampled (name, address) pair, plus the kept scores per name
final_blocks = defaultdict(list)
similarity_list = defaultdict(list)

sample_pairs = zip(
    sampled_data_100000['cleaned_name'],
    sampled_data_100000['deep_clean_address'],
)
for query_name, query_address in sample_pairs:
    final_blocks[query_name, query_address] = add_new_name_to_results_dict(
        query_name, query_address, similarity_list
    )

end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")

Time taken: 17117.92 seconds


## NAME & PLACE OF BIRTH (TEMPAT LAHIR)

In [7]:
# Function to generate bigrams
def get_bigrams(text):
    """Set of 2-character windows over ``text`` after removing its spaces.

    Returns an empty set for non-string input (such as NaN) and for text that
    is shorter than two characters once spaces are gone.
    """
    if not isinstance(text, str):
        return set()
    letters = [ch for ch in text if ch != " "]
    return {left + right for left, right in zip(letters, letters[1:])}

# Start measuring time
# Start measuring time
start_time = time.time()

# Step 1: bigrams for every distinct cleaned name and every distinct place of birth (POB)
unique_cleaned_names = dataset["cleaned_name"].unique()
unique_pob = dataset["TEMPAT_LAHIR"].unique()

bigrams_cache_name = {}
for name in tqdm(unique_cleaned_names, desc="Caching name bigrams"):
    bigrams_cache_name[name] = get_bigrams(name)

bigrams_cache_pob = {}
for pob in tqdm(unique_pob, desc="Caching POB bigrams"):
    bigrams_cache_pob[pob] = get_bigrams(pob)

# Step 2: inverted indexes (bigram -> values containing it) and per-bigram counts
bigram_groups_name = defaultdict(set)
bigram_counts_name = Counter()
bigram_groups_pob = defaultdict(set)
bigram_counts_pob = Counter()

# Names
for name, bigrams in bigrams_cache_name.items():
    bigram_counts_name.update(bigrams)
    for bigram in bigrams:
        bigram_groups_name[bigram].add(name)

# Places of birth
for pob, bigrams in bigrams_cache_pob.items():
    bigram_counts_pob.update(bigrams)
    for bigram in bigrams:
        bigram_groups_pob[bigram].add(pob)

# Step 3: frequency cut-off, set at 25% of the number of distinct names
total_names = len(unique_cleaned_names)
bigram_frequency_threshold = 0.25 * total_names

# End measuring time
end_time = time.time()

# Print the elapsed time
print(f"Time taken: {end_time - start_time:.2f} seconds")

Caching name bigrams: 100%|██████████| 975949/975949 [00:17<00:00, 55943.01it/s] 
Caching POB bigrams: 100%|██████████| 63593/63593 [00:00<00:00, 97297.76it/s] 


Time taken: 34.32 seconds


In [8]:
# place of birth (raw TEMPAT_LAHIR value) -> list of cleaned names born there
pob_dict = defaultdict(list)

for name, pob in zip(dataset['cleaned_name'], dataset['TEMPAT_LAHIR']):
    # Only rows where both values are non-blank strings are indexed
    if (isinstance(name, str) and name.strip()
            and isinstance(pob, str) and pob.strip()):
        pob_dict[pob].append(name)

### Jaccard

In [13]:
def compare_bigrams_and_jaccard(name1, name2):
    """Jaccard similarity between the sets of distinct characters of two names.

    Despite its name, this function does not use bigrams: spaces are removed
    and each name is reduced to its set of unique characters, so character
    order and repetition do not affect the score.

    Parameters
    ----------
    name1, name2 : str
        The names to compare.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` in the range [0, 1]; ``0.0`` when both names
        contain no characters other than spaces.
    """
    # Sorting the characters first (as was done before) cannot change a set,
    # so the sets are built directly.
    chars1 = set(name1.replace(" ", ""))
    chars2 = set(name2.replace(" ", ""))

    union_size = len(chars1 | chars2)
    if union_size == 0:
        return 0.0
    return len(chars1 & chars2) / union_size
    
# Function to find similar names based on POB bigrams
def calculate_jaccard_distances(name, pob, similarity_list):
    """Score ``name`` against names sharing rare bigrams in both name and place of birth.

    A candidate is a ``(name, place)`` pair whose place of birth shares one of
    the three rarest bigrams of ``pob`` and whose name shares one of the three
    rarest bigrams of ``name``. Candidates are scored on the name only, using
    ``compare_bigrams_and_jaccard``; scores above 0.75 are kept.

    Parameters
    ----------
    name, pob
        Query values; must already be keys of ``bigrams_cache_name`` and
        ``bigrams_cache_pob`` respectively.
    similarity_list : mapping of name -> list
        Every kept score is appended to ``similarity_list[name]``.

    Returns
    -------
    tuple
        ``(name, distances)`` where ``distances`` maps
        ``(other_name, other_pob)`` -> score.
    """
    query_name_bigrams = bigrams_cache_name[name]
    query_pob_bigrams = bigrams_cache_pob[pob]

    # Three least frequent bigrams on each side (missing bigrams count as 0)
    rare_name_bigrams = sorted(
        query_name_bigrams, key=lambda bg: bigram_counts_name[bg]
    )[:3]
    rare_pob_bigrams = sorted(
        query_pob_bigrams, key=lambda bg: bigram_counts_pob[bg]
    )[:3]

    # Names that share a rare name bigram with the query
    list_of_names = set()
    for bg in rare_name_bigrams:
        list_of_names |= bigram_groups_name[bg]

    # Places of birth that share a rare POB bigram with the query
    matching_pobs = set()
    for bg in rare_pob_bigrams:
        matching_pobs |= bigram_groups_pob[bg]

    # Names recorded for those places that also passed the name filter
    candidate_pairs = set()
    for place in matching_pobs:
        candidate_pairs.update(
            (person, place)
            for person in pob_dict[place]
            if person in list_of_names
        )

    # The query itself is not its own match
    candidate_pairs.discard((name, pob))

    distances = {}
    for other_name, other_pob in candidate_pairs:
        score = compare_bigrams_and_jaccard(name, other_name)
        if score > 0.75:
            distances[(other_name, other_pob)] = score
            similarity_list[name].append(score)
    return name, distances

def add_new_name_to_results_dict(new_name, new_pob, similarity_list):
    """Build the block of ``(name, place of birth)`` pairs similar to the query pair.

    The bigrams of both query values are stored in ``bigrams_cache_name`` and
    ``bigrams_cache_pob`` (the caches that ``calculate_jaccard_distances``
    reads), then ``calculate_jaccard_distances`` finds the similar pairs.

    Parameters
    ----------
    new_name, new_pob
        The query name and place of birth.
    similarity_list : mapping of name -> list
        Passed through to ``calculate_jaccard_distances``, which appends the
        kept similarity scores to it.

    Returns
    -------
    list of tuple
        The similar ``(name, pob)`` pairs followed by ``(new_name, new_pob)``
        as the last element.
    """
    # Fix: these used to be written to the name-only `bigrams_cache`, which the
    # lookup below never reads -- unseen values raised KeyError and the
    # name-only cache was polluted with places of birth.
    bigrams_cache_name[new_name] = get_bigrams(new_name)
    bigrams_cache_pob[new_pob] = get_bigrams(new_pob)

    _, distances = calculate_jaccard_distances(new_name, new_pob, similarity_list)

    # The keys of `distances` are the similar pairs; no intermediate DataFrame
    # is needed to list them.
    results_list = list(distances)
    results_list.append((new_name, new_pob))
    return results_list

In [16]:
start_time = time.time()

# Block per sampled (name, place of birth) pair, plus the kept scores per name
final_blocks = defaultdict(list)
similarity_list = defaultdict(list)

sample_pairs = zip(
    sampled_data_100000['cleaned_name'],
    sampled_data_100000['TEMPAT_LAHIR'],
)
for query_name, query_pob in sample_pairs:
    final_blocks[query_name, query_pob] = add_new_name_to_results_dict(
        query_name, query_pob, similarity_list
    )

end_time = time.time()
print(f"Time taken: {end_time - start_time:.2f} seconds")


Time taken: 15698.31 seconds


# Pair Completeness 

In [10]:
# Empty frames that only fix the expected column layout
# (presumably to be replaced by the real SID reference tables -- TODO confirm)
mst_sid_0924 = pd.DataFrame(data=None, columns=['SID', 'cleaned_name'])
sid_monre_0924 = pd.DataFrame(data=None, columns=['NO_AGGR', 'cleaned_name', 'SID'])

In [None]:
# Row-index lookups for the three blocking keys, built in a single pass.
# Previously each lookup was built with its own full iterrows() pass and
# assigned to the same name, so the first two results were thrown away.
monre_dict_name = defaultdict(list)          # NAME ONLY: cleaned_name -> row indices
monre_dict_name_address = defaultdict(list)  # NAME ADDRESS: (cleaned_name, deep_clean_address) -> row indices
monre_dict_name_pob = defaultdict(list)      # NAME TEMPAT LAHIR: (cleaned_name, TEMPAT_LAHIR) -> row indices

for index, name, address, pob in zip(
    dataset.index,
    dataset['cleaned_name'],
    dataset['deep_clean_address'],
    dataset['TEMPAT_LAHIR'],
):
    monre_dict_name[name].append(index)
    monre_dict_name_address[(name, address)].append(index)
    monre_dict_name_pob[(name, pob)].append(index)

# Backward compatible: `monre_dict` still ends up as the (name, TEMPAT_LAHIR)
# lookup. Rebind it to one of the other two lookups to evaluate another key.
monre_dict = monre_dict_name_pob

## Name Only

### Jaccard

In [13]:
def calculate_jaccard_distances(name):
    """Score ``name`` against the names that share one of its rarest bigrams.

    Uses the (up to) three least frequent bigrams of ``name`` that are present
    in ``bigram_groups`` and whose count does not exceed
    ``bigram_frequency_threshold`` to collect candidates, then keeps the
    candidates whose ``compare_bigrams_and_jaccard`` score is above 0.75.

    Parameters
    ----------
    name : str
        Query name; must already be a key of ``bigrams_cache``.

    Returns
    -------
    tuple
        ``(name, distances)`` where ``distances`` maps candidate name -> score.
    """
    # (bigram, count) for every indexed, not-too-common bigram of the query
    weighted = []
    for bg in bigrams_cache[name]:
        if bg not in bigram_groups:
            continue
        if bigram_counts[bg] <= bigram_frequency_threshold:
            weighted.append((bg, bigram_counts[bg]))

    # Lower count = more distinctive; keep the three most distinctive bigrams
    by_rarity = sorted(weighted, key=lambda item: item[1])
    chosen = [item[0] for item in by_rarity[:3]]

    # All names indexed under the chosen bigrams, minus the query itself
    pool = set()
    for bg in chosen:
        pool.update(bigram_groups[bg])
    pool.discard(name)

    distances = {}
    for other in pool:
        score = compare_bigrams_and_jaccard(name, other)
        if score > 0.75:
            distances[other] = score

    return name, distances

def add_new_name_to_results_dict(new_name):
    """Return the blocking keys (names) to look up for `new_name`.

    The bigrams of `new_name` are computed with `get_bigrams` and stored in
    `bigrams_cache`, so that `calculate_jaccard_distances` can read them.

    Args:
        new_name: Name of the record being matched.

    Returns:
        list: Every similar name reported by `calculate_jaccard_distances`,
        in the order of its result dict, followed by `new_name` itself.
    """
    # The search reads the bigrams from the cache, so register them first
    bigrams_cache[new_name] = get_bigrams(new_name)

    _, distances = calculate_jaccard_distances(new_name)

    # Only the keys are needed; the scores were already thresholded by the search.
    # (Previously a pandas DataFrame was built on every call just to list them.)
    results_list = list(distances)
    results_list.append(new_name)
    return results_list

## Name & Address

### Jaccard

In [14]:
# Candidate search on name + address: a candidate must sit in a rare-bigram
# block of the query for BOTH fields. Despite the function name, the score comes
# from `compare_bigrams_and_jaccard`; no Levenshtein distance is computed here.
def calculate_levenshtein_distances(name, address):
    """Find (name, address) pairs similar to the given pair.

    Args:
        name: Query name; must be a key of `bigrams_cache_name`.
        address: Query address; must be a key of `bigrams_cache_address`.

    Returns:
        tuple: `(name, distances)` where `distances` maps
        `(other_name, other_address)` to the `compare_bigrams_and_jaccard`
        score of `name` against `other_name`, keeping only scores that are
        truthy and greater than 0.75. The query pair itself is never included.
    """
    name_bigrams = bigrams_cache_name[name]
    address_bigrams = bigrams_cache_address[address]

    # Three lowest-count bigrams of each field (lower count = more distinctive)
    rare_name_bigrams = sorted(name_bigrams, key=lambda bg: bigram_counts_name[bg])[:3]
    rare_address_bigrams = sorted(address_bigrams, key=lambda bg: bigram_counts_address[bg])[:3]

    names_in_blocks = set().union(*(bigram_groups_name[bg] for bg in rare_name_bigrams))
    addresses_in_blocks = set().union(*(bigram_groups_address[bg] for bg in rare_address_bigrams))

    # Keep a pair only when its name is in a name block and `address_dict`
    # lists that name under an address taken from an address block
    candidate_pairs = {
        (candidate, addr)
        for addr in addresses_in_blocks
        for candidate in address_dict[addr]
        if candidate in names_in_blocks
    }
    candidate_pairs.discard((name, address))

    # Only the names are scored; the address acted purely as a blocking filter
    distances = {}
    for candidate, addr in candidate_pairs:
        score = compare_bigrams_and_jaccard(name, candidate)
        if score and score > 0.75:
            distances[(candidate, addr)] = score

    return name, distances

def add_new_name_to_results_dict(new_name, new_address):
    """Return the blocking keys ((name, address) pairs) to look up for a record.

    The bigrams of both fields are computed with `get_bigrams` and stored in
    `bigrams_cache_name` / `bigrams_cache_address`, which is where
    `calculate_levenshtein_distances` reads them from.

    Args:
        new_name: Name of the record being matched.
        new_address: Address of the record being matched.

    Returns:
        list: Every `(name, address)` pair reported by
        `calculate_levenshtein_distances`, in the order of its result dict,
        followed by `(new_name, new_address)` itself.
    """
    # The search reads the bigrams from the caches, so register them first
    bigrams_cache_name[new_name] = get_bigrams(new_name)
    bigrams_cache_address[new_address] = get_bigrams(new_address)

    _, distances = calculate_levenshtein_distances(new_name, new_address)

    # Only the keys are needed; the scores were already thresholded by the search.
    # (Previously a pandas DataFrame was built on every call just to list them.)
    results_list = list(distances)
    results_list.append((new_name, new_address))
    return results_list

## Name & Tempat Lahir (Place of Birth)

### Jaccard

In [None]:
# Candidate search on name + place of birth (POB): a candidate must sit in a
# rare-bigram block of the query for BOTH fields.
def calculate_jaccard_distances(name, pob):
    """Find (name, POB) pairs similar to the given pair.

    Args:
        name: Query name; must be a key of `bigrams_cache_name`.
        pob: Query place of birth; must be a key of `bigrams_cache_pob`.

    Returns:
        tuple: `(name, distances)` where `distances` maps
        `(other_name, other_pob)` to the `compare_bigrams_and_jaccard` score of
        `name` against `other_name`, keeping only scores that are truthy and
        greater than 0.75. The query pair itself is never included.
    """
    def three_rarest(bigrams, counts):
        # Stable sort by count; the first three are the most distinctive bigrams
        return sorted(bigrams, key=lambda bg: counts[bg])[:3]

    name_bigrams = bigrams_cache_name[name]
    pob_bigrams = bigrams_cache_pob[pob]

    rare_name_bigrams = three_rarest(name_bigrams, bigram_counts_name)
    rare_pob_bigrams = three_rarest(pob_bigrams, bigram_counts_pob)

    # Blocks: every name / every POB filed under one of the selected bigrams
    names_in_blocks = set().union(*(bigram_groups_name[bg] for bg in rare_name_bigrams))
    pobs_in_blocks = set().union(*(bigram_groups_pob[bg] for bg in rare_pob_bigrams))

    # Keep a pair only when `pob_dict` lists a blocked name under a blocked POB
    candidate_pairs = {
        (candidate, place)
        for place in pobs_in_blocks
        for candidate in pob_dict[place]
        if candidate in names_in_blocks
    }
    candidate_pairs.discard((name, pob))

    # Only the names are scored; the POB acted purely as a blocking filter
    scored = (
        ((candidate, place), compare_bigrams_and_jaccard(name, candidate))
        for candidate, place in candidate_pairs
    )
    distances = {pair: score for pair, score in scored if score and score > 0.75}
    return name, distances

def add_new_name_to_results_dict(new_name, new_pob):
    """Return the blocking keys ((name, POB) pairs) to look up for a record.

    The bigrams of both fields are computed with `get_bigrams` and stored in
    `bigrams_cache_name` / `bigrams_cache_pob`. Those are the caches that
    `calculate_jaccard_distances(name, pob)` reads; the previous version wrote
    to `bigrams_cache` instead, so a name or POB that had not been cached
    beforehand could not be found by the search.

    Args:
        new_name: Name of the record being matched.
        new_pob: Place of birth of the record being matched.

    Returns:
        list: Every `(name, pob)` pair reported by
        `calculate_jaccard_distances`, in the order of its result dict,
        followed by `(new_name, new_pob)` itself.
    """
    # Register the bigrams in the caches the search actually reads from
    bigrams_cache_name[new_name] = get_bigrams(new_name)
    bigrams_cache_pob[new_pob] = get_bigrams(new_pob)

    _, distances = calculate_jaccard_distances(new_name, new_pob)

    # Only the keys are needed; the scores were already thresholded by the search
    results_list = list(distances)
    results_list.append((new_name, new_pob))
    return results_list

## Matching Rules

In [17]:
def jaro_winkler_match(value1, value2, threshold=0.92):
    """Tell whether two values are similar enough to count as a match.

    Both values are converted to `str` and stripped, then scored with
    `distance.get_jaro_distance`.

    Args:
        value1: First value to compare.
        value2: Second value to compare.
        threshold: Minimum score (inclusive) that counts as a match.

    Returns:
        bool: True when the score is at least `threshold`; False when either
        value is missing (per `pd.notna`) or empty after stripping.
    """
    if pd.isna(value1) or pd.isna(value2):
        return False

    left = str(value1).strip()
    right = str(value2).strip()

    # Blank strings are never scored
    if not left or not right:
        return False

    return distance.get_jaro_distance(left, right) >= threshold

In [None]:
# NAME ONLY
# For every sampled record without a SID: collect candidate rows from the name
# blocks, keep the candidates that satisfy one of the six matching rules, and
# give the resulting group either an existing SID or a newly issued one.
# NOTE(review): both result frames grow through pd.concat inside the loop, which
# re-copies them on every append; collecting records in lists and building the
# frames once would scale better.
start_time = time.time()
count = 0  # sampled records whose group reused an already-issued SID
no_aggr_set = set(sid_monre_0924['NO_AGGR'])  # agreements that already carry a SID

for index, row in sampled_data_100000.iterrows():
    if row['NO_AGGR'] in no_aggr_set:
        continue

    row_compared = row

    # --- Candidate generation -------------------------------------------------
    # Blocking keys: the similar names found for this record plus its own name.
    matched_dfs = []
    results_list = add_new_name_to_results_dict(row['cleaned_name'])
    for name in results_list:
        if name in monre_dict:
            indices = monre_dict[name]
            matched_dfs.append(dataset.loc[indices])

    if matched_dfs:
        matched_df = pd.concat(matched_dfs, ignore_index=True)
    else:
        # No block was hit. Keep an empty frame that still has the dataset's
        # columns: a bare pd.DataFrame() made the column lookups below raise
        # KeyError. The record then takes the "no match" path further down.
        matched_df = dataset.iloc[0:0]

    result_df_nodup = matched_df.drop_duplicates(subset='NO_AGGR').reset_index(drop=True)

    # --- Attributes of the record being matched -------------------------------
    name_compared = row['cleaned_name']
    dob_compared = row['TGL_LAHIR']
    tempat_compared = row['cleaned_TEMPAT_LAHIR']
    ktp_kitas_compared = row['cleaned_no_ktp']
    mother_name_compared = row['cleaned_NAMA_IBU_KANDUNG']
    npwp_compared = row['cleaned_no_npwp']
    address_compared = row['cleaned_alamat']

    # --- Cheap pre-filter -----------------------------------------------------
    # Keep candidates that agree exactly on at least one of: date of birth,
    # KTP number, mother's name, NPWP number.
    # NOTE(review): the equality tests would also pair two empty strings if the
    # cleansing step stores missing values as '' -- confirm for KTP / NPWP.
    filtered_result_df = result_df_nodup[
        ((pd.notna(result_df_nodup['TGL_LAHIR']) & pd.notna(dob_compared) &
        (result_df_nodup['TGL_LAHIR'] == dob_compared)) |
        (pd.notna(result_df_nodup['cleaned_no_ktp']) & pd.notna(ktp_kitas_compared) &
        (result_df_nodup['cleaned_no_ktp'] == ktp_kitas_compared)) |
        (pd.notna(result_df_nodup['cleaned_NAMA_IBU_KANDUNG']) & pd.notna(mother_name_compared) &
        (result_df_nodup['cleaned_NAMA_IBU_KANDUNG'] == mother_name_compared)) |
        (pd.notna(result_df_nodup['cleaned_no_npwp']) & pd.notna(npwp_compared) &
        (result_df_nodup['cleaned_no_npwp'] == npwp_compared))
    )]

    filtered_result_df = filtered_result_df.copy()
    filtered_result_df['flag_SID'] = 'N'   # stays 'N' unless a rule fires
    filtered_result_df['rule_num'] = None

    # --- Matching rules (first rule that fires wins) --------------------------
    # The loop variables are distinct from the outer `index` / `row` so the
    # record being matched is not clobbered.
    for cand_index, cand in filtered_result_df.iterrows():
        name_sim = jaro_winkler_match(cand['cleaned_name'], name_compared)

        if (
            name_sim and
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_TEMPAT_LAHIR']) and cand['cleaned_TEMPAT_LAHIR'] == tempat_compared and
            pd.notna(cand['cleaned_NAMA_IBU_KANDUNG']) and jaro_winkler_match(cand['cleaned_NAMA_IBU_KANDUNG'], mother_name_compared)  # Rule 1
        ):
            # Rule 1: name + date of birth + place of birth + mother's name
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 1'

        elif (
            name_sim and
            pd.notna(cand['cleaned_no_ktp']) and cand['cleaned_no_ktp'] == ktp_kitas_compared  # Rule 2
        ):
            # Rule 2: name + KTP number
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 2'

        elif (
            name_sim and
            pd.notna(cand['cleaned_no_npwp']) and cand['cleaned_no_npwp'] == npwp_compared  # Rule 3
        ):
            # Rule 3: name + NPWP number
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 3'

        elif (
            jaro_winkler_match(cand['cleaned_name'], name_compared, threshold=0.95) and
            pd.notna(cand['cleaned_alamat']) and jaro_winkler_match(cand['cleaned_alamat'], address_compared) and  # Rule 4
            pd.notna(cand['cleaned_NAMA_IBU_KANDUNG']) and jaro_winkler_match(cand['cleaned_NAMA_IBU_KANDUNG'], mother_name_compared)
        ):
            # Rule 4: stricter name (0.95) + address + mother's name
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 4'

        elif (
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_no_ktp']) and cand['cleaned_no_ktp'] == ktp_kitas_compared  # Rule 5
        ):
            # Rule 5: date of birth + KTP number (name not required)
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 5'

        elif (
            jaro_winkler_match(cand['cleaned_name'], name_compared, threshold=0.95) and
            pd.notna(cand['cleaned_alamat']) and jaro_winkler_match(cand['cleaned_alamat'], address_compared) and
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_TEMPAT_LAHIR']) and cand['cleaned_TEMPAT_LAHIR'] == tempat_compared  # Rule 6
        ):
            # Rule 6: stricter name (0.95) + address + date and place of birth
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 6'

    filtered_result_df = filtered_result_df[filtered_result_df['flag_SID'] == 'Y'].reset_index(drop=True)

    if filtered_result_df.empty:
        # Nothing matched: the record forms a group of its own
        row_compared_df = pd.DataFrame([row_compared])
        row_compared_df['flag_SID'] = 'Y'
        filtered_result_df = pd.concat([filtered_result_df, row_compared_df], ignore_index=True)

    # --- SID assignment -------------------------------------------------------
    # Reuse the SID of the first group member that already has one.
    check_sid_exist = False
    for no_aggr in filtered_result_df['NO_AGGR']:
        if no_aggr in no_aggr_set:
            check_sid_exist = True
            filtered_result_df['SID'] = sid_monre_0924.loc[sid_monre_0924['NO_AGGR'] == no_aggr, 'SID'].values[0]
            break

    if check_sid_exist:
        count += 1
    else:
        # New SID = CD_SP of the row with the earliest DT_GOLIVE_VALID,
        # followed by the next 7-digit sequence number for that prefix.
        filtered_result_df = filtered_result_df.sort_values(by='DT_GOLIVE_VALID').reset_index(drop=True)

        kode_cabang = str(filtered_result_df.loc[0, 'CD_SP'])

        # NOTE(review): the prefix lookup compares the first six characters of
        # each SID, so it assumes CD_SP is exactly six characters -- confirm.
        matching_sid_rows = mst_sid_0924[mst_sid_0924['SID'].str[:6] == kode_cabang]
        last_sequence = matching_sid_rows['SID'].str[-7:].astype(int).max() if not matching_sid_rows.empty else 0

        filtered_result_df['SID'] = f"{kode_cabang}{(last_sequence + 1):07d}"

    # Record only the agreements that did not have a SID before this iteration
    rows_to_append_filtered = filtered_result_df[~filtered_result_df['NO_AGGR'].isin(no_aggr_set)]

    if not rows_to_append_filtered.empty:
        sid_monre_0924 = pd.concat([sid_monre_0924, rows_to_append_filtered[['NO_AGGR', 'cleaned_name', 'SID', 'rule_num']]], ignore_index=True)
        no_aggr_set.update(rows_to_append_filtered['NO_AGGR'].tolist())
        if not check_sid_exist:
            # A new SID is registered once in the master table
            rows_to_append_2 = rows_to_append_filtered.iloc[[0]][['cleaned_name', 'SID']]
            mst_sid_0924 = pd.concat([mst_sid_0924, rows_to_append_2], ignore_index=True)

mst_sid_0924.reset_index(drop=True, inplace=True)

# End measuring time
end_time = time.time()

# Print the elapsed time
print(f"Time taken: {end_time - start_time:.2f} seconds")

In [18]:
# NAME & ADDRESS
# Same SID assignment as the name-only run, but candidates come from the
# (cleaned_name, deep_clean_address) blocks. Records with a blank address, and
# records for which no block is hit, are skipped and receive no SID here.
# NOTE(review): only the first 5 sampled rows are processed (`.head(5)`) --
# presumably a debugging limit; confirm before using the results.
start_time = time.time()
count = 0  # sampled records whose group reused an already-issued SID
no_aggr_set = set(sid_monre_0924['NO_AGGR'])  # agreements that already carry a SID

for index, row in sampled_data_100000.head(5).iterrows():
    if row['NO_AGGR'] in no_aggr_set:
        continue

    row_compared = row
    matched_dfs = []

    # Blocking on the address needs a usable address string
    if not (row['deep_clean_address'] and row['deep_clean_address'] != 'nan'):
        continue

    # --- Candidate generation -------------------------------------------------
    # Blocking keys: the similar (name, address) pairs plus the record's own pair.
    # The loop variable used to be called `tuple`, which rebound the builtin
    # for the rest of the notebook session.
    results_list = add_new_name_to_results_dict(row['cleaned_name'], row['deep_clean_address'])
    for block_key in results_list:
        if block_key in monre_dict:
            indices = monre_dict[block_key]
            matched_dfs.append(dataset.loc[indices])

    if not matched_dfs:
        continue

    matched_df = pd.concat(matched_dfs, ignore_index=True)
    result_df_nodup = matched_df.drop_duplicates(subset='NO_AGGR').reset_index(drop=True)

    # --- Attributes of the record being matched -------------------------------
    name_compared = row['cleaned_name']
    dob_compared = row['TGL_LAHIR']
    tempat_compared = row['cleaned_TEMPAT_LAHIR']
    ktp_kitas_compared = row['cleaned_no_ktp']
    mother_name_compared = row['cleaned_NAMA_IBU_KANDUNG']
    npwp_compared = row['cleaned_no_npwp']
    address_compared = row['cleaned_alamat']

    # --- Cheap pre-filter -----------------------------------------------------
    # Keep candidates that agree exactly on at least one of: date of birth,
    # KTP number, mother's name, NPWP number.
    # NOTE(review): the equality tests would also pair two empty strings if the
    # cleansing step stores missing values as '' -- confirm for KTP / NPWP.
    filtered_result_df = result_df_nodup[
        ((pd.notna(result_df_nodup['TGL_LAHIR']) & pd.notna(dob_compared) &
        (result_df_nodup['TGL_LAHIR'] == dob_compared)) |
        (pd.notna(result_df_nodup['cleaned_no_ktp']) & pd.notna(ktp_kitas_compared) &
        (result_df_nodup['cleaned_no_ktp'] == ktp_kitas_compared)) |
        (pd.notna(result_df_nodup['cleaned_NAMA_IBU_KANDUNG']) & pd.notna(mother_name_compared) &
        (result_df_nodup['cleaned_NAMA_IBU_KANDUNG'] == mother_name_compared)) |
        (pd.notna(result_df_nodup['cleaned_no_npwp']) & pd.notna(npwp_compared) &
        (result_df_nodup['cleaned_no_npwp'] == npwp_compared))
    )]

    filtered_result_df = filtered_result_df.copy()
    filtered_result_df['flag_SID'] = 'N'   # stays 'N' unless a rule fires
    filtered_result_df['rule_num'] = None

    # --- Matching rules (first rule that fires wins) --------------------------
    # The loop variables are distinct from the outer `index` / `row` so the
    # record being matched is not clobbered.
    for cand_index, cand in filtered_result_df.iterrows():
        name_sim = jaro_winkler_match(cand['cleaned_name'], name_compared)

        if (
            name_sim and
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_TEMPAT_LAHIR']) and cand['cleaned_TEMPAT_LAHIR'] == tempat_compared and
            pd.notna(cand['cleaned_NAMA_IBU_KANDUNG']) and jaro_winkler_match(cand['cleaned_NAMA_IBU_KANDUNG'], mother_name_compared)  # Rule 1
        ):
            # Rule 1: name + date of birth + place of birth + mother's name
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 1'

        elif (
            name_sim and
            pd.notna(cand['cleaned_no_ktp']) and cand['cleaned_no_ktp'] == ktp_kitas_compared  # Rule 2
        ):
            # Rule 2: name + KTP number
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 2'

        elif (
            name_sim and
            pd.notna(cand['cleaned_no_npwp']) and cand['cleaned_no_npwp'] == npwp_compared  # Rule 3
        ):
            # Rule 3: name + NPWP number
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 3'

        elif (
            jaro_winkler_match(cand['cleaned_name'], name_compared, threshold=0.95) and
            pd.notna(cand['cleaned_alamat']) and jaro_winkler_match(cand['cleaned_alamat'], address_compared) and  # Rule 4
            pd.notna(cand['cleaned_NAMA_IBU_KANDUNG']) and jaro_winkler_match(cand['cleaned_NAMA_IBU_KANDUNG'], mother_name_compared)
        ):
            # Rule 4: stricter name (0.95) + address + mother's name
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 4'

        elif (
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_no_ktp']) and cand['cleaned_no_ktp'] == ktp_kitas_compared  # Rule 5
        ):
            # Rule 5: date of birth + KTP number (name not required)
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 5'

        elif (
            jaro_winkler_match(cand['cleaned_name'], name_compared, threshold=0.95) and
            pd.notna(cand['cleaned_alamat']) and jaro_winkler_match(cand['cleaned_alamat'], address_compared) and
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_TEMPAT_LAHIR']) and cand['cleaned_TEMPAT_LAHIR'] == tempat_compared  # Rule 6
        ):
            # Rule 6: stricter name (0.95) + address + date and place of birth
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 6'

    filtered_result_df = filtered_result_df[filtered_result_df['flag_SID'] == 'Y'].reset_index(drop=True)

    if filtered_result_df.empty:
        # Nothing matched: the record forms a group of its own
        row_compared_df = pd.DataFrame([row_compared])
        row_compared_df['flag_SID'] = 'Y'
        filtered_result_df = pd.concat([filtered_result_df, row_compared_df], ignore_index=True)

    # --- SID assignment -------------------------------------------------------
    # Reuse the SID of the first group member that already has one.
    check_sid_exist = False
    for no_aggr in filtered_result_df['NO_AGGR']:
        if no_aggr in no_aggr_set:
            check_sid_exist = True
            filtered_result_df['SID'] = sid_monre_0924.loc[sid_monre_0924['NO_AGGR'] == no_aggr, 'SID'].values[0]
            break

    if check_sid_exist:
        count += 1
    else:
        # New SID = CD_SP of the row with the earliest DT_GOLIVE_VALID,
        # followed by the next 7-digit sequence number for that prefix.
        filtered_result_df = filtered_result_df.sort_values(by='DT_GOLIVE_VALID').reset_index(drop=True)

        kode_cabang = str(filtered_result_df.loc[0, 'CD_SP'])

        # NOTE(review): the prefix lookup compares the first six characters of
        # each SID, so it assumes CD_SP is exactly six characters -- confirm.
        matching_sid_rows = mst_sid_0924[mst_sid_0924['SID'].str[:6] == kode_cabang]
        last_sequence = matching_sid_rows['SID'].str[-7:].astype(int).max() if not matching_sid_rows.empty else 0

        filtered_result_df['SID'] = f"{kode_cabang}{(last_sequence + 1):07d}"

    # Record only the agreements that did not have a SID before this iteration
    rows_to_append_filtered = filtered_result_df[~filtered_result_df['NO_AGGR'].isin(no_aggr_set)]

    if not rows_to_append_filtered.empty:
        sid_monre_0924 = pd.concat([sid_monre_0924, rows_to_append_filtered[['NO_AGGR', 'cleaned_name', 'SID', 'rule_num']]], ignore_index=True)
        no_aggr_set.update(rows_to_append_filtered['NO_AGGR'].tolist())
        if not check_sid_exist:
            # A new SID is registered once in the master table
            rows_to_append_2 = rows_to_append_filtered.iloc[[0]][['cleaned_name', 'SID']]
            mst_sid_0924 = pd.concat([mst_sid_0924, rows_to_append_2], ignore_index=True)

mst_sid_0924.reset_index(drop=True, inplace=True)

# End measuring time
end_time = time.time()

# Print the elapsed time
print(f"Time taken: {end_time - start_time:.2f} seconds")

Time taken: 6.04 seconds


In [None]:
# NAME & TEMPAT LAHIR
# Same SID assignment as the name-only run, but candidates come from the
# (cleaned_name, TEMPAT_LAHIR) blocks. Records with a blank place of birth, and
# records for which no block is hit, are skipped and receive no SID here.
# NOTE(review): only the first 5 sampled rows are processed (`.head(5)`) --
# presumably a debugging limit; confirm before using the results.
start_time = time.time()
count = 0  # sampled records whose group reused an already-issued SID
no_aggr_set = set(sid_monre_0924['NO_AGGR'])  # agreements that already carry a SID

for index, row in sampled_data_100000.head(5).iterrows():
    if row['NO_AGGR'] in no_aggr_set:
        continue

    row_compared = row
    matched_dfs = []

    # Blocking on the place of birth needs a usable string.
    # NOTE(review): this reads the raw TEMPAT_LAHIR column; a float NaN passes
    # this test (it is truthy and not equal to 'nan') -- confirm that
    # `get_bigrams` can handle it.
    if not (row['TEMPAT_LAHIR'] and row['TEMPAT_LAHIR'] != 'nan'):
        continue

    # --- Candidate generation -------------------------------------------------
    # Blocking keys: the similar (name, POB) pairs plus the record's own pair.
    # The loop variable used to be called `tuple`, which rebound the builtin
    # for the rest of the notebook session.
    results_list = add_new_name_to_results_dict(row['cleaned_name'], row['TEMPAT_LAHIR'])
    for block_key in results_list:
        if block_key in monre_dict:
            indices = monre_dict[block_key]
            matched_dfs.append(dataset.loc[indices])

    if not matched_dfs:
        continue

    matched_df = pd.concat(matched_dfs, ignore_index=True)
    result_df_nodup = matched_df.drop_duplicates(subset='NO_AGGR').reset_index(drop=True)

    # --- Attributes of the record being matched -------------------------------
    name_compared = row['cleaned_name']
    dob_compared = row['TGL_LAHIR']
    tempat_compared = row['cleaned_TEMPAT_LAHIR']
    ktp_kitas_compared = row['cleaned_no_ktp']
    mother_name_compared = row['cleaned_NAMA_IBU_KANDUNG']
    npwp_compared = row['cleaned_no_npwp']
    address_compared = row['cleaned_alamat']

    # --- Cheap pre-filter -----------------------------------------------------
    # Keep candidates that agree exactly on at least one of: date of birth,
    # KTP number, mother's name, NPWP number.
    # NOTE(review): the equality tests would also pair two empty strings if the
    # cleansing step stores missing values as '' -- confirm for KTP / NPWP.
    filtered_result_df = result_df_nodup[
        ((pd.notna(result_df_nodup['TGL_LAHIR']) & pd.notna(dob_compared) &
        (result_df_nodup['TGL_LAHIR'] == dob_compared)) |
        (pd.notna(result_df_nodup['cleaned_no_ktp']) & pd.notna(ktp_kitas_compared) &
        (result_df_nodup['cleaned_no_ktp'] == ktp_kitas_compared)) |
        (pd.notna(result_df_nodup['cleaned_NAMA_IBU_KANDUNG']) & pd.notna(mother_name_compared) &
        (result_df_nodup['cleaned_NAMA_IBU_KANDUNG'] == mother_name_compared)) |
        (pd.notna(result_df_nodup['cleaned_no_npwp']) & pd.notna(npwp_compared) &
        (result_df_nodup['cleaned_no_npwp'] == npwp_compared))
    )]

    filtered_result_df = filtered_result_df.copy()
    filtered_result_df['flag_SID'] = 'N'   # stays 'N' unless a rule fires
    filtered_result_df['rule_num'] = None

    # --- Matching rules (first rule that fires wins) --------------------------
    # The loop variables are distinct from the outer `index` / `row` so the
    # record being matched is not clobbered.
    for cand_index, cand in filtered_result_df.iterrows():
        name_sim = jaro_winkler_match(cand['cleaned_name'], name_compared)

        if (
            name_sim and
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_TEMPAT_LAHIR']) and cand['cleaned_TEMPAT_LAHIR'] == tempat_compared and
            pd.notna(cand['cleaned_NAMA_IBU_KANDUNG']) and jaro_winkler_match(cand['cleaned_NAMA_IBU_KANDUNG'], mother_name_compared)  # Rule 1
        ):
            # Rule 1: name + date of birth + place of birth + mother's name
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 1'

        elif (
            name_sim and
            pd.notna(cand['cleaned_no_ktp']) and cand['cleaned_no_ktp'] == ktp_kitas_compared  # Rule 2
        ):
            # Rule 2: name + KTP number
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 2'

        elif (
            name_sim and
            pd.notna(cand['cleaned_no_npwp']) and cand['cleaned_no_npwp'] == npwp_compared  # Rule 3
        ):
            # Rule 3: name + NPWP number
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 3'

        elif (
            jaro_winkler_match(cand['cleaned_name'], name_compared, threshold=0.95) and
            pd.notna(cand['cleaned_alamat']) and jaro_winkler_match(cand['cleaned_alamat'], address_compared) and  # Rule 4
            pd.notna(cand['cleaned_NAMA_IBU_KANDUNG']) and jaro_winkler_match(cand['cleaned_NAMA_IBU_KANDUNG'], mother_name_compared)
        ):
            # Rule 4: stricter name (0.95) + address + mother's name
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 4'

        elif (
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_no_ktp']) and cand['cleaned_no_ktp'] == ktp_kitas_compared  # Rule 5
        ):
            # Rule 5: date of birth + KTP number (name not required)
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 5'

        elif (
            jaro_winkler_match(cand['cleaned_name'], name_compared, threshold=0.95) and
            pd.notna(cand['cleaned_alamat']) and jaro_winkler_match(cand['cleaned_alamat'], address_compared) and
            pd.notna(cand['TGL_LAHIR']) and cand['TGL_LAHIR'] == dob_compared and
            pd.notna(cand['cleaned_TEMPAT_LAHIR']) and cand['cleaned_TEMPAT_LAHIR'] == tempat_compared  # Rule 6
        ):
            # Rule 6: stricter name (0.95) + address + date and place of birth
            filtered_result_df.loc[cand_index, 'flag_SID'] = 'Y'
            filtered_result_df.loc[cand_index, 'rule_num'] = 'RULE 6'

    filtered_result_df = filtered_result_df[filtered_result_df['flag_SID'] == 'Y'].reset_index(drop=True)

    if filtered_result_df.empty:
        # Nothing matched: the record forms a group of its own
        row_compared_df = pd.DataFrame([row_compared])
        row_compared_df['flag_SID'] = 'Y'
        filtered_result_df = pd.concat([filtered_result_df, row_compared_df], ignore_index=True)

    # --- SID assignment -------------------------------------------------------
    # Reuse the SID of the first group member that already has one.
    check_sid_exist = False
    for no_aggr in filtered_result_df['NO_AGGR']:
        if no_aggr in no_aggr_set:
            check_sid_exist = True
            filtered_result_df['SID'] = sid_monre_0924.loc[sid_monre_0924['NO_AGGR'] == no_aggr, 'SID'].values[0]
            break

    if check_sid_exist:
        count += 1
    else:
        # New SID = CD_SP of the row with the earliest DT_GOLIVE_VALID,
        # followed by the next 7-digit sequence number for that prefix.
        filtered_result_df = filtered_result_df.sort_values(by='DT_GOLIVE_VALID').reset_index(drop=True)

        kode_cabang = str(filtered_result_df.loc[0, 'CD_SP'])

        # NOTE(review): the prefix lookup compares the first six characters of
        # each SID, so it assumes CD_SP is exactly six characters -- confirm.
        matching_sid_rows = mst_sid_0924[mst_sid_0924['SID'].str[:6] == kode_cabang]
        last_sequence = matching_sid_rows['SID'].str[-7:].astype(int).max() if not matching_sid_rows.empty else 0

        filtered_result_df['SID'] = f"{kode_cabang}{(last_sequence + 1):07d}"

    # Record only the agreements that did not have a SID before this iteration
    rows_to_append_filtered = filtered_result_df[~filtered_result_df['NO_AGGR'].isin(no_aggr_set)]

    if not rows_to_append_filtered.empty:
        sid_monre_0924 = pd.concat([sid_monre_0924, rows_to_append_filtered[['NO_AGGR', 'cleaned_name', 'SID', 'rule_num']]], ignore_index=True)
        no_aggr_set.update(rows_to_append_filtered['NO_AGGR'].tolist())
        if not check_sid_exist:
            # A new SID is registered once in the master table
            rows_to_append_2 = rows_to_append_filtered.iloc[[0]][['cleaned_name', 'SID']]
            mst_sid_0924 = pd.concat([mst_sid_0924, rows_to_append_2], ignore_index=True)

mst_sid_0924.reset_index(drop=True, inplace=True)

# End measuring time
end_time = time.time()

# Print the elapsed time
print(f"Time taken: {end_time - start_time:.2f} seconds")

In [None]:
# Write both result tables to Excel, without the row index
for frame, target in (
    (mst_sid_0924, r"D:\Kuliah\SEMESTER 8\Skripsi\Pair Completion\mst_sid_bigram_jaccard.xlsx"),
    (sid_monre_0924, r"D:\Kuliah\SEMESTER 8\Skripsi\Pair Completion\sid_monre_bigram_jaccard.xlsx"),
):
    frame.to_excel(target, index=False)

# Evaluation

In [17]:
# Reduction ratio of the blocking step:
#   1 - (comparisons after blocking) / (comparisons without blocking)
# Without blocking, every sampled record is compared with every dataset record.
n, m = len(dataset), len(sampled_data_100000)
total_comparisons_before = n * m

# With blocking, the number of comparisons is the combined size of all blocks
comparisons_after_blocking = sum(map(len, final_blocks.values()))

reduction_ratio = 1 - (comparisons_after_blocking / total_comparisons_before)
print(f"Reduction Ratio: {reduction_ratio:.4f}")
print(comparisons_after_blocking)

Reduction Ratio: 0.9999
13808640


In [None]:
# Number of rows in the reference dataset
print("Dataset Lenght :", dataset.shape[0], "Data")

Dataset Lenght : 2095454 Data


In [None]:
# Number of rows in the sample that was matched against the dataset
print("Sample Lenght :", sampled_data_100000.shape[0], "Data")

Sample Lenght : 100000 Data


In [18]:
# Mean score per compared name (one entry per key of `similarity_list`)
avg_distance = [np.mean(scores) for scores in similarity_list.values()]

# Mean of the per-name means
overall_avg_distance = np.mean(avg_distance)
print('Average Distance : ',overall_avg_distance)

Average Distance :  0.8197966792492957
