# Membuka File CSV Yang Error
By: Aditya Hanif
<br />
<br />
File CSV di sini mengalami error karena didalamnya ada salah satu text mengandung special character sehingga tidak bisa dibaca oleh Python. Contohnya adalah karakter seperti di bawah ini

## Import Library Yang Dibutuhkan

In [1]:
import pandas as pd
import numpy as np

## Buka File Menjadi CSV

In [27]:
df = pd.read_csv(
    "Documents/Data Hanif/250501_DATA_BPJS_MEI_2025_BY_NAME.csv")

ParserError: Error tokenizing data. C error: Expected 1 fields in line 326, saw 2


## Convert CSV ke XLSX
Karena ternyata error saat membuka CSV, coba convert dulu ke XLSX

In [2]:
from openpyxl import Workbook
import csv

In [None]:
wb = Workbook()
ws = wb.active
with open('Documents/Data Hanif/250501_DATA_BPJS_MEI_2025_BY_NAME.csv', 'r', encoding="latin1") as f:
    for row in csv.reader(f):
        ws.append(row)
wb.save('Documents/Data Hanif/Mei.csv')

In [None]:
'''
import csv
import re
from openpyxl import Workbook

# regex untuk hapus karakter ilegal (ASCII 0-31 kecuali \t, \n, \r)
ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

wb = Workbook()
ws = wb.active

with open('Documents/Data Hanif/250501_DATA_BPJS_MEI_2025_BY_NAME.csv', 'r', encoding='latin1') as f:
    reader = csv.reader(f, delimiter='|')  # pakai delimiter '|' kalau CSV pakai ini
    for row in reader:
        cleaned_row = [ILLEGAL_CHARACTERS_RE.sub("", cell) for cell in row]
        ws.append(cleaned_row)

wb.save('Documents/Data Hanif/Mei.xlsx')
'''

## Convert CSV ke XLSX Dengan Mengabaikan Character Error
Namun proses convert diskip karena file terlalu besar, jadi kode di awal hanya untuk mendeteksi encoding, delimiter, dan jumlah baris.

In [4]:
import csv
import re
from openpyxl import Workbook
from tqdm import tqdm  # pip install tqdm
import chardet         # pip install chardet

# regex untuk hapus karakter ilegal (ASCII 0-31 kecuali tab/newline/carriage return)
ILLEGAL_CHARACTERS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')

input_file = "Documents/Data Hanif/250501_DATA_BPJS_MEI_2025_BY_NAME.csv"
output_file = "Documents/Data Hanif/Mei.xlsx"

# --- 1. Deteksi encoding otomatis ---
with open(input_file, "rb") as f:
    raw_data = f.read(10000)  # baca sebagian (50KB) untuk deteksi
    result = chardet.detect(raw_data)
    encoding = result["encoding"] or "latin1"
print(f"🔍 Detected encoding: {encoding}")

# --- 2. Deteksi delimiter otomatis ---
with open(input_file, "r", encoding=encoding, errors="replace") as f:
    sample = f.read(2048)
    dialect = csv.Sniffer().sniff(sample, delimiters=[",", "|"])
    delimiter = dialect.delimiter
print(f"🔍 Detected delimiter: '{delimiter}'")

# --- 3. Hitung total baris untuk progress ---
with open(input_file, "r", encoding=encoding, errors="replace") as f:
    total_lines = sum(1 for _ in f)

print(f"🔍 Total Lines: '{total_lines}'")

'''
# --- 4. Proses CSV dan simpan ke Excel ---
wb = Workbook()
ws = wb.active

with open(input_file, "r", encoding=encoding, errors="replace") as f:
    reader = csv.reader(f, delimiter=delimiter)
    for row in tqdm(reader, total=total_lines, desc="Memproses data"):
        cleaned_row = [ILLEGAL_CHARACTERS_RE.sub("", cell) for cell in row]
        ws.append(cleaned_row)

wb.save(output_file)
print(f"✅ Proses selesai! File berhasil disimpan di: {output_file}")
'''

🔍 Detected encoding: ascii
,🔍 Detected delimiter: '|'
,🔍 Total Lines: '16202898'


'\n# --- 4. Proses CSV dan simpan ke Excel ---\nwb = Workbook()\nws = wb.active\n\nwith open(input_file, "r", encoding=encoding, errors="replace") as f:\n    reader = csv.reader(f, delimiter=delimiter)\n    for row in tqdm(reader, total=total_lines, desc="Memproses data"):\n        cleaned_row = [ILLEGAL_CHARACTERS_RE.sub("", cell) for cell in row]\n        ws.append(cleaned_row)\n\nwb.save(output_file)\nprint(f"✅ Proses selesai! File berhasil disimpan di: {output_file}")\n'

## Buka CSV Dengan Mengabaikan Baris Rusak

In [None]:
# coba dengan utf-8 (paling umum)
df = pd.read_csv(
    "Documents/Data Hanif/250501_DATA_BPJS_MEI_2025_BY_NAME.csv", 
    sep="|", 
    encoding="utf-8", 
    on_bad_lines="skip"   # kalau ada baris rusak, dilewati
)
df.head(10)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
,RangeIndex: 16202897 entries, 0 to 16202896
,Data columns (total 15 columns):
, #   Column                 Dtype  
,---  ------                 -----  
, 0   kepwil_id              int64  
, 1   kepwil_psnoka          int64  
, 2   kepwil_noka            int64  
, 3   kepwil_kcfaskes        object 
, 4   kepwil_dati2faskes     object 
, 5   kepwil_nmppk           object 
, 6   kepwil_nama            object 
, 7   kepwil_jenkel          int64  
, 8   kepwil_tgllhr          object 
, 9   kepwil_pisa            object 
, 10  kepwil_nmdesa          object 
, 11  kepwil_nmkec           object 
, 12  kepwil_nmdati2_alamat  object 
, 13  kepwil_tgl_proses      object 
, 14  kepwil_nik             float64
,dtypes: float64(1), int64(4), object(10)
,memory usage: 1.8+ GB


In [18]:
df.shape

(16202897, 15)

In [25]:
df[df.duplicated('kepwil_nik')].shape

(16751, 15)

In [16]:
df[df.duplicated('kepwil_psnoka')].shape

(0, 15)

In [17]:
df[df.duplicated('kepwil_psnoka')].head(10)

Unnamed: 0,kepwil_id,kepwil_psnoka,kepwil_noka,kepwil_kcfaskes,kepwil_dati2faskes,kepwil_nmppk,kepwil_nama,kepwil_jenkel,kepwil_tgllhr,kepwil_pisa,kepwil_nmdesa,kepwil_nmkec,kepwil_nmdati2_alamat,kepwil_tgl_proses,kepwil_nik
