# Menggabungkan beberapa dataframe menjadi satu dataframe

### Oleh: Aditya Hanif

## Melihat direktori file yang akan digunakan

In [6]:
import glob
file = glob.glob("Documents/Data Hanif/Februari/*.xlsx")
file

['Documents/Data Hanif/Februari/1748405918_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748320901_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748415554_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1753749493_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1747879519_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1739500021_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1751509063_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748404958_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1749689947_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748828403_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748227500_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748826902_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748319049_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748333914_data_kabkota_202502.xlsx',
 'Documents/Data Han

## Memasukkan file tersebut ke dalam satu variabel paths

In [1]:
paths = ['Documents/Data Hanif/Februari/1748405918_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748320901_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748415554_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1753749493_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1747879519_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1739500021_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1751509063_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748404958_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1749689947_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748828403_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748227500_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748826902_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748319049_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748333914_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1753673260_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748418639_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1747879170_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1750122375_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/250200_DATA_KABKOTA_FEBRUARI_2025_BY_NAME.xlsx',
 'Documents/Data Hanif/Februari/1748173915_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1753686246_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748418083_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1741068883_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748311108_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748413581_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748415492_data_kabkota_202502.xlsx',
 'Documents/Data Hanif/Februari/1748333552_data_kabkota_202502.xlsx']

## Membuat fungsi untuk import & cleaning dataframe

In [2]:
import pandas as pd
import numpy as np

def ambil_dan_bersihkan(path):
    # --- Load data ---
    df = pd.read_excel(
        path,
        dtype={"NIK": str, "PSNOKA": str, "NOKA": str}
    )
    
    df_kab = df.copy()

    # --- Drop rows where PSNOKA == 1 ---
    if "PSNOKA" in df_kab.columns:
        df_kab = df_kab[~df_kab["PSNOKA"].isin([1, "1"])]

    # --- Rename columns if available ---
    rename_map = {}
    if "Nama" in df_kab.columns:
        rename_map["Nama"] = "NAMA"
    if "TglLahir" in df_kab.columns:
        rename_map["TglLahir"] = "TGLLAHIR"
    if "TglLhr" in df_kab.columns:
        rename_map["TglLhr"] = "TGLLAHIR"
    if "tgl lahir" in df_kab.columns:
        rename_map["tgl lahir"] = "TGLLAHIR"        
    if "Dati2Alamat" in df_kab.columns:
        rename_map["Dati2Alamat"] = "ALAMAT"
    if "Dati2Faskes" in df_kab.columns:
        rename_map["Dati2Faskes"] = "KAB"
   
    df_kab = df_kab.rename(columns=rename_map)

    # --- Format NIK ---
    if "NIK" in df_kab.columns:
        def format_nik(val):
            if pd.isna(val) or str(val).strip() == "":
                return "-"
            val = str(val)
            if val.startswith(""):
                return val  # already formatted
            return "" + val
        df_kab["NIK"] = df_kab["NIK"].apply(format_nik)

    # --- Format PSNOKA ---
    if "PSNOKA" in df_kab.columns:
        def format_psnoka(val):
            if pd.isna(val) or str(val).strip() == "":
                return "-"
            val = str(val)
            if val.startswith(""):
                return val
            return "" + val
        df_kab["PSNOKA"] = df_kab["PSNOKA"].apply(format_psnoka)

    # --- Format TGLLAHIR (robust) ---
    df_kab["TGLLAHIR"] = pd.to_datetime(df_kab["TGLLAHIR"], errors="coerce").dt.date.astype(str)
    df_kab["TGLLAHIR"] = np.where(df_kab["TGLLAHIR"].isnull(), "-", df_kab["TGLLAHIR"])

    # --- Ambil kolom utama saja (jika ada) ---
    kolom_final = ['KAB','NIK', 'PSNOKA', 'NAMA', 'TGLLAHIR', 'ALAMAT']
    kolom_ada = [k for k in kolom_final if k in df_kab.columns]

    df_kabupaten = df_kab[kolom_ada]
    print(f"Memproses file: {path}")
    return df_kabupaten

## Melakukan cleaning dataframe secara massal dengan fungsi yang sudah dibuat

In [3]:
import os

# Inisialisasi dictionary kosong untuk menyimpan dataframe
dataframes = {}

# Pastikan folder DF ada
os.makedirs("Documents/Data Hanif/Februari/DF", exist_ok=True)

for path in paths:
    try:
        # Ambil nama file tanpa ekstensi untuk nama variabel
        base_name = os.path.basename(path)           # misal '1754377752_data_kabkota_202505.xlsx'
        nama_var = os.path.splitext(base_name)[0]   # misal '1754377752_data_kabkota_202505'
        nama_var = nama_var.replace(' ', '_').replace('.', '_').lower()

        # Panggil fungsi dengan path
        df = ambil_dan_bersihkan(path)

        # Simpan di dictionary
        dataframes[nama_var] = df

        # Buat variabel global df_<nama_var>
        globals()[f"df_{nama_var}"] = df

        # Tentukan nama file CSV
        namafile = f"Documents/Data Hanif/Februari/DF/{nama_var}.csv"

        # Simpan CSV
        df.to_csv(namafile, sep="|", index=False)

        print(f"Berhasil memproses: {path}")

    except Exception as e:
        # Lewati file yang error
        print(f"Gagal memproses {path}: {e}")


Memproses file: Documents/Data Hanif/Februari/1748405918_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748405918_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1748320901_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748320901_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1748415554_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748415554_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1753749493_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1753749493_data_kabkota_202502.xlsx
Gagal memproses Documents/Data Hanif/Februari/1747879519_data_kabkota_202502.xlsx: Can't find workbook in OLE2 compound document
Memproses file: Documents/Data Hanif/Februari/1739500021_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1739500021_data_kabkota_202502.xlsx
Memproses file: Documents/Data 

  df_kab["TGLLAHIR"] = pd.to_datetime(df_kab["TGLLAHIR"], errors="coerce").dt.date.astype(str)


Memproses file: Documents/Data Hanif/Februari/1748826902_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748826902_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1748319049_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748319049_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1748333914_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748333914_data_kabkota_202502.xlsx
Gagal memproses Documents/Data Hanif/Februari/1753673260_data_kabkota_202502.xlsx: 'TGLLAHIR'
Memproses file: Documents/Data Hanif/Februari/1748418639_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1748418639_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1747879170_data_kabkota_202502.xlsx
Berhasil memproses: Documents/Data Hanif/Februari/1747879170_data_kabkota_202502.xlsx
Memproses file: Documents/Data Hanif/Februari/1750122375_data_kabk

## Menampilkan dataframe yang telah dibersihkan

In [4]:
# Daftar nama dataframe yang berhasil dibuat
print("Dataframe yang berhasil diproses:")
for nama in dataframes.keys():
    print(f"df_{nama},")

Dataframe yang berhasil diproses:
df_1748405918_data_kabkota_202502,
df_1748320901_data_kabkota_202502,
df_1748415554_data_kabkota_202502,
df_1753749493_data_kabkota_202502,
df_1739500021_data_kabkota_202502,
df_1751509063_data_kabkota_202502,
df_1748404958_data_kabkota_202502,
df_1749689947_data_kabkota_202502,
df_1748828403_data_kabkota_202502,
df_1748227500_data_kabkota_202502,
df_1748826902_data_kabkota_202502,
df_1748319049_data_kabkota_202502,
df_1748333914_data_kabkota_202502,
df_1748418639_data_kabkota_202502,
df_1747879170_data_kabkota_202502,
df_1750122375_data_kabkota_202502,
df_250200_data_kabkota_februari_2025_by_name,
df_1748173915_data_kabkota_202502,
df_1753686246_data_kabkota_202502,
df_1748418083_data_kabkota_202502,
df_1748311108_data_kabkota_202502,
df_1748413581_data_kabkota_202502,
df_1748415492_data_kabkota_202502,
df_1748333552_data_kabkota_202502,


## Menyatukan semua dataframe ke dalam satu dataframe

In [5]:
df_all_feb = pd.concat(dataframes.values(), ignore_index=True)
df_all_feb

Unnamed: 0,KAB,NIK,PSNOKA,NAMA,TGLLAHIR,ALAMAT
0,KAB.GARUT,3205042010920001,0000003043113,ALFAN NUGRAHA,1992-10-20,"KP. PANGKALAN KIDUL 2/3, PANANJUNG, TAROGONG K..."
1,KAB.GARUT,3205330502020007,0002469401471,ASIAH NUR AMANAH,2002-02-05,"KP. PANGKALAN KIDUL 2/3, PANANJUNG, TAROGONG K..."
2,KAB.GARUT,3205046302190001,0002748797864,KEISHA ANINDYA NUGRAHA,2019-02-23,"KP. PANGKALAN KIDUL 2/3, PANANJUNG, TAROGONG K..."
3,KAB.GARUT,3205044309240001,0003622181411,QIANA WAFA YASMIN NUGRAHA,2024-09-03,"KP. PANGKALAN KIDUL 2/3, PANANJUNG, TAROGONG K..."
4,KAB.GARUT,3205065609980002,0000005766153,LINA HERLIANI,1998-09-16,"KP. BABAKAN SAWAH BERA 1/10, CIMAREME, BANYURE..."
...,...,...,...,...,...,...
7434600,KAB.SUMEDANG,-,0003641067202,BAYI NYONYA AI ROHIMAH,NaT,"DUSUN JAGANDALA 2/1, CIJAMBU, TANJUNGSARI, KAB..."
7434601,KAB.SUMEDANG,3211165606240001,0003639895716,ELISYA AZZAHRA,NaT,"DUSUN SINDANG 1/2, RANCAKALONG, RANCAKALONG, K..."
7434602,KAB.SUMEDANG,-,0003639196113,BAYI NYONYA PINA PITRIYANI,NaT,"DUSUN BABAKAN SUKAMULYA 2/6, CIPACING, JATINAN..."
7434603,KAB.SUMEDANG,3211012802210001,0003639891396,NANDIKA RAIKAL FAHREZI,NaT,"DUSUN CISURAT 4/1, CISURAT, WADO, KAB. SUMEDANG"


## Ekspor file dataframe yang telah digabung

In [8]:
df_all_feb.to_csv('Documents/Data Hanif/Februari/DF/BPJS Feb.csv', sep='|', index=False)

In [9]:
import glob
file = glob.glob("Documents/Data Hanif/April/DF/*.csv")
file

['Documents/Data Hanif/April/DF/1749709644_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1749700570_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1750122441_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1750081161_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1748420985_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1747879239_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1748334024_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1748828451_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1753924678_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1748333608_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1750822158_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1749531710_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1748416325_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/1748173986_data_kabkota_202504.csv',
 'Documents/Data Hanif/April/DF/17