In [2]:
import pandas as pd
import re

# Baca isi file sebagai teks mentah
file_path = "data/sgb_data.csv"
with open(file_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

# Buang header
data_lines = raw_lines[1:]

parsed_rows = []

for line in data_lines:
    line = line.strip()

    # Format dengan kutip ganda (""alamat"")
    match1 = re.match(r'^"?(.*?),""(.*?)"",(\d+),([\d\-:\s]+)"?$', line)
    if match1:
        nama_sgb, alamat, slot, timestamp = match1.groups()
    else:
        # Format tanpa kutip ganda (alamat langsung)
        match2 = re.match(r'^(.*?),(.*?),(\d+),([\d\-:\s]+)$', line)
        if match2:
            nama_sgb, alamat, slot, timestamp = match2.groups()
        else:
            continue  # skip baris yang gagal semua

    parsed_rows.append({
        "Nama SGB": nama_sgb.strip(),
        "Alamat": alamat.strip(),
        "Jumlah Slot": int(slot),
        "Timestamp": timestamp.strip()
    })

# Buat DataFrame
df = pd.DataFrame(parsed_rows)

# Simpan ke file CSV jika ingin
df.to_csv("data/sgb_data_cleaned.csv", index=False)
# df.to_excel("sgb_data_cleaned.xlsx", index=False)

print(f"Total parsed rows: {len(df)}")
print(df.head())


Total parsed rows: 238
                          Nama SGB  \
0                 Volta Purwokerto   
1                 KFC Artha Gading   
2            Alfamart Duren Tiga 2   
3       Agapindo Cikarang (24 jam)   
4  Alfamidi Raden Saleh 3 (24 Jam)   

                                              Alamat  Jumlah Slot  \
0  Jl. Jend. Sudirman No.400, Kranjimuntang, Purw...            4   
1  Sentra Bisnis, Jl, Jl. Artha Gading Sel. No.Ka...            8   
2  Jl. Duren Tiga Raya No.88, RT.2/RW.001, Pancor...           12   
3  Ruko Blok F17 CBD Cikarang (Pasir Sari, Cikara...           12   
4  JL. KAV.DKI BLOK X/22 005/01, MERUYA UTARA, KE...           12   

             Timestamp  
0  2025-06-06 22:37:09  
1  2025-06-06 22:36:14  
2  2025-06-06 22:34:19  
3  2025-06-06 22:38:41  
4  2025-06-06 22:43:22  


In [4]:
import pandas as pd

# Load data hasil pembersihan
df = pd.read_csv("data/sgb_data_cleaned.csv")

# Filter baris di mana kolom 'Alamat' mengandung "jak" (case-insensitive)
df_jakarta = df[df["Alamat"].str.lower().str.contains("jak")]

# Simpan hasil filter ke file CSV baru
df_jakarta.to_csv("data/sgb_jakarta.csv", index=False)

# (Opsional) Tampilkan ringkasan hasil
print(f"Total data dengan alamat mengandung 'jak': {len(df_jakarta)}")
print(df_jakarta.head())


Total data dengan alamat mengandung 'jak': 87
                                Nama SGB  \
1                       KFC Artha Gading   
2                  Alfamart Duren Tiga 2   
4        Alfamidi Raden Saleh 3 (24 Jam)   
6            Alfamart Kramat Pulo Gundul   
7  Alfamidi Tanjung Duren Timur (24 Jam)   

                                              Alamat  Jumlah Slot  \
1  Sentra Bisnis, Jl, Jl. Artha Gading Sel. No.Ka...            8   
2  Jl. Duren Tiga Raya No.88, RT.2/RW.001, Pancor...           12   
4  JL. KAV.DKI BLOK X/22 005/01, MERUYA UTARA, KE...           12   
6  Jl. Kramat Pulo Gundul Kel No.K.9, RT.1/RW.9, ...           12   
7  Jl. Letjen M.T. Haryono, RT.11/RW.5, Tebet Bar...           12   

             Timestamp  
1  2025-06-06 22:36:14  
2  2025-06-06 22:34:19  
4  2025-06-06 22:43:22  
6  2025-06-06 22:24:18  
7  2025-06-06 22:54:06  


In [7]:
import pandas as pd

# Load file CSV
df = pd.read_csv("data/sgb_jakarta.csv")

# Cek duplikat berdasarkan Nama SGB
duplikat_nama = df[df.duplicated(subset=["Nama SGB"], keep=False)]

# Cek duplikat berdasarkan Alamat
duplikat_alamat = df[df.duplicated(subset=["Alamat"], keep=False)]

# Gabungkan dan hapus duplikat baris antar keduanya
gabungan_duplikat = pd.concat([duplikat_nama, duplikat_alamat]).drop_duplicates()

# Tampilkan hasil
print("=== Duplikat Nama SGB ===")
print(duplikat_nama)

print("\n=== Duplikat Alamat ===")
print(duplikat_alamat)

print("\n=== Gabungan Duplikat Nama SGB atau Alamat ===")
print(gabungan_duplikat)


=== Duplikat Nama SGB ===
Empty DataFrame
Columns: [Nama SGB, Alamat, Jumlah Slot, Timestamp]
Index: []

=== Duplikat Alamat ===
                                 Nama SGB  \
4   Alfamidi Tanjung Duren Timur (24 Jam)   
5                     CGYH120115240900052   
9                   Alfamart Lodan Center   
11            Mangkuluhur City 5 (24 Jam)   
16                      Alfamart RS Polri   
27                  Alfamart Kapuk Raya 4   
29             Alfamart Daan Mogot Estate   
30           SPBKLU PLN UP3 Lenteng Agung   
33            Mangkuluhur City 3 (24 Jam)   
39              Alfamart Daan Mogot Prima   
49                    CGYH120115240900006   
57              Alfamart Dewi Sartika 120   
61      Alfamart Paus Rawamangun (24 Jam)   
73                        SPBU MT Haryono   
75                        PLN UP3 Serpong   
82                           SPBU Pramuka   

                                               Alamat  Jumlah Slot  \
4   Jl. Letjen M.T. Haryono, RT.11/

In [9]:
import pandas as pd
import re

# Baca isi file sebagai teks mentah
file_path = "data/sgb_jakarta_no_duplicate.csv"
with open(file_path, "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

# Buang header
data_lines = raw_lines[1:]

parsed_rows = []

for line in data_lines:
    line = line.strip()

    # Format dengan kutip ganda (""alamat"")
    match1 = re.match(r'^"?(.*?),""(.*?)"",(\d+),([\d\-:\s]+)"?$', line)
    if match1:
        nama_sgb, alamat, slot, timestamp = match1.groups()
    else:
        # Format tanpa kutip ganda (alamat langsung)
        match2 = re.match(r'^(.*?),(.*?),(\d+),([\d\-:\s]+)$', line)
        if match2:
            nama_sgb, alamat, slot, timestamp = match2.groups()
        else:
            continue  # skip baris yang gagal semua

    parsed_rows.append({
        "Nama SGB": nama_sgb.strip(),
        "Alamat": alamat.strip(),
        "Jumlah Slot": int(slot),
        "Timestamp": timestamp.strip()
    })

# Buat DataFrame
df = pd.DataFrame(parsed_rows)

# Simpan ke file CSV jika ingin
df.to_csv("data/sgb_jakarta_cleaned_no_duplicate.csv", index=False)
# df.to_excel("sgb_data_cleaned.xlsx", index=False)

print(f"Total parsed rows: {len(df)}")
print(df.head())


Total parsed rows: 84
                                Nama SGB  \
0                       KFC Artha Gading   
1                  Alfamart Duren Tiga 2   
2        Alfamidi Raden Saleh 3 (24 Jam)   
3            Alfamart Kramat Pulo Gundul   
4  Alfamidi Tanjung Duren Timur (24 Jam)   

                                              Alamat  Jumlah Slot  \
0  Sentra Bisnis, Jl, Jl. Artha Gading Sel. No.Ka...            8   
1  Jl. Duren Tiga Raya No.88, RT.2/RW.001, Pancor...           12   
2  JL. KAV.DKI BLOK X/22 005/01, MERUYA UTARA, KE...           12   
3  Jl. Kramat Pulo Gundul Kel No.K.9, RT.1/RW.9, ...           12   
4  Jl. Tanjung Duren Dalam VI No.2a, RT.7/RW.3, T...           12   

             Timestamp  
0  2025-06-06 22:36:14  
1  2025-06-06 22:34:19  
2  2025-06-06 22:43:22  
3  2025-06-06 22:24:18  
4  2025-06-06 22:54:06  


In [10]:
import pandas as pd

# Load file CSV
df = pd.read_csv("data/sgb_jakarta_cleaned_no_duplicate.csv")

# Cek duplikat berdasarkan Nama SGB
duplikat_nama = df[df.duplicated(subset=["Nama SGB"], keep=False)]

# Cek duplikat berdasarkan Alamat
duplikat_alamat = df[df.duplicated(subset=["Alamat"], keep=False)]

# Gabungkan dan hapus duplikat baris antar keduanya
gabungan_duplikat = pd.concat([duplikat_nama, duplikat_alamat]).drop_duplicates()

# Tampilkan hasil
print("=== Duplikat Nama SGB ===")
print(duplikat_nama)

print("\n=== Duplikat Alamat ===")
print(duplikat_alamat)

print("\n=== Gabungan Duplikat Nama SGB atau Alamat ===")
print(gabungan_duplikat)


=== Duplikat Nama SGB ===
Empty DataFrame
Columns: [Nama SGB, Alamat, Jumlah Slot, Timestamp]
Index: []

=== Duplikat Alamat ===
                       Nama SGB  \
10  Mangkuluhur City 5 (24 Jam)   
32  Mangkuluhur City 3 (24 Jam)   

                                               Alamat  Jumlah Slot  \
10  Jl. Gatot Subroto No.Kav. 1, RT.1/RW.3, Karet ...           12   
32  Jl. Gatot Subroto No.Kav. 1, RT.1/RW.3, Karet ...           12   

              Timestamp  
10  2025-06-06 22:27:50  
32  2025-06-06 22:27:34  

=== Gabungan Duplikat Nama SGB atau Alamat ===
                       Nama SGB  \
10  Mangkuluhur City 5 (24 Jam)   
32  Mangkuluhur City 3 (24 Jam)   

                                               Alamat  Jumlah Slot  \
10  Jl. Gatot Subroto No.Kav. 1, RT.1/RW.3, Karet ...           12   
32  Jl. Gatot Subroto No.Kav. 1, RT.1/RW.3, Karet ...           12   

              Timestamp  
10  2025-06-06 22:27:50  
32  2025-06-06 22:27:34  


In [12]:
import pandas as pd

# 1. Baca file Excel
df_raw = pd.read_excel("data/sgb_jakarta_location.xlsx")

# 2. Pisahkan kolom gabungan berdasarkan koma
df_split = df_raw.iloc[:, 0].str.split(",", n=2, expand=True)
df_split.columns = ["Nama SGB", "Latitude", "Longitude"]

# 3. Bersihkan data
df_split["Nama SGB"] = df_split["Nama SGB"].str.strip()
df_split["Latitude"] = df_split["Latitude"].astype(float)
df_split["Longitude"] = df_split["Longitude"].astype(float)

# 4. Simpan ke file CSV (opsional)
df_split.to_csv("data/sgb_jakarta_location_clean.csv", index=False)

# 5. (Opsional) Tampilkan hasil
print(df_split.head())


                                Nama SGB  Latitude   Longitude
0                       KFC Artha Gading -6.148338  106.893064
1                  Alfamart Duren Tiga 2 -6.253411  106.837603
2        Alfamidi Raden Saleh 3 (24 Jam) -6.197597  106.725640
3            Alfamart Kramat Pulo Gundul -6.182556  106.845871
4  Alfamidi Tanjung Duren Timur (24 Jam) -6.182056  106.787153


In [14]:
import pandas as pd

# 1. Baca kedua file CSV
df_location = pd.read_csv("data/sgb_jakarta_location_clean.csv")
df_cleaned = pd.read_csv("data/sgb_jakarta_cleaned_no_duplicate.csv")

# 2. Gabungkan berdasarkan kolom "Nama SGB" (inner join)
df_merged = pd.merge(df_cleaned, df_location, on="Nama SGB", how="inner")

# 3. Simpan hasil gabungan ke file baru
df_merged.to_csv("data/sgb_jakarta_completed.csv", index=False)

# 4. (Opsional) Tampilkan ringkasan hasil
print(f"Total baris hasil gabungan: {len(df_merged)}")
print(df_merged.head())


Total baris hasil gabungan: 81
                                Nama SGB  \
0                       KFC Artha Gading   
1                  Alfamart Duren Tiga 2   
2        Alfamidi Raden Saleh 3 (24 Jam)   
3            Alfamart Kramat Pulo Gundul   
4  Alfamidi Tanjung Duren Timur (24 Jam)   

                                              Alamat  Jumlah Slot  \
0  Sentra Bisnis, Jl, Jl. Artha Gading Sel. No.Ka...            8   
1  Jl. Duren Tiga Raya No.88, RT.2/RW.001, Pancor...           12   
2  JL. KAV.DKI BLOK X/22 005/01, MERUYA UTARA, KE...           12   
3  Jl. Kramat Pulo Gundul Kel No.K.9, RT.1/RW.9, ...           12   
4  Jl. Tanjung Duren Dalam VI No.2a, RT.7/RW.3, T...           12   

             Timestamp  Latitude   Longitude  
0  2025-06-06 22:36:14 -6.148338  106.893064  
1  2025-06-06 22:34:19 -6.253411  106.837603  
2  2025-06-06 22:43:22 -6.197597  106.725640  
3  2025-06-06 22:24:18 -6.182556  106.845871  
4  2025-06-06 22:54:06 -6.182056  106.787153  
