# Fix invalid format in ARFF file and convert to CSV

#### The original data is in ARFF format.
#### This notebook fixes formatting errors in the original data, handles missing values, and converts it to CSV format.

## 1. Import libraries and define data paths

In [122]:
 import pandas as pd
from scipy.io import arff
import numpy as np

In [123]:
# File locations used by the cells below, bound in a single assignment.
# NOTE: despite its name, `input_csv` is the CSV file that this notebook
# writes at the end; it is not read anywhere in the visible cells.
input_arff, fixed_arff, input_csv = (
    "../dataset/chronic_kidney_disease_full.arff",   # raw ARFF file to repair
    "../dataset/chronic_kidney_disease_clean.arff",  # repaired ARFF file
    "../dataset/chronic_kidney_disease_full.csv",    # final CSV export
)

## 2. Fix invalid format in ARFF file

In [124]:
# Repair the raw ARFF file line by line: everything up to and including the
# "@data" marker is copied verbatim, and every row after it is re-tokenised.
header_lines = []
data_lines = []
in_data = False

# Canonical spelling of the two yes/no tokens, keyed by their lowercase form.
yes_no_canonical = {"yes": "yes", "no": "no"}

with open(input_arff, "r", encoding="utf-8", errors="ignore") as src:
    for raw_line in src:
        if not in_data:
            # Header section: keep the line untouched and watch for the marker
            # that switches the loop into data mode.
            header_lines.append(raw_line)
            if raw_line.lower().startswith("@data"):
                in_data = True
            continue

        # Data section: whitespace-only lines carry no record.
        if not raw_line.strip():
            continue

        fields = []
        for chunk in raw_line.split(","):
            token = chunk.strip()
            if token == "":
                # Empty fields (doubled or trailing commas) are dropped.
                continue
            # Any-case "yes"/"no" becomes lowercase; other tokens pass through.
            fields.append(yes_no_canonical.get(token.lower(), token))

        # NOTE: rows are not padded or truncated to a fixed field count; they
        # are written back with however many non-empty fields remain.
        data_lines.append(",".join(fields) + "\n")

# Write the untouched header followed by the cleaned rows.
with open(fixed_arff, "w", encoding="utf-8") as dst:
    dst.writelines(header_lines)
    dst.writelines(data_lines)

# Sanity check: the repaired file must now parse with scipy's ARFF reader.
data, meta = arff.loadarff(fixed_arff)
print(len(data), meta.names())
print(data)


400 ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']
[(48.,  80., b'1.020', b'1', b'0', b'?', b'normal', b'notpresent', b'notpresent', 121.,  36. ,  1.2 ,   nan,  nan, 15.4, 44.,  7800., 5.2, b'yes', b'yes', b'no', b'good', b'no', b'no', b'ckd')
 ( 7.,  50., b'1.020', b'4', b'0', b'?', b'normal', b'notpresent', b'notpresent',  nan,  18. ,  0.8 ,   nan,  nan, 11.3, 38.,  6000., nan, b'no', b'no', b'no', b'good', b'no', b'no', b'ckd')
 (62.,  80., b'1.010', b'2', b'3', b'normal', b'normal', b'notpresent', b'notpresent', 423.,  53. ,  1.8 ,   nan,  nan,  9.6, 31.,  7500., nan, b'no', b'yes', b'no', b'poor', b'no', b'yes', b'ckd')
 (48.,  70., b'1.005', b'4', b'0', b'normal', b'abnormal', b'present', b'notpresent', 117.,  56. ,  3.8 , 111. ,  2.5, 11.2, 32.,  6700., 3.9, b'yes', b'no', b'no', b'poor', b'yes', b'yes', b'ckd')
 (51.,  80., b'1.010', b'2', b'0', b'normal

## 3. Convert fixed ARFF to CSV format

In [125]:
# Re-read the repaired ARFF file and wrap the record array in a DataFrame.
data, meta = arff.loadarff(fixed_arff)
df = pd.DataFrame(data)


def _bytes_to_text(value):
    """Return *value* decoded as UTF-8 when it is bytes; otherwise unchanged."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


# Only object-dtype columns can hold the byte strings produced by the loader.
object_columns = [name for name in df.columns if df[name].dtype == object]
for name in object_columns:
    df[name] = df[name].apply(_bytes_to_text)

# "?" is the missing-value marker in the data; turn it into NaN.
# NOTE: sg, al and su are kept as decoded strings; no numeric conversion is
# applied to them here.
df = df.replace("?", np.nan)

# Export to CSV, writing the row index as a leading "id" column.
df.to_csv(input_csv, index_label="id", index=True)