In [None]:
import json

import pandas as pd

# Sample sensor readings with deliberate quality issues. One tuple per row,
# in the order (sensor_id, truck_id, temp, humidity):
#   row 0 - out-of-range temperature and humidity
#   row 1 - empty truck_id and missing humidity
#   rows 2 and 3 - exact duplicates of each other
test_data = pd.DataFrame(
    [
        dict(zip(("sensor_id", "truck_id", "temp", "humidity"), row))
        for row in (
            ("SENS001", "TRK001", -999, 120),
            ("SENS002", "", 25.5, None),
            ("SENS001", "TRK001", 25.5, 65),
            ("SENS001", "TRK001", 25.5, 65),
        )
    ]
)

ModuleNotFoundError: No module named 'pandas'

In [None]:
class DataQualityChecker:
    """Accumulate data-quality violations found in a pandas DataFrame.

    Each ``check_*`` method inspects the frame it is given and appends zero or
    more violation dicts (``type``, ``message``, ``details``) to
    ``self.violations``. ``generate_report`` summarizes what was collected.
    """

    # Inclusive bounds used when the caller does not pass explicit ranges.
    DEFAULT_TEMP_RANGE = (-40, 80)
    DEFAULT_HUMIDITY_RANGE = (0, 100)

    # Lower-cased column names recognized as temperature / humidity columns.
    TEMP_ALIASES = ("temp", "temperature", "t")
    HUMIDITY_ALIASES = ("humidity", "hum", "h")

    def __init__(self):
        self.violations = []

    def _add(self, vtype, message, details=None):
        """Append one violation record to ``self.violations``."""
        self.violations.append({"type": vtype, "message": message, "details": details})

    @staticmethod
    def _as_list(columns):
        """Return ``columns`` as a list, treating a bare string as one name."""
        # Iterating a str would yield single characters, not a column name.
        if isinstance(columns, str):
            return [columns]
        return list(columns)

    @staticmethod
    def _find_column(df, aliases):
        """Return the first column whose lower-cased name is in ``aliases``, else None."""
        # str() keeps non-string labels (e.g. integer columns) from raising.
        return next((c for c in df.columns if str(c).lower() in aliases), None)

    def _check_range(self, df, column, label, low, high):
        """Record a ``value_range`` violation for values of ``column`` outside [low, high].

        A value is invalid when it is non-null but cannot be parsed as a
        number, or when it parses to a number below ``low`` or above ``high``.
        Null values are not flagged here.
        """
        numeric = pd.to_numeric(df[column], errors="coerce")
        # Coercion turned a non-null original into NaN => it was not numeric.
        unparseable = numeric.isna() & df[column].notna()
        out_of_range = (numeric < low) | (numeric > high)
        invalid_mask = unparseable | out_of_range
        count = int(invalid_mask.sum())
        if count > 0:
            samples = df[invalid_mask].head(5).to_dict(orient="records")
            self._add(
                "value_range",
                f"{count} invalid {label} values in {column}",
                {"column": column, "count": count, "samples": samples},
            )

    def check_missing_values(self, df, required_columns):
        """Count missing (NaN) or empty/whitespace-only values in required columns.

        Args:
            df: DataFrame to inspect.
            required_columns: column name or iterable of column names.

        A column absent from ``df`` is reported as ``missing_column``; a
        present column with at least one missing value is reported as
        ``missing_values`` with up to five sample rows.
        """
        for col in self._as_list(required_columns):
            if col not in df.columns:
                self._add("missing_column", f"Column not found: {col}", {"column": col})
                continue
            # consider NaN or empty/whitespace string as missing
            missing_mask = df[col].isna() | (df[col].astype(str).str.strip() == "")
            count = int(missing_mask.sum())
            if count > 0:
                samples = df[missing_mask].head(5).to_dict(orient="records")
                self._add(
                    "missing_values",
                    f"{count} missing in column {col}",
                    {"column": col, "count": count, "samples": samples},
                )

    def check_value_ranges(self, df, temp_range=None, humidity_range=None):
        """Check temperature and humidity columns against inclusive ranges.

        Args:
            df: DataFrame to inspect.
            temp_range: optional ``(low, high)`` pair; defaults to
                ``DEFAULT_TEMP_RANGE`` (-40..80).
            humidity_range: optional ``(low, high)`` pair; defaults to
                ``DEFAULT_HUMIDITY_RANGE`` (0..100).

        Columns are located by case-insensitive name match against
        ``TEMP_ALIASES`` / ``HUMIDITY_ALIASES``; a kind with no matching
        column is skipped.
        """
        temp_low, temp_high = self.DEFAULT_TEMP_RANGE if temp_range is None else temp_range
        hum_low, hum_high = self.DEFAULT_HUMIDITY_RANGE if humidity_range is None else humidity_range

        temp_col = self._find_column(df, self.TEMP_ALIASES)
        hum_col = self._find_column(df, self.HUMIDITY_ALIASES)

        if temp_col is not None:
            self._check_range(df, temp_col, "temperature", temp_low, temp_high)
        if hum_col is not None:
            self._check_range(df, hum_col, "humidity", hum_low, hum_high)

    def check_duplicates(self, df, key_columns):
        """Find rows that share the same values in ``key_columns``.

        Args:
            df: DataFrame to inspect.
            key_columns: column name or iterable of column names forming the key.

        Every row of a duplicate group is counted (``keep=False``). If any key
        column is absent from ``df`` a ``missing_column`` violation is
        recorded and no duplicate check is run.
        """
        key_columns = self._as_list(key_columns)
        missing = [c for c in key_columns if c not in df.columns]
        if missing:
            self._add("missing_column", f"Key columns missing: {missing}", {"missing": missing})
            return
        dup_mask = df.duplicated(subset=key_columns, keep=False)
        count = int(dup_mask.sum())
        if count > 0:
            # duplicated() treats NaN keys as equal, so keep NaN groups here as
            # well; the default dropna=True would drop them from `groups`
            # while they are still included in `count`.
            groups = (
                df[dup_mask]
                .groupby(key_columns, dropna=False)
                .size()
                .reset_index(name="dup_count")
                .to_dict(orient="records")
            )
            samples = df[dup_mask].head(10).to_dict(orient="records")
            self._add(
                "duplicates",
                f"{count} duplicate rows based on {key_columns}",
                {"count": count, "groups": groups, "samples": samples},
            )

    def generate_report(self):
        """Return a dict with the number of violations and the violation records."""
        return {
            "total_violations": len(self.violations),
            # Shallow copy: editing the report's list must not alter checker state.
            "violations": list(self.violations),
        }

In [None]:
# Run every check against the sample data and print the combined report.
checker = DataQualityChecker()
checker.check_missing_values(test_data, ['sensor_id', 'truck_id'])
checker.check_value_ranges(test_data)
checker.check_duplicates(test_data, ['sensor_id', 'truck_id'])
report = checker.generate_report()
# default=str: fall back to str() for any value json cannot encode natively,
# so an unusual cell value in the sample rows does not abort the dump.
print(json.dumps(report, indent=2, default=str))