## Libraries Import

In [None]:
import os
import random

# Data Analysis
import pandas as pd
import polars as pl
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Stats & ML

from sklearn.ensemble import IsolationForest



pl.Config.set_tbl_rows(-1)
%matplotlib inline

## Data Loading

In [None]:
# Load the cumulative daily consumption export into a polars DataFrame.
# infer_schema_length is raised so that many rows are inspected before the
# column dtypes are decided.
try:
    df = pl.read_csv('../exports/cummulative_daily_consumption.csv', infer_schema_length=100000)
    print("Data loaded successfully")
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    # Every later cell needs `df`; re-raise so the failure is reported here
    # instead of surfacing as a NameError further down the notebook.
    raise

In [None]:
df.head()

In [None]:
# cast date string to date datatype
df = df.with_columns(pl.col("DATE").cast(pl.Date))

In [None]:
df.head()

# Data Quality Assessment

Create a temporary daily consumption as it will be used in validity check.

In [None]:
# Day-over-day difference of the cumulative reading, computed per device.
# shift(1) looks at the previous *row* of the same device, so the rows must be
# in chronological order first; otherwise the differences are meaningless.
daily_consumption = (
    df.sort(["DEVICE_ID", "DATE"])
    .with_columns(
        (pl.col("CUMMULATIVE_CONSUMPTION") - pl.col("CUMMULATIVE_CONSUMPTION").shift(1))
        .over("DEVICE_ID")
        .alias("DAILY_CONSUMPTION")
    )
    # Rows whose difference is null (e.g. the first reading of each device,
    # which has no predecessor) are dropped.
    .filter(pl.col("DAILY_CONSUMPTION").is_not_null())
)

### Validity

Validity is strictly the business rule checks on the dataset. 

Rule: if the cumulative reading is 0 or the daily consumption is < 0, the record is invalid. Because the reading is cumulative, it must always increase or, at worst, remain unchanged when there is no consumption, so the daily consumption can never be negative.

The same applies to the cumulative consumption itself: it must keep accumulating and never drop to zero. If it does, this can signify a device reset or repair.

In [None]:
# Step 1: Flag invalid rows based on business rules
daily_consumption = daily_consumption.with_columns(
    pl.when(
        (pl.col("CUMMULATIVE_CONSUMPTION") == 0) | (pl.col("DAILY_CONSUMPTION") < 0)
    )
    .then(0)  # Flag as False / 0 if invalid
    .otherwise(1)  # Otherwise True / 1
    .alias("VALIDITY")
)
daily_consumption.head()

In [None]:
daily_consumption['VALIDITY'].value_counts()

#### Visualization

In [None]:
invalid_devices = daily_consumption.filter(pl.col('VALIDITY') == 0)["DEVICE_ID"].unique()
device_ids = random.sample(list(invalid_devices), min(10, len(invalid_devices)))

In [None]:
for device_id in device_ids:
    device_data = daily_consumption.filter(pl.col('DEVICE_ID')==device_id)
    plt.figure(figsize=(12, 6))
    for validity_, group in device_data.group_by("VALIDITY"):
        validity = validity_[0]
        label = f"{device_id} - {'Valid' if validity == 1 else 'Invalid'}"
        plt.plot(
            group["DATE"],
            group["CUMMULATIVE_CONSUMPTION"],
            marker="o",
            linestyle="-",
            label=label,
            color="green" if validity == 1 else "red",
        )

    # Add plot details
    plt.title("Cumulative Consumption with Validity Highlight (Random Devices)", fontsize=14)
    plt.xlabel("Date", fontsize=12)
    plt.ylabel("Cumulative Consumption", fontsize=12)
    plt.legend(title="Validity", fontsize=10)
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()

    # Show the plot
    plt.show()

### Timeliness

Timeliness measures whether the devices send data at the required time. The inter-arrival time (IAT) regularity is used for this.

In [None]:
daily_consumption.shape[0]

In [None]:
daily_consumption.head()

In [None]:
def compute_timeliness_metric_polars(df: pl.DataFrame, device_col="DEVICE_ID", date_col="DATE"):
    """
    Computes the Inter-Arrival Time Regularity metric for timeliness assessment using Polars.

    For every record the inter-arrival time (IAT, in days since the previous
    record of the same device) is compared with the expected interval of one
    day:

        RAE        = |IAT - 1|
        goodness   = 1 - 2 * RAE   if RAE <= 0.5 else 0
        penalty    = 2 * RAE       if RAE >  0.5 else 0
        TIMELINESS = int(goodness / (1 + penalty))

    With whole-day IATs this yields 1 when the record arrived exactly one day
    after the previous one and 0 otherwise.

    Parameters:
        df (pl.DataFrame): The input dataframe containing timestamps and device IDs.
            It must also contain the "CUMMULATIVE_CONSUMPTION" and "VALIDITY"
            columns, which are passed through to the result.
        device_col (str): The name of the device ID column.
        date_col (str): The name of the date column.

    Returns:
        pl.DataFrame: One row per input record, sorted by device and date, with
        the columns device_col, date_col, "IAT", "RAE", "TIMELINESS",
        "CUMMULATIVE_CONSUMPTION" and "VALIDITY".
    """
    expected_iat = 1  # devices are expected to report once per day

    # Compute inter-arrival time (IAT) in days for each device
    df = df.sort([device_col, date_col])
    df = df.with_columns(
        (pl.col(date_col).diff().dt.total_days().over(device_col)).alias("IAT")
    )

    # The first row of each device has no predecessor, so its IAT is null.
    # Treat it as on time. Only the IAT column is filled: a frame-wide
    # fill_null would also overwrite genuine nulls in the other columns.
    df = df.with_columns(pl.col("IAT").fill_null(expected_iat))

    # Absolute deviation from the expected interval. Dividing by the expected
    # interval (1 day) would give the relative error, which is the same number.
    df = df.with_columns(
        pl.col("IAT").sub(expected_iat).abs().alias("RAE")
    )

    # Compute goodness and penalty
    df = df.with_columns(
        pl.when(pl.col("RAE") <= 0.5)
        .then(1 - 2 * pl.col("RAE"))
        .otherwise(0)
        .alias("goodness"),
        pl.when(pl.col("RAE") > 0.5)
        .then(2 * pl.col("RAE"))
        .otherwise(0)
        .alias("penalty"),
    )

    # Compute the timeliness score per record. The cast to Int8 turns the
    # ratio into a 0/1 flag.
    df = df.with_columns(
        (pl.col("goodness") / (1 + pl.col("penalty"))).cast(pl.Int8).alias("TIMELINESS")
    )

    return df.select([device_col, date_col, "IAT", "RAE", "TIMELINESS","CUMMULATIVE_CONSUMPTION","VALIDITY"])

# Compute timeliness metric
timeliness_df = compute_timeliness_metric_polars(daily_consumption)

In [None]:
timeliness_df.head()

In [None]:
timeliness_df['TIMELINESS'].value_counts()

#### Visualization - Missing data pattern

In [None]:
# Ensure output directory exists
output_dir = "../visualizations/plots"
os.makedirs(output_dir, exist_ok=True)  # plt.savefig below fails if the folder is missing


# Filter data for Device 1187 and sort by date
device_1187 = df.filter(pl.col("DEVICE_ID") == 1187).sort("DATE")

# known missing periods from EDA
missing_periods = [
    ("2024-05-17", "2024-06-17"),
    ("2024-06-17", "2024-07-30"),
    ("2024-07-30", "2024-08-05"),
]

# Convert missing period dates to datetime
missing_periods = [(pd.to_datetime(start), pd.to_datetime(end)) for start, end in missing_periods]

# Readings on the dates where reporting resumed (the end of each gap)
resumption_dates = [end for _, end in missing_periods]
resumption_values = device_1187.filter(pl.col("DATE").is_in(pl.Series("DATE", resumption_dates).cast(pl.Date)))["CUMMULATIVE_CONSUMPTION"]

# Plot reported values
plt.figure(figsize=(12, 6), dpi=300)  # High-resolution output
plt.plot(
    device_1187["DATE"], 
    device_1187["CUMMULATIVE_CONSUMPTION"], 
    marker="o", linestyle="-", color="blue", label="Reported Values"
)

# Highlight missing periods as shaded regions (only label first occurrence)
for i, (start, end) in enumerate(missing_periods):
    plt.axvspan(start, end, color="red", alpha=0.3, label="Long Missing Gaps" if i == 0 else "")

# Formatting
plt.xlabel("Date", fontsize=12)
plt.ylabel("Water Consumption Value", fontsize=12)
plt.title("Missing Data Patterns for Device 1187", fontsize=14)
plt.legend(fontsize=10)
plt.xticks(rotation=45)
plt.grid(True)

# Save figure in high quality for thesis inclusion
plt.savefig(f"{output_dir}/missing_data_device_1187.png", dpi=300, bbox_inches="tight")

plt.show()


In [None]:
# Step 1: Calculate Maximum IAT per Device
iat_ranking = timeliness_df.group_by("DEVICE_ID").agg(
    pl.col("IAT").max().alias("MAX_IAT")
)

In [None]:
iat_ranking.head(20)

### Accuracy

Accuracy is based on statistical outlier detection. Records that are already invalid are excluded from the outlier detection algorithms and are treated as inaccurate automatically. Three approaches are used to detect outliers: Isolation Forest (IF) [3], MAD [1], and IQR (the implementation below uses IQR rather than the Z-Score of [2]).

Because we do not have a ground truth to compare with, we rely on robust statistical outlier detection. Some flagged points may be real events, but this information is not known. We therefore flag only the points that are captured by all three outlier detection techniques: if all three capture a record, it is treated as a true outlier. All other records are considered accurate.

1) J. Byabazaire, G. M. P. O’Hare and D. T. Delaney, "End-to-End Data Quality Assessment Using Trust for Data Shared IoT Deployments," in IEEE Sensors Journal, vol. 22, no. 20, pp. 19995-20009, 15 Oct. 2022, doi: 10.1109/JSEN.2022.3203853
2) Görenekli, K.; Gülbağ, A. Comparative Analysis of Machine Learning Techniques for Water Consumption Prediction: A Case Study from Kocaeli Province. Sensors 2024, 24, 5846. https://doi.org/10.3390/s24175846
3) Isolation Forest: TOWARDS EXPLAINABLE AUTOMATED DATA QUALITY ENHANCEMENT WITHOUT DOMAIN KNOWLEDGE


In [None]:

# IQR method
def detect_outliers_iqr(data):
    """
    Flag the values that fall outside the 1.5 * IQR fences.

    Parameters:
        data (np.ndarray): Numeric values to screen.

    Returns:
        np.ndarray: Boolean mask, True where a value lies below
        Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    first_quartile, third_quartile = (np.percentile(data, q) for q in (25, 75))
    spread = third_quartile - first_quartile
    too_low = data < first_quartile - 1.5 * spread
    too_high = data > third_quartile + 1.5 * spread
    return too_low | too_high

# MAD method
def detect_outliers_mad(data, threshold=3):
    """
    Flag outliers using the modified z-score built on the median absolute
    deviation (MAD): 0.6745 * (x - median) / MAD.

    Parameters:
        data (array-like): Numeric values to screen.
        threshold (float): A value is an outlier when the absolute modified
            z-score exceeds this number.

    Returns:
        np.ndarray: Boolean mask, True where a value is an outlier.

    When the MAD is 0 (at least half of the values equal the median) the
    modified z-score is undefined. Every value that differs from the median is
    flagged in that case. This matches what the division by zero used to
    produce (inf for deviating values, nan for the rest), but without the
    RuntimeWarning and without relying on inf/nan comparisons.
    """
    data = np.asarray(data)
    median = np.median(data)
    deviation = data - median
    mad = np.median(np.abs(deviation))
    if mad == 0:
        return deviation != 0
    modified_z_score = 0.6745 * deviation / mad
    return np.abs(modified_z_score) > threshold

# Isolation Forest method
def detect_outliers_isolation_forest(data):
    """
    Flag outliers with a scikit-learn IsolationForest fitted on the values.

    Parameters:
        data (np.ndarray): One-dimensional numeric array; it is reshaped into
            a single feature column before fitting.

    Returns:
        np.ndarray: Boolean mask, True where fit_predict labelled the value -1.
    """
    single_feature = data.reshape(-1, 1)
    forest = IsolationForest(contamination='auto', random_state=42)  # fixed seed for reproducible flags
    labels = forest.fit_predict(single_feature)
    return labels == -1

In [None]:
# Outlier detection workflow for a single device
def detect_outliers_per_device(device_data):
    """
    Run the three outlier detectors on one device and combine their votes.

    Parameters:
        device_data (pl.DataFrame): Records of a single device. It must contain
            the "VALIDITY" (0/1) and "CUMMULATIVE_CONSUMPTION" columns.

    Returns:
        pl.DataFrame: The input records with four boolean columns appended:
        "FINAL_OUTLIER", "OUTLIER_IQR", "OUTLIER_MAD" and "OUTLIER_ISO".
        Valid records come first, followed by the invalid ones.

        - Invalid records (VALIDITY == 0) are never passed to the detectors;
          they get FINAL_OUTLIER = True and False for the per-method flags.
        - Valid records get FINAL_OUTLIER = True only when IQR, MAD and
          Isolation Forest all flag them.

    The same columns are returned even when the device has no valid record,
    so that the per-device results can always be concatenated.
    """
    method_columns = ("OUTLIER_IQR", "OUTLIER_MAD", "OUTLIER_ISO")

    # Invalid records are inaccurate by definition, so they start as outliers.
    # The per-method flags default to False for every row; adding them here
    # keeps the schema identical on every return path.
    device_data = device_data.with_columns(
        pl.when(pl.col("VALIDITY") == 0)
        .then(True)
        .otherwise(False)
        .alias("FINAL_OUTLIER"),
        *[pl.lit(False).alias(name) for name in method_columns],
    )

    # Only valid records take part in the statistical detection
    valid_data = device_data.filter(pl.col("VALIDITY") == 1)
    if valid_data.is_empty():
        return device_data

    data = valid_data["CUMMULATIVE_CONSUMPTION"].to_numpy()

    iqr_outliers = detect_outliers_iqr(data)
    mad_outliers = detect_outliers_mad(data)
    iso_outliers = detect_outliers_isolation_forest(data)

    # A record is a final outlier only if all three methods agree (logical AND)
    final_outlier = np.logical_and(iqr_outliers, np.logical_and(mad_outliers, iso_outliers))

    # Overwrite the default flags of the valid records with the detector output
    valid_data = valid_data.with_columns(
        pl.Series("OUTLIER_IQR", iqr_outliers),
        pl.Series("OUTLIER_MAD", mad_outliers),
        pl.Series("OUTLIER_ISO", iso_outliers),
        pl.Series("FINAL_OUTLIER", final_outlier),
    )

    # Invalid records keep FINAL_OUTLIER = True and the False per-method flags
    invalid_data = device_data.filter(pl.col("VALIDITY") == 0)

    # Combine updated valid data with invalid data
    return valid_data.vstack(invalid_data)

# Group data by DEVICE_ID and apply the workflow
result = []
for device_id, group in timeliness_df.group_by("DEVICE_ID"):
    processed_group = detect_outliers_per_device(group)
    result.append(processed_group)

# Concatenate results into a single DataFrame
result_df = pl.concat(result)

In [None]:
result_df['OUTLIER_MAD'].value_counts()

In [None]:
result_df['OUTLIER_ISO'].value_counts()

In [None]:
result_df['OUTLIER_IQR'].value_counts()

In [None]:
result_df['FINAL_OUTLIER'].value_counts()

#### Visualization

In [None]:
# Step 1: Aggregate to count outliers per device
outlier_summary = (
    result_df
    .filter(pl.col("FINAL_OUTLIER") == 1)  # Filter rows marked as outliers
    .group_by("DEVICE_ID")
    .agg(pl.count().alias("OUTLIER_COUNT"))  # Count outliers per device
    .sort("OUTLIER_COUNT", descending=True)  # Sort devices by the number of outliers
)

# Step 2: Select top N devices with the most outliers
top_n = 5  # Adjust this value as needed
top_devices = outlier_summary.head(top_n)

# Step 3: Filter data for the top devices
top_device_ids = top_devices["DEVICE_ID"].to_list()
top_device_data = result_df.filter(pl.col("DEVICE_ID").is_in(top_device_ids))


for device_id in top_device_ids:
    device_data = top_device_data.filter(pl.col("DEVICE_ID") == device_id)
    # Step 4: Visualize cumulative consumption for the top devices
    plt.figure(figsize=(12, 8))
    # Plot valid points
    plt.plot(
        device_data.filter(pl.col("FINAL_OUTLIER") == 0)["DATE"],
        device_data.filter(pl.col("FINAL_OUTLIER") == 0)["CUMMULATIVE_CONSUMPTION"],
        marker="o",
        label=f"{device_id} - Valid",
        linestyle="-",
        alpha=0.8,
    )
    
    # Plot outliers
    plt.scatter(
        device_data.filter(pl.col("FINAL_OUTLIER") == 1)["DATE"],
        device_data.filter(pl.col("FINAL_OUTLIER") == 1)["CUMMULATIVE_CONSUMPTION"],
        color="red",
        label=f"{device_id} - Outlier",
        alpha=0.8,
    )

    # Add plot details
    plt.title("Top Devices with the Most Outliers", fontsize=14)
    plt.xlabel("Date", fontsize=12)
    plt.ylabel("Cumulative Consumption", fontsize=12)
    plt.legend(fontsize=10, title="Legend")
    plt.grid(alpha=0.5, linestyle="--")
    plt.tight_layout()
    
    # Show plot
    plt.show()


In [None]:
outlier_summary.head(10)

In [None]:
result_df.head()

#### Determine Accuracy using Final Outlier

In [None]:
# Step 1: Create an Accuracy Column
result_df = result_df.with_columns(
    pl.when(pl.col("FINAL_OUTLIER") == 0)
    .then(1)  # Mark as accurate if FINAL_OUTLIER is 0
    .otherwise(0)  # Mark as inaccurate if FINAL_OUTLIER is not 0
    .alias("ACCURACY")
)

result_df.head()

#### Nullify the outliers

In [None]:
result_df = result_df.with_columns(
        pl.when(pl.col("FINAL_OUTLIER"))
        .then(None)  # Replace outliers with None (null)
        .otherwise(pl.col("CUMMULATIVE_CONSUMPTION"))
        .alias("CUMMULATIVE_CONSUMPTION")
    )

In [None]:
# Count the null values in the 'CUMMULATIVE_CONSUMPTION' column
# (the outliers were replaced with null in the previous cell)
nan_count_column = result_df.select(
    pl.col("CUMMULATIVE_CONSUMPTION").is_null().sum().alias("NaN_Count")
)
print(nan_count_column)


### Completeness

In [None]:
# Step 1: Calculate Date Range (Per Device)
date_range = result_df.group_by("DEVICE_ID").agg([
    pl.col("DATE").min().alias("START_DATE"),
    pl.col("DATE").max().alias("END_DATE"),
])

# Step 2: Calculate Expected Days (Device-Level).
# + 1 so that both the start and the end date are counted.
date_range = date_range.with_columns(
    ((pl.col("END_DATE") - pl.col("START_DATE")).dt.total_days() + 1).alias("EXPECTED_DAYS")
)

# Step 3: Filter out rows with null values in 'CUMMULATIVE_CONSUMPTION'
# (nullified outliers do not count as reported data)
filtered_df = result_df.filter(pl.col("CUMMULATIVE_CONSUMPTION").is_not_null())

# Step 4: Count Reported Days (Device-Level) after removing nulls
reported_days = filtered_df.group_by("DEVICE_ID").agg(
    pl.col("DATE").n_unique().alias("REPORTED_DAYS")  # Use n_unique to count unique dates
)

# Step 5: Join Expected and Reported Days.
# Left join: a device whose readings were all nullified has no row in
# reported_days. It must stay in the result with 0 reported days (0 %
# completeness) instead of silently disappearing from the assessment.
completeness_df = date_range.join(reported_days, on="DEVICE_ID", how="left").with_columns(
    pl.col("REPORTED_DAYS").fill_null(0)
)

# Calculate Completeness Metric
completeness_df = completeness_df.with_columns(
    (pl.col("REPORTED_DAYS") / pl.col("EXPECTED_DAYS") * 100).alias("COMPLETENESS_PERCENTAGE")
)

# Display result
completeness_df.head()

In [None]:
completeness_df['DEVICE_ID'].n_unique()

In [None]:
completeness_df.sort('COMPLETENESS_PERCENTAGE', descending=False).head(10)

In [None]:
completeness_df.sort('REPORTED_DAYS', descending=False).head(10)

#### Visualization

In [None]:

# Step 1: Sort by Completeness (ascending) and select the 30 devices with the
# lowest completeness
top_devices = completeness_df.sort("COMPLETENESS_PERCENTAGE").head(30)["DEVICE_ID"].to_list()

# Step 2: Filter Data for Top Devices
top_device_data = result_df.filter(pl.col("DEVICE_ID").is_in(top_devices))

# Step 3: Prepare Data for Heatmap
heatmap_data = top_device_data.select(["DEVICE_ID", "DATE", "CUMMULATIVE_CONSUMPTION"]).to_pandas()

# Ensure the DATE column is in datetime format for proper sorting
heatmap_data["DATE"] = pd.to_datetime(heatmap_data["DATE"])

# Pivot for Heatmap (Devices as Rows, Dates as Columns).
# Dates a device never reported become NaN in the pivot, like nullified values.
heatmap_pivot = heatmap_data.pivot(index="DEVICE_ID", columns="DATE", values="CUMMULATIVE_CONSUMPTION")

# Step 4: Plot the Heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(
    heatmap_pivot.isna(),  # Highlight missing data (NaN)
    cmap="Reds",  # Missing data in shades of red
    cbar=False,
    linewidths=0.5,
    linecolor="gray"
)

# The title states 30 devices, matching head(30) above
plt.title("Missing Data Heatmap (Top 30 Devices by Poor Completeness)", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Device ID", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.savefig(f"{output_dir}/completeness_heatmap.png", dpi=300, bbox_inches="tight")
plt.show()


## Dimensions Scoring

In [None]:
result_df.head()

### Accuracy

Compute accuracy for each device and the whole dataset

In [None]:
# Step 1: Compute Accuracy Per Device
accuracy_df = result_df.group_by("DEVICE_ID").agg([
    pl.sum("ACCURACY").alias("ACCURATE_RECORDS"),
    pl.count().alias("TOTAL_RECORDS"),
])

# Step 2: Compute Accuracy Percentage
accuracy_df = accuracy_df.with_columns(
    ((pl.col("ACCURATE_RECORDS") / pl.col("TOTAL_RECORDS")) * 100).alias("ACCURACY_PERCENT")
)

# Step 3: Compute Overall Dataset Accuracy
overall_accuracy = (result_df["ACCURACY"].sum() / result_df.height) * 100

print(f"Overall Dataset Accuracy: {overall_accuracy:.2f}%")

In [None]:
accuracy_df.sort('ACCURACY_PERCENT', descending=False).head(10)

#### Visualization and exports

In [None]:
# Export csv
accuracy_df.write_csv('../exports/accuracy_dqs.csv')

In [None]:
# pandas copy of the per-device accuracy table, used by the seaborn plots
accuracy_df_pd = accuracy_df.to_pandas()
# Ensure the output directory exists
output_dir = "../visualizations/plots"
os.makedirs(output_dir, exist_ok=True)

# Plot 1: Bar Chart of Accuracy Scores per Device
plt.figure(figsize=(15, 6))
# Pass the pandas frame, as the other plots in this notebook do
sns.barplot(x="DEVICE_ID", y="ACCURACY_PERCENT", data=accuracy_df_pd)
plt.xticks([], [])  # Hide x-axis labels for readability
plt.xlabel("Devices")
plt.ylabel("Accuracy Score (%)")
plt.title("Accuracy Score per Device (Abyei Camp)")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/accuracy_per_device.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 2: Histogram of Accuracy Scores
plt.figure(figsize=(10, 6))
sns.histplot(accuracy_df_pd["ACCURACY_PERCENT"], bins=20, kde=True, color="blue")
plt.xlabel("Accuracy Score (%)")
plt.ylabel("Number of Devices")
plt.title("Distribution of Device Accuracy Scores")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/accuracy_histogram.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Plot 5: Top & Bottom 30 Devices by Accuracy Score
# Ascending sort: head(30) = lowest scores, tail(30) = highest scores
df_sorted = accuracy_df_pd.sort_values("ACCURACY_PERCENT", ascending=True)

plt.figure(figsize=(15, 6))
sns.barplot(x="DEVICE_ID", y="ACCURACY_PERCENT", data=df_sorted.head(30), color="red", label="Bottom 30 Devices")
sns.barplot(x="DEVICE_ID", y="ACCURACY_PERCENT", data=df_sorted.tail(30), color="green", label="Top 30 Devices")
plt.xticks(rotation=90)
plt.xlabel("Device ID")
plt.ylabel("Accuracy Score (%)")
# Title matches the 30 + 30 devices plotted above
plt.title("Top & Bottom 30 Devices by Accuracy Score")
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/accuracy_top_bottom.png", dpi=300, bbox_inches="tight")
plt.show()

### Validity

In [None]:
# Step 2: Calculate Validity Per Device
validity_per_device = result_df.group_by("DEVICE_ID").agg([
    pl.sum("VALIDITY").alias("VALID_RECORDS"),
    pl.count().alias("TOTAL_RECORDS")
])

# Step 3: Calculate Validity Percentage
validity_per_device = validity_per_device.with_columns(
    ((pl.col("VALID_RECORDS") / pl.col("TOTAL_RECORDS")) * 100).alias("VALIDITY_PERCENT")
)

# Step 4: Compute Overall Validity (Entire Dataset)
overall_validity = (result_df["VALIDITY"].sum() / result_df.height) * 100

overall_validity

In [None]:
validity_per_device.sort('VALIDITY_PERCENT',descending=False).head()

#### Visualization and exports

In [None]:
# Assuming your dataframe is named `validity_per_device` and contains 'DEVICE_ID' and 'VALIDITY_PERCENT' columns
validity_scores = validity_per_device.to_pandas()
validity_df = validity_scores.copy()

# Plot 1: Bar Chart of Validity Scores per Device
plt.figure(figsize=(15, 6))
sns.barplot(x="DEVICE_ID", y="VALIDITY_PERCENT", data=validity_df)
plt.xticks([], [])  # Hide x-axis labels for readability
plt.xlabel("Devices")
plt.ylabel("Validity Score (%)")
plt.title("Validity Score per Device (Abyei Camp)")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/validity_per_device.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 2: Histogram of Validity Scores
plt.figure(figsize=(10, 6))
sns.histplot(validity_df["VALIDITY_PERCENT"], bins=20, kde=True, color="blue")
plt.xlabel("Validity Score (%)")
plt.ylabel("Number of Devices")
plt.title("Distribution of Device Validity Scores")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/validity_histogram.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 3: Boxplot of Validity Scores
plt.figure(figsize=(8, 5))
sns.boxplot(x=validity_df["VALIDITY_PERCENT"], color="cyan")
plt.xlabel("Validity Score (%)")
plt.title("Boxplot of Device Validity Scores")
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/validity_boxplot.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Plot 4: Scatter Plot - Validity Score vs. Total Records
plt.figure(figsize=(10, 6))
sns.scatterplot(x="TOTAL_RECORDS", y="VALIDITY_PERCENT", data=validity_df, alpha=0.7, color="purple")
plt.xlabel("Total Records per Device")
plt.ylabel("Validity Score (%)")
plt.title("Scatter Plot: Validity Score vs. Total Records")
plt.grid(True, linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/validity_scatter.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 5: Top & Bottom 30 Devices by Validity Score
# Ascending sort: head(30) = lowest scores, tail(30) = highest scores
df_sorted = validity_df.sort_values("VALIDITY_PERCENT", ascending=True)

plt.figure(figsize=(15, 6))
sns.barplot(x="DEVICE_ID", y="VALIDITY_PERCENT", data=df_sorted.head(30), color="red", label="Bottom 30 Devices")
sns.barplot(x="DEVICE_ID", y="VALIDITY_PERCENT", data=df_sorted.tail(30), color="green", label="Top 30 Devices")
plt.xticks(rotation=90)
plt.xlabel("Device ID")
plt.ylabel("Validity Score (%)")
# Title matches the 30 + 30 devices plotted above
plt.title("Top & Bottom 30 Devices by Validity Score")
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/validity_top_bottom.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:

# Plot 6: Distribution of Devices by Validity Score Ranges
# Left-closed bins so that the intervals match their labels:
# [0, 50) -> "<50%", [50, 70) -> "50-69%", ..., [95, inf) -> "95-100%".
# With right-closed bins a score of exactly 0 fell outside every bin and was
# dropped, and a score of exactly 50 was counted as "<50%".
bins = [0, 50, 70, 85, 95, np.inf]
labels = ["<50%", "50-69%", "70-84%", "85-94%", "95-100%"]
validity_df["VALIDITY_BIN"] = pd.cut(validity_df["VALIDITY_PERCENT"], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.countplot(x="VALIDITY_BIN", data=validity_df, palette="Blues_r")
plt.xlabel("Validity Score Ranges")
plt.ylabel("Number of Devices")
plt.title("Distribution of Devices by Validity Score Range")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/validity_distribution.png", dpi=300, bbox_inches="tight")
plt.show()


### Timeliness

In [None]:
result_df.head()

In [None]:
# Aggregate Timeliness Per Device
timeliness_per_device = result_df.group_by("DEVICE_ID").agg([
    (pl.mean("TIMELINESS")*100).alias("AVG_TIMELINESS"),
    pl.count().alias("TOTAL_RECORDS")
])

# Calculate Overall Timeliness for Entire Dataset
overall_timeliness = result_df["TIMELINESS"].mean()

print(f"Overall Dataset Timeliness: {overall_timeliness:.2%}")

In [None]:
timeliness_per_device.sort('AVG_TIMELINESS',descending=False).head()

#### Visualization and exports

In [None]:
# Assuming your dataframe is named `timeliness_per_device` and contains 'DEVICE_ID' and 'AVG_TIMELINESS' columns
timeliness_scores = timeliness_per_device.to_pandas()
timeliness_df = timeliness_scores.copy()

# Plot 1: Bar Chart of Timeliness Scores per Device
plt.figure(figsize=(15, 6))
sns.barplot(x="DEVICE_ID", y="AVG_TIMELINESS", data=timeliness_df)
plt.xticks([], [])  # Hide x-axis labels for readability
plt.xlabel("Devices")
plt.ylabel("Timeliness Score (%)")
plt.title("Timeliness Score per Device (Abyei Camp)")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/timeliness_per_device.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 2: Histogram of Timeliness Scores
plt.figure(figsize=(10, 6))
sns.histplot(timeliness_df["AVG_TIMELINESS"], bins=20, kde=True, color="blue")
plt.xlabel("Timeliness Score (%)")
plt.ylabel("Number of Devices")
plt.title("Distribution of Device Timeliness Scores")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/timeliness_histogram.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:

# Plot 3: Scatter Plot - Timeliness Score vs. Total Records
plt.figure(figsize=(10, 6))
sns.scatterplot(x="TOTAL_RECORDS", y="AVG_TIMELINESS", data=timeliness_df, alpha=0.7, color="purple")
plt.xlabel("Total Records per Device")
plt.ylabel("Timeliness Score (%)")
plt.title("Scatter Plot: Timeliness Score vs. Total Records")
plt.grid(True, linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/timeliness_scatter.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Plot 4: Top & Bottom 30 Devices by Timeliness Score
# Ascending sort: head(30) = lowest scores, tail(30) = highest scores
df_sorted = timeliness_df.sort_values("AVG_TIMELINESS", ascending=True)

plt.figure(figsize=(15, 6))
sns.barplot(x="DEVICE_ID", y="AVG_TIMELINESS", data=df_sorted.head(30), color="red", label="Bottom 30 Devices")
sns.barplot(x="DEVICE_ID", y="AVG_TIMELINESS", data=df_sorted.tail(30), color="green", label="Top 30 Devices")
plt.xticks(rotation=90)
plt.xlabel("Device ID")
plt.ylabel("Timeliness Score (%)")
# Title matches the 30 + 30 devices plotted above
plt.title("Top & Bottom 30 Devices by Timeliness Score")
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/timeliness_top_bottom.png", dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Plot 5: Distribution of Devices by Timeliness Score Ranges
# Left-closed bins so that the intervals match their labels:
# [0, 50) -> "<50%", [50, 70) -> "50-69%", ..., [95, inf) -> "95-100%".
# With right-closed bins a score of exactly 0 fell outside every bin and was
# dropped, and a score of exactly 50 was counted as "<50%".
bins = [0, 50, 70, 85, 95, np.inf]
labels = ["<50%", "50-69%", "70-84%", "85-94%", "95-100%"]
timeliness_df["TIMELINESS_BIN"] = pd.cut(timeliness_df["AVG_TIMELINESS"], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.countplot(x="TIMELINESS_BIN", data=timeliness_df, palette="Blues_r")
plt.xlabel("Timeliness Score Ranges")
plt.ylabel("Number of Devices")
plt.title("Distribution of Devices by Timeliness Score Range")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/timeliness_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

### Completeness

In [None]:
completeness_df.head()

In [None]:
# Step 1: Calculate Total Valid Reported and Expected Days
total_valid_reported = completeness_df["REPORTED_DAYS"].sum()
total_expected = completeness_df["EXPECTED_DAYS"].sum()

# Step 2: Calculate Overall Completeness
overall_completeness = (total_valid_reported / total_expected) * 100

# Display Results
print(f"Overall Dataset Completeness: {overall_completeness:.2f}%")

#### Visualization and exports

In [None]:
# Pie Chart to Visualize Completeness
labels = ['Complete Data', 'Incomplete Data']
values = [total_valid_reported, total_expected - total_valid_reported]
colors = ['#4CAF50', '#F44336']  # Green and Red

plt.figure(figsize=(8, 8))
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, wedgeprops={'edgecolor': 'white'})
plt.title("Overall Dataset Completeness")
plt.show()


#### Visualization and exports

In [None]:
# Plot 1: Bar Chart of Completeness Scores per Device
plt.figure(figsize=(15, 6))
# completeness_df is a polars frame; convert it to pandas for seaborn, as the
# accuracy, validity and timeliness plots do
sns.barplot(x="DEVICE_ID", y="COMPLETENESS_PERCENTAGE", data=completeness_df.to_pandas())
plt.xticks([], [])  # Hide x-axis labels for readability
plt.xlabel("Devices")
plt.ylabel("Completeness Score (%)")
plt.title("Completeness Score per Device (Abyei Camp)")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/completeness_per_device.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 2: Histogram of Completeness Scores
plt.figure(figsize=(10, 6))
# Convert the polars column to a pandas Series for seaborn, as the other
# histogram cells do
sns.histplot(completeness_df["COMPLETENESS_PERCENTAGE"].to_pandas(), bins=20, kde=True, color="blue")
plt.xlabel("Completeness Score (%)")
plt.ylabel("Number of Devices")
plt.title("Distribution of Device Completeness Scores")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/completeness_histogram.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 3: Scatter Plot - Completeness Score vs. Total Reported Days
plt.figure(figsize=(10, 6))
# completeness_df is a polars frame; convert it to pandas for seaborn, as the
# accuracy, validity and timeliness plots do
sns.scatterplot(x="REPORTED_DAYS", y="COMPLETENESS_PERCENTAGE", data=completeness_df.to_pandas(), alpha=0.7, color="purple")
plt.xlabel("Total Reported Days per Device")
plt.ylabel("Completeness Score (%)")
plt.title("Scatter Plot: Completeness Score vs. Reported Days")
plt.grid(True, linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/completeness_scatter.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
completeness_df.head()

In [None]:
# Plot 4: Top & Bottom 30 Devices by Completeness Score
completeness_df_pd = completeness_df.to_pandas()

# Ascending sort: head(30) = lowest scores, tail(30) = highest scores
df_sorted = completeness_df_pd.sort_values("COMPLETENESS_PERCENTAGE", ascending=True)

plt.figure(figsize=(15, 6))
sns.barplot(x="DEVICE_ID", y="COMPLETENESS_PERCENTAGE", data=df_sorted.head(30), color="red", label="Bottom 30 Devices")
sns.barplot(x="DEVICE_ID", y="COMPLETENESS_PERCENTAGE", data=df_sorted.tail(30), color="green", label="Top 30 Devices")
plt.xticks(rotation=90)
plt.xlabel("Device ID")
plt.ylabel("Completeness Score (%)")
# Title matches the 30 + 30 devices plotted above
plt.title("Top & Bottom 30 Devices by Completeness Score")
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/completeness_top_bottom.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Plot 5: Distribution of Devices by Completeness Score Ranges
# Left-closed bins so that the intervals match their labels:
# [0, 50) -> "<50%", [50, 70) -> "50-69%", ..., [95, inf) -> "95-100%".
# With right-closed bins a score of exactly 0 fell outside every bin and was
# dropped, and a score of exactly 50 was counted as "<50%".
bins = [0, 50, 70, 85, 95, np.inf]
labels = ["<50%", "50-69%", "70-84%", "85-94%", "95-100%"]
completeness_df_pd["COMPLETENESS_BIN"] = pd.cut(completeness_df_pd["COMPLETENESS_PERCENTAGE"], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.countplot(x="COMPLETENESS_BIN", data=completeness_df_pd, palette="Blues_r")
plt.xlabel("Completeness Score Ranges")
plt.ylabel("Number of Devices")
plt.title("Distribution of Devices by Completeness Score Range")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(f"{output_dir}/completeness_distribution.png", dpi=300, bbox_inches="tight")
plt.show()

## Overall Data Quality Score

In [None]:
# Step 1: Combine All Data Quality Metrics
metrics_df = completeness_df.select([
    "DEVICE_ID",
    "COMPLETENESS_PERCENTAGE",
    "REPORTED_DAYS"
]).join(
    accuracy_df.select(["DEVICE_ID", "ACCURACY_PERCENT"]),
    on="DEVICE_ID"
).join(
    timeliness_per_device.select(["DEVICE_ID", "AVG_TIMELINESS"]),
    on="DEVICE_ID"
).join(
    validity_per_device.select(["DEVICE_ID", "VALIDITY_PERCENT"]),
    on="DEVICE_ID"
)
metrics_df.head()

In [None]:
# Step 2: Sort by Each Metric to Identify Top and Bottom Devices
top_devices = metrics_df.sort([
    "ACCURACY_PERCENT",
    "COMPLETENESS_PERCENTAGE",
    "AVG_TIMELINESS",
    "VALIDITY_PERCENT"
], descending=True)

bottom_devices = metrics_df.sort([
    "COMPLETENESS_PERCENTAGE",
], descending=False)

In [None]:
top_devices.head()

In [None]:
bottom_devices.head()

### Weighting

#### Equal Weighting

In [None]:
# Compute Initial DQS
metrics_df = metrics_df.with_columns(
    (
        0.25 * pl.col("COMPLETENESS_PERCENTAGE") +
        0.25 * pl.col("ACCURACY_PERCENT") +
        0.25 * pl.col("AVG_TIMELINESS") +
        0.25 * pl.col("VALIDITY_PERCENT")
    ).alias("DQS_EQUAL")
)
metrics_df.sort('DQS_EQUAL',descending=False).head()

#### AHP

In [None]:
file_path = "../data/AHP Voting for Data Quality Score (Responses) - Form responses .csv"
ahp_response = pd.read_csv(file_path)

In [None]:
ahp_response.head()

The first response is not used because it is incomplete, so only the second and third responses are used to avoid bias.

The RI values were derived from simulations of randomly generated matrices, and they vary based on the number of criteria ($n$):

n (Number of Criteria) - RI Value
- 1	0
- 2	0
- 3	0.58
- 4	0.90
- 5	1.12
- 6	1.24
- 7	1.32
- 8	1.41
- 9	1.45
- 10	1.49


In our case, since we have 4 criteria (Accuracy, Validity, Completeness, Timeliness), we use RI = 0.90.

In [None]:
# Filter the dataset to use only the last two complete responses
df_filtered = ahp_response.iloc[-2:, :]

criteria = [
    "Accuracy",
    "Validity",
    "Completeness",
    "Timeliness"
]

n = len(criteria)
random_index = 0.90  # Random Index (RI) for n = 4 criteria

# Extract the pairwise comparisons again with the filtered data.
# Each judgement is read by column position in the form export
# (assumes the column layout of the responses file is unchanged).
comparisons_filtered = {
    ("Accuracy", "Validity"): df_filtered.iloc[:, 2].astype(float),
    ("Accuracy", "Completeness"): df_filtered.iloc[:, 4].astype(float),
    ("Validity", "Completeness"): df_filtered.iloc[:, 6].astype(float),
    ("Validity", "Timeliness"): df_filtered.iloc[:, 8].astype(float),
    ("Accuracy", "Timeliness"): df_filtered.iloc[:, 10].astype(float),
    ("Completeness", "Timeliness"): df_filtered.iloc[:, 12].astype(float),
}

# Compute the mean for the filtered responses
# NOTE(review): this is the arithmetic mean across respondents; AHP group
# aggregation is often done with the geometric mean. Confirm which is intended.
pairwise_means_filtered = {key: values.mean() for key, values in comparisons_filtered.items()}

# Construct the pairwise comparison matrix (reciprocal: a[j, i] = 1 / a[i, j])
matrix_filtered = np.ones((n, n))
for (c1, c2), mean_value in pairwise_means_filtered.items():
    i, j = criteria.index(c1), criteria.index(c2)
    matrix_filtered[i, j] = mean_value
    matrix_filtered[j, i] = 1 / mean_value

# Compute the priority vector from the principal eigenvector.
# np.linalg.eig returns complex arrays when any eigenvalue is complex, so the
# principal eigenvalue is selected and stored through its real part. Otherwise
# lambda_max and the consistency ratio below would be complex numbers.
eigenvalues_filtered, eigenvectors_filtered = np.linalg.eig(matrix_filtered)
principal_index = np.argmax(eigenvalues_filtered.real)
max_eigenvalue_filtered = eigenvalues_filtered[principal_index].real
priority_vector_filtered = np.real(eigenvectors_filtered[:, principal_index])
priority_vector_filtered /= priority_vector_filtered.sum()  # normalise so the weights sum to 1

# Compute the consistency ratio: CI = (lambda_max - n) / (n - 1), CR = CI / RI
consistency_index_filtered = (max_eigenvalue_filtered - n) / (n - 1)

consistency_ratio_filtered = consistency_index_filtered / random_index

# Prepare results for display
ahp_results = pd.DataFrame({
    "Criteria": criteria,
    "Priority Weight": priority_vector_filtered
}).sort_values(by="Priority Weight", ascending=False)

ahp_results.head()

In [None]:
consistency_ratio_filtered

In [None]:
ahp_weights = {
    "timeliness": 0.054102,
    "validity": 0.310792,
    "completeness": 0.317553,
    "accuracy": 0.317553
}

metrics_df = metrics_df.with_columns(
    (
        ahp_weights["accuracy"] * pl.col("ACCURACY_PERCENT") +
        ahp_weights["completeness"] * pl.col("COMPLETENESS_PERCENTAGE") +
        ahp_weights["timeliness"] * pl.col("AVG_TIMELINESS") +
        ahp_weights["validity"] * pl.col("VALIDITY_PERCENT")
    ).alias("DQS_AHP")
)

In [None]:
metrics_df.sort('DQS_AHP',descending=False).head()

### Final Score

In [None]:
# Step 1: Define Combination Weights (e.g., 50% for both)
equal_weight = 0.5
ahp_weight = 0.5

# Step 2: Compute Final DQS for Each Device
metrics_df = metrics_df.with_columns(
    (
        (pl.col("DQS_EQUAL") * equal_weight) +
        (pl.col("DQS_AHP") * ahp_weight)
    ).alias("DQS_FINAL_DEVICE")
)

In [None]:
metrics_df.sort('COMPLETENESS_PERCENTAGE',descending=False).head()

In [None]:
# Final DQS
metrics_df['DQS_FINAL_DEVICE'].mean()

## Devices to Enhance

In [None]:
filtered_devices = metrics_df.filter((pl.col('ACCURACY_PERCENT') < 95) | (pl.col('COMPLETENESS_PERCENTAGE') < 95))

In [None]:
filtered_devices.write_csv('../exports/devices_to_clean.csv')

In [None]:
filtered_devices.head()

In [None]:
filtered_devices.shape[0]