In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Simulation settings
# Seed both generators used by this notebook (stdlib `random` for the device
# counts below, NumPy's global generator for the per-row metrics) so that a
# fresh-kernel run regenerates exactly the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

num_homes = 30
devices_per_home = [random.randint(3, 5) for _ in range(num_homes)]  # 3-5 devices per home
total_devices = sum(devices_per_home)
start_time = datetime(2025, 4, 10, 0, 0)
end_time = datetime(2025, 4, 13, 0, 0)
interval = timedelta(minutes=10)

# Generate timestamps: every `interval` from start_time through end_time (inclusive)
timestamps = []
current = start_time
while current <= end_time:
    timestamps.append(current)
    current += interval

# Device IDs: (home_id, device label) pairs; home ids are 1-based and device
# labels restart at "device_1" within each home
device_ids = [
    (home_id + 1, f"device_{i+1}")
    for home_id, count in enumerate(devices_per_home)
    for i in range(count)
]

# Helper function to simulate values
def simulate_row(home_id, device_id, ts):
    """Simulate one reading for a device at a given timestamp.

    Every metric is drawn independently from NumPy's global random generator
    on each call; no state is carried over from one call to the next.

    Args:
        home_id: Identifier of the home, passed through to the output.
        device_id: Identifier of the device, passed through to the output.
        ts: Timestamp of the reading, passed through to the output.

    Returns:
        A 10-element list:
        [home_id, device_id, ts, signal, latency, jitter, packet_loss,
         is_connected, score, predicted_outage].
    """
    # Draw order matters for reproducibility under a fixed seed.
    signal = np.random.normal(loc=-60, scale=10)
    latency_val = np.clip(np.random.normal(loc=50, scale=20), 5, 300)
    jitter_val = np.clip(np.random.normal(loc=5, scale=5), 0, 100)
    loss = np.random.choice([0, 0.01, 0.05, 0.1], p=[0.85, 0.1, 0.03, 0.02])
    connected = np.random.choice([1, 0], p=[0.97, 0.03])

    # Experience score (higher = better): average of four penalty terms,
    # subtracted from 1 and clamped to [0, 1].
    penalty = abs(signal + 50) / 50 + latency_val / 300 + jitter_val / 100 + loss * 10
    quality = max(0, min(1, 1 - penalty / 4))

    # Outage flag (rare): requires both a low score and a disconnected device.
    outage_flag = int(quality < 0.3 and connected == 0)

    return [
        home_id,
        device_id,
        ts,
        signal,
        latency_val,
        jitter_val,
        loss,
        connected,
        quality,
        outage_flag,
    ]

# Build dataset: one simulated reading per (device, timestamp) pair,
# generated device by device with timestamps in ascending order
rows = []
for home_id, device_id in device_ids:
    rows.extend(simulate_row(home_id, device_id, ts) for ts in timestamps)

# Create DataFrame (column order matches the list returned by simulate_row)
columns = [
    "home_id",
    "device_id",
    "timestamp",
    "rssi",
    "latency_ms",
    "jitter_ms",
    "packet_loss",
    "is_connected",
    "experience_score",
    "predicted_outage",
]
df = pd.DataFrame(data=rows, columns=columns)

# Write the full dataset to CSV, then show the first rows as the cell output
df.to_csv("netwatch_wifi_data.csv", index=False)
df.head()


Unnamed: 0,home_id,device_id,timestamp,rssi,latency_ms,jitter_ms,packet_loss,is_connected,experience_score,predicted_outage
0,1,device_1,2025-04-10 00:00:00,-46.377657,59.705245,3.71331,0.0,0,0.922851,0
1,1,device_1,2025-04-10 00:10:00,-60.151688,78.831453,9.835976,0.0,1,0.858959,0
2,1,device_1,2025-04-10 00:20:00,-63.716893,79.738369,10.73846,0.0,1,0.838121,0
3,1,device_1,2025-04-10 00:30:00,-58.632815,27.618233,2.829852,0.0,1,0.926746,0
4,1,device_1,2025-04-10 00:40:00,-68.19061,6.676424,3.032769,0.05,1,0.770901,0


In [2]:
# Work on a copy so the clean frame `df` stays untouched.
df_dirty = df.copy()

# 1. Inject missing values
# Sampling with a fixed random_state over an unchanged index always returns
# the same rows, so a single draw is shared by all three metric columns:
# the selected rows lose rssi, latency_ms and jitter_ms together.
missing_indices = df_dirty.sample(frac=0.03, random_state=42).index
df_dirty.loc[missing_indices, ["rssi", "latency_ms", "jitter_ms"]] = np.nan

# 2. Simulate data corruption: jitter too high
# (max() skips the NaNs injected above, so the spike value is finite)
jitter_spikes = df_dirty.sample(frac=0.01, random_state=43).index
df_dirty.loc[jitter_spikes, "jitter_ms"] = df_dirty["jitter_ms"].max() * 3

# 3. Duplicate a few rows (appended at the end; the index is renumbered)
dupes = df_dirty.sample(frac=0.01, random_state=44)
df_dirty = pd.concat([df_dirty, dupes], ignore_index=True)

# 4. Timestamp gaps: drop a random 2% of rows (individual readings,
#    not contiguous time windows), leaving holes in the index
gap_indices = df_dirty.sample(frac=0.02, random_state=45).index
df_dirty = df_dirty.drop(index=gap_indices)

# 5. Device disconnect anomaly: all values normal, but marked as disconnected
weird_disconnects = df_dirty.sample(frac=0.01, random_state=46).index
metric_cols = ["rssi", "latency_ms", "jitter_ms", "packet_loss"]
# Per-column mean over the selected rows (NaNs are skipped by mean()).
metric_means = df_dirty.loc[weird_disconnects, metric_cols].mean()
# Assign one scalar per column. Assigning the Series of means directly to
# df.loc[rows, cols] would align it on the row index (column names never
# match row labels) and silently fill the selection with NaN.
for col in metric_cols:
    df_dirty.loc[weird_disconnects, col] = metric_means[col]
df_dirty.loc[weird_disconnects, "is_connected"] = 0

# Save dirty version
df_dirty.to_csv("netwatch_wifi_data_dirty.csv", index=False)
df_dirty.head()


Unnamed: 0,home_id,device_id,timestamp,rssi,latency_ms,jitter_ms,packet_loss,is_connected,experience_score,predicted_outage
0,1,device_1,2025-04-10 00:00:00,-46.377657,59.705245,3.71331,0.0,0,0.922851,0
1,1,device_1,2025-04-10 00:10:00,-60.151688,78.831453,9.835976,0.0,1,0.858959,0
2,1,device_1,2025-04-10 00:20:00,-63.716893,79.738369,10.73846,0.0,1,0.838121,0
3,1,device_1,2025-04-10 00:30:00,-58.632815,27.618233,2.829852,0.0,1,0.926746,0
4,1,device_1,2025-04-10 00:40:00,,,,0.05,1,0.770901,0
