In [4]:
import os
import random
import time
from datetime import datetime
import xml.etree.ElementTree as ET
from xml.dom import minidom
import pandas as pd
from sklearn.linear_model import LinearRegression

# Create output directory
output_dir = "sensor_data/xml"
os.makedirs(output_dir, exist_ok=True)

def classify_health_environment(pm2_5, pm10, so2, no2):
    score = 0
    if pm2_5 > 100: score += 1
    if pm10 > 150: score += 1
    if so2 > 100: score += 1
    if no2 is not None and no2 > 200: score += 1

    if score == 0:
        return "Good"
    elif score == 1:
        return "Moderate"
    else:
        return "Unhealthy"
    
# Function to generate random air quality data
def generate_air_quality_data(sensor_id=3):
    pm2_5 = round(random.uniform(0, 250), 2)
    pm10 = round(random.uniform(0, 500), 2)
    so2 = round(random.uniform(0, 300), 2)
    no2 = round(random.uniform(0, 500), 2) if random.random() > 0.2 else None

    health_environment = classify_health_environment(pm2_5, pm10, so2, no2)

    return {
        "sensor_id": sensor_id,
        "timestamp": datetime.now().isoformat(),
        "pm2_5": pm2_5,
        "pm10": pm10,
        "so2": so2,
        "no2": no2,
        "health_environment": health_environment.lower()
    }

# Impute only no2 values using mean, median, mode or regression
def impute_missing_values(df, strategy='mean'):
    if strategy == 'mean':
        df['no2'] = df['no2'].fillna(df['no2'].mean()).round(2)
    elif strategy == 'median':
         df['no2'] = df['no2'].fillna(df['no2'].median()).round(2)
    elif strategy == 'mode':
        df['no2'] = df['no2'].fillna(df['no2'].mode()).round(2)
    elif strategy == 'regression':
        # Only impute no2 values using so2 as predictor
        hum_train = df.dropna(subset=['no2'])
        model = LinearRegression()
        model.fit(hum_train[['so2']], hum_train['no2'])
        hum_missing = df[df['no2'].isnull()]
        df.loc[df['no2'].isnull(), 'no2'] = model.predict(hum_missing[['so2']]).round(2)
    return df

# Function to save data as XML
def save_air_quality_xml(filename, data):
    root = ET.Element("AirQualityReadings")
    for reading in data:
        reading_elem = ET.SubElement(root, "Reading")
        for key, value in reading.items():
            elem = ET.SubElement(reading_elem, key)
            elem.text = str(value)
    # Convert to string and pretty-print
    rough_string = ET.tostring(root, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    pretty_xml = reparsed.toprettyxml(indent="  ")

    # Save to file
    filepath = os.path.join(output_dir, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(pretty_xml)

    print(f"Saved Air Quality XML: {filepath}")

# User input for number of times to run
num_iterations = int(input("Enter number of air quality readings to generate: "))

# Generate multiple batches of air quality data
readings = []
for i in range(num_iterations):
    air_quality_data = generate_air_quality_data()
    print(air_quality_data)
    readings.append(air_quality_data)
    # time.sleep(1)  # Simulate real-time data generation

    # Add a duplicate at every 3rd reading
    if i % 3 == 0 and i != 0:
        readings.append(air_quality_data)
        print(air_quality_data)

# Create DataFrame
df = pd.DataFrame(readings)

# Save raw data (with duplicates)
save_air_quality_xml("air_quality_readings_errors.xml", df.to_dict(orient='records'))

# Remove duplicated rows
df = df.drop_duplicates()

# Impute missing no2 values 
df = impute_missing_values(df, strategy="mean")

# Save cleaned data (without duplicates)
save_air_quality_xml("air_quality_readings.xml", df.to_dict(orient='records'))

df.to_csv("air_quality_reading.csv", index=False)

def save_motion_json(filename, data):
    filepath = os.path.join(output_dir, filename)
    with open(filepath, "w") as file:
        json.dump(data, file, indent=4)
        file.write("\n")
    print(f"Saved Motion Detection JSON: {filepath}")

save_motion_json("sensor_data/json/data.json", df)


Enter number of air quality readings to generate:  1


{'sensor_id': 3, 'timestamp': '2025-03-31T15:59:44.159907', 'pm2_5': 97.57, 'pm10': 2.56, 'so2': 155.0, 'no2': 473.21, 'health_environment': 'unhealthy'}
Saved Air Quality XML: sensor_data/xml/air_quality_readings_errors.xml
Saved Air Quality XML: sensor_data/xml/air_quality_readings.xml


FileNotFoundError: [Errno 2] No such file or directory: 'sensor_data/xml/sensor_data/json/data.json'