In [5]:
import os
import random
import time
from datetime import datetime
import xml.etree.ElementTree as ET
from xml.dom import minidom

import pandas as pd
from sklearn.linear_model import LinearRegression

# Directory that receives the generated XML files. save_air_quality_xml()
# reads this module-level name when it builds its target path, so this cell
# has to run before any save call.
output_dir = "sensor_data/xml"
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(output_dir, exist_ok=True)


In [6]:
# Function to generate random air quality data
def generate_air_quality_data(sensor_id=3):
    """Build one synthetic air-quality reading.

    Args:
        sensor_id: Identifier stored under the "sensor_id" key (default 3).

    Returns:
        dict with the keys "sensor_id", "timestamp", "pm2_5", "pm10", "so2"
        and "no2", in that order. "timestamp" is the current local time as an
        ISO-8601 string. Each pollutant value is a uniform random float
        rounded to two decimals; "no2" is None for roughly one reading in
        five, which simulates a missing measurement.
    """
    reading = {
        "sensor_id": sensor_id,
        "timestamp": datetime.now().isoformat(),
    }

    # Upper bound of the uniform range for each always-present pollutant.
    # The order matters: it fixes both the key order and the RNG draw order.
    for pollutant, upper_bound in (("pm2_5", 250), ("pm10", 500), ("so2", 300)):
        reading[pollutant] = round(random.uniform(0, upper_bound), 2)

    # The presence check is drawn first; a value is only drawn when the
    # reading is going to carry an NO2 measurement.
    if random.random() > 0.2:
        reading["no2"] = round(random.uniform(0, 500), 2)
    else:
        reading["no2"] = None

    return reading

# Function to save data as XML
def save_air_quality_xml(filename, data):
    """Write a sequence of readings to a pretty-printed XML file.

    The document has an <AirQualityReadings> root with one <Reading> child
    per record; every key of a record becomes a child element whose text is
    ``str(value)``. Missing values (``None`` or a float NaN, which is what a
    pandas round-trip turns ``None`` into) are written as empty elements
    instead of the literal text "None" / "nan".

    Args:
        filename: File name, joined onto the module-level ``output_dir``.
        data: Iterable of dict-like records (anything with ``.items()``).

    Returns:
        None. The file is written and a confirmation line is printed.
    """
    root = ET.Element("AirQualityReadings")
    for reading in data:
        reading_elem = ET.SubElement(root, "Reading")
        for key, value in reading.items():
            elem = ET.SubElement(reading_elem, key)
            # NaN is the only float that is not equal to itself; this avoids
            # needing an extra import just for the check.
            is_missing = value is None or (isinstance(value, float) and value != value)
            if not is_missing:
                elem.text = str(value)

    # Convert to string and pretty-print
    rough_string = ET.tostring(root, 'utf-8')
    pretty_xml = minidom.parseString(rough_string).toprettyxml(indent="  ")

    # Save to file; create the target directory in case the setup cell was
    # skipped or the directory was removed since.
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(pretty_xml)

    print(f"Saved Air Quality XML: {filepath}")


In [8]:
# Ask how many readings to generate. int() raises ValueError if the reply is
# not an integer; zero or a negative number simply yields no readings because
# range() is then empty.
num_iterations = int(input("Enter number of air quality readings to generate: "))

# Generate the readings one at a time, echoing each so progress is visible.
# `readings` stays in the kernel namespace; the DataFrame cell below reads it.
readings = []
for i in range(num_iterations):
    air_quality_data = generate_air_quality_data()
    print(air_quality_data)
    readings.append(air_quality_data)
    # time.sleep(1)  # Simulate real-time data generation (disabled: readings are produced back-to-back)

# Save all air quality data as a single XML file
save_air_quality_xml("air_quality_readings.xml", readings)


Enter number of air quality readings to generate:  50


{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810146', 'pm2_5': 198.53, 'pm10': 90.65, 'so2': 1.51, 'no2': None}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810448', 'pm2_5': 110.13, 'pm10': 167.37, 'so2': 7.56, 'no2': 304.87}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810532', 'pm2_5': 66.41, 'pm10': 369.22, 'so2': 59.97, 'no2': 261.45}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810588', 'pm2_5': 61.71, 'pm10': 453.01, 'so2': 54.19, 'no2': None}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810635', 'pm2_5': 13.89, 'pm10': 229.91, 'so2': 112.11, 'no2': 460.54}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810690', 'pm2_5': 29.93, 'pm10': 278.6, 'so2': 267.75, 'no2': None}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810736', 'pm2_5': 149.39, 'pm10': 485.82, 'so2': 197.94, 'no2': 137.48}
{'sensor_id': 3, 'timestamp': '2025-03-31T15:36:34.810784', 'pm2_5': 145.3, 'pm10': 156.62, 'so2': 36.38, 'no2': 123.52}
{'sensor_id': 3, 'timestamp': '2025-

In [15]:
# Load the generated readings into a DataFrame and drop rows that are exact
# duplicates of an earlier row.
df = pd.DataFrame(readings).drop_duplicates()

# Write the de-duplicated rows back out, one record (dict) per row.
save_air_quality_xml(
    "air_quality_data.xml",
    df.to_dict(orient="records"),
)

Saved Air Quality XML: sensor_data/xml/air_quality_data.xml


In [19]:
def impute_missing_values(df, strategy='mean', features=None):
    """Fill missing values in the ``no2`` column of ``df``.

    The frame is modified in place and also returned, so both
    ``impute_missing_values(df)`` and ``df = impute_missing_values(df)`` work.

    Args:
        df: DataFrame with a ``no2`` column (NaN marks a missing value).
        strategy: One of
            * ``'mean'``   - fill with the column mean,
            * ``'median'`` - fill with the column median,
            * ``'mode'``   - fill with the most frequent value (the smallest
              one when several values tie),
            * ``'regression'`` - fill with predictions of a linear model
              fitted on the rows where ``no2`` is present.
        features: Predictor column names for ``'regression'``. When omitted,
            ``temperature`` is used if the frame has it; otherwise whichever
            of ``pm2_5``, ``pm10`` and ``so2`` are present.

    Returns:
        The same DataFrame object, with ``no2`` imputed.

    Raises:
        ValueError: For an unknown ``strategy``, or when ``'regression'`` has
            no usable predictor column.

    Note:
        For ``'mean'``, ``'median'`` and ``'mode'`` the whole ``no2`` column,
        observed values included, is rounded to the nearest integer after
        filling. Regression predictions are left unrounded.
    """
    if strategy in ('mean', 'median', 'mode'):
        if strategy == 'mean':
            fill_value = df['no2'].mean()
        elif strategy == 'median':
            fill_value = df['no2'].median()
        else:
            # Series.mode() returns a Series (ties are possible). Passing that
            # Series to fillna would align on index and leave most gaps
            # unfilled, so reduce it to a single scalar first.
            modes = df['no2'].mode()
            fill_value = modes.iloc[0] if not modes.empty else None

        # fill_value is None only when the column has no observed values at
        # all; there is nothing to fill with, so only the rounding applies.
        filled = df['no2'] if fill_value is None else df['no2'].fillna(fill_value)
        df['no2'] = filled.round()

    elif strategy == 'regression':
        if features is None:
            candidates = ['temperature'] if 'temperature' in df.columns else ['pm2_5', 'pm10', 'so2']
            features = [col for col in candidates if col in df.columns]
        else:
            features = list(features)
        if not features:
            raise ValueError("regression imputation needs at least one predictor column")

        missing_mask = df['no2'].isnull()
        # A model can only be fitted and applied when there are both observed
        # rows to learn from and missing rows to predict.
        if missing_mask.any() and not missing_mask.all():
            train_rows = df.loc[~missing_mask]
            model = LinearRegression()
            model.fit(train_rows[features], train_rows['no2'])
            df.loc[missing_mask, 'no2'] = model.predict(df.loc[missing_mask, features])

    else:
        raise ValueError(
            f"Unknown imputation strategy: {strategy!r} "
            "(expected 'mean', 'median', 'mode' or 'regression')"
        )

    return df

In [21]:
# Fill the gaps in the no2 column with the column median, then export the
# imputed rows as XML.
df = impute_missing_values(df, strategy='median')
save_air_quality_xml(
    "air_quality_data_imputed.xml",
    df.to_dict(orient="records"),
)

Saved Air Quality XML: sensor_data/xml/air_quality_data_imputed.xml
