In [1]:
import os
import pandas as pd

## Clean Data

In [9]:
# Directory containing sensor data files
data_dir = 'data/Indoor_raw'
output_dir = 'data/filtered_raw' 

# Create the filtered directory if it doesn’t exist
os.makedirs(output_dir, exist_ok=True)

# Define the date ranges for each sensor based on the data provided
sensor_time_frames = {
    'Indoor01.csv': ('2024-02-14', '2024-04-16'), 
    'Indoor02.csv': ('2024-02-20', '2024-07-23'),
    'Indoor03.csv': ('2024-02-27', '2024-07-27'),
    'Indoor04.csv': ('2024-03-04', '2024-07-14'),
    'Indoor05.csv': ('2024-03-08', '2024-08-26'),
    'Indoor06.csv': ('2024-03-23', '2024-09-06'),
    'Indoor07.csv': ('2024-04-04', '2024-08-11'),
    'Indoor08.csv': ('2024-04-23', '2024-08-20'),
    'Indoor09.csv': ('2024-04-25', '2024-08-31'),
    'Indoor10.csv': ('2024-04-30', '2024-08-30'),
    'Indoor11.csv': ('2024-05-04', '2024-09-07'),
    'Indoor12.csv': ('2024-05-04', '2024-09-15'),
    'Indoor13.csv': ('2024-05-14', '2024-08-09'),  
    'Indoor14.csv': ('2024-05-25', '2024-10-09'),
    'Indoor15.csv': ('2024-07-14', '2024-11-27') 
}

In [12]:
# Define the columns that need to be converted to numeric
numeric_columns = [
    'PM1.0', 'PM2.5', 'PM4.0', 'PM10',
    'PM0.5 NC', 'PM1.0 NC', 'PM2.5 NC', 'PM4.0 NC', 'PM10 NC',
    'CO2', 'Barometric Pressure', 'VOC tVOC measurement',
    'Temperature', 'Relative Humidity'
]

# Drop unnecessary columns (like calibration settings & metadata) if needed
drop_columns = [
    'Device ID', 'Serial Number', 'Model', 'Sub Model', 'Friendly Name',
    'Is Indoor', 'Is Public', 'System Status'
]

# Function to clean each sensor DataFrame
def clean_sensor_dataframe(sensor_df):
    """
    Cleans the sensor DataFrame:
    - Converts timestamps
    - Creates 'Timestamp_Local' (+3 hours)
    - Converts numeric columns
    - Drops unnecessary metadata
    """
    sensor_df = sensor_df.iloc[1:]  # Remove the second header row
    sensor_df = sensor_df.sort_values(by=['Timestamp']).reset_index(drop=True)  # Sort timestamps
    sensor_df['Timestamp'] = pd.to_datetime(sensor_df['Timestamp'], errors='coerce')  # Convert to datetime
    sensor_df['Timestamp_Local'] = sensor_df['Timestamp'] + pd.Timedelta(hours=3)  # Add 3 hours
    sensor_df[numeric_columns] = sensor_df[numeric_columns].apply(pd.to_numeric, errors='coerce')  # Convert to numeric
    sensor_df = sensor_df.drop(columns=drop_columns, errors='ignore')  # Drop unneeded columns
    return sensor_df  # ✅ RETURN the cleaned DataFrame

# Loop over each sensor file in the data directory
for file_name in os.listdir(data_dir):
    file_path = os.path.join(data_dir, file_name)
    
    # Check if the file is a CSV and in the sensor time frames dictionary
    if file_name.endswith('.csv') and file_name in sensor_time_frames:
        print(f"Processing: {file_name}")

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Clean the sensor DataFrame
        df = clean_sensor_dataframe(df)  

        if df is not None:  # Ensure DataFrame is valid before filtering
            # Get the start and end date for this sensor
            start_date, end_date = sensor_time_frames[file_name]
            start_date = pd.to_datetime(start_date)

            # Apply date filtering based on the defined range
            if end_date:
                end_date = pd.to_datetime(end_date)
                filtered_df = df[(df['Timestamp_Local'] >= start_date) & (df['Timestamp_Local'] <= end_date)]
            else:
                filtered_df = df[df['Timestamp_Local'] >= start_date]

            # Save the cleaned and filtered data to the output directory
            output_file_path = os.path.join(output_dir, file_name)
            filtered_df.to_csv(output_file_path, index=False)
            print(f"Saved: {output_file_path}")

Processing: Indoor14.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor14.csv
Processing: Indoor01.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor01.csv
Processing: Indoor15.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor15.csv
Processing: Indoor03.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor03.csv
Processing: Indoor02.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor02.csv
Processing: Indoor06.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor06.csv
Processing: Indoor12.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor12.csv
Processing: Indoor13.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor13.csv
Processing: Indoor07.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor07.csv
Processing: Indoor11.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor11.csv
Processing: Indoor05.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor05.csv
Processing: Indoor04.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor04.csv
Processing: Indoor10.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor10.csv
Processing: Indoor09.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor09.csv
Processing: Indoor08.csv


  df = pd.read_csv(file_path)


Saved: data/filtered_raw/Indoor08.csv


## Harmonization

In [15]:
# Directory containing filtered sensor data files
filtered_dir = 'data/filtered_raw'  # Directory with cleaned and filtered data
harmonized_dir = 'data/harmonized'  # Directory to save harmonized data
parameters_dir = "parameters"  # Directory containing individual parameter files

# Ensure the harmonized directory exists
os.makedirs(harmonized_dir, exist_ok=True)

# Define the columns to calibrate
working_columns = ['PM2.5', 'CO2', 'Barometric Pressure', 'VOC tVOC measurement', 'Temperature', 'Relative Humidity']

In [16]:
# Loop through each dataset in the filtered directory
for file_name in os.listdir(filtered_dir):
    file_path = os.path.join(filtered_dir, file_name)
    
    # Process only CSV files
    if file_name.endswith('.csv'):
        # Read the filtered dataset
        sensor_df = pd.read_csv(file_path)
        
        # Use the filename (without extension) as the Sensor ID
        sensor_id = file_name.replace(".csv", "")  # e.g., 'Indoor01.csv' -> 'Indoor01'
        sensor_df['Sensor'] = sensor_id  # Add the Sensor ID to the DataFrame
        
        print(f"\nProcessing file: {file_name} with Sensor ID: {sensor_id}")

        # Loop through each column in working_columns to apply calibration
        for column_name in working_columns:
            # Prepare the path to the coefficients file
            coeff_file_name = f"{column_name.lower().replace(' ', '_')}_coefficients_df.csv"
            coefficients_file = os.path.join(parameters_dir, coeff_file_name)
            
            # Check if the coefficient file exists
            if os.path.exists(coefficients_file):
                # Read the coefficients data
                coefficients_df = pd.read_csv(coefficients_file)
                
                # Rename columns dynamically to avoid conflicts
                coefficients_df.rename(columns={'Coefficient': f'{column_name}_Coefficient',
                                                'Intercept': f'{column_name}_Intercept'}, inplace=True)
                
                # Merge the sensor DataFrame with the coefficients DataFrame
                sensor_df = pd.merge(sensor_df, coefficients_df, on='Sensor', how='left')
                
                # Check if the merge was successful by inspecting NaNs in coefficient columns
                if sensor_df[f'{column_name}_Coefficient'].isnull().all():
                    print(f"Warning: All coefficients missing for '{column_name}' in sensor '{sensor_id}'")
                else:
                    # Print out the first few rows to verify calibration before applying it
                    print(f"\nBefore calibration for '{column_name}':")
                    print(sensor_df[[column_name, f'{column_name}_Coefficient', f'{column_name}_Intercept']].head())

                    # Apply calibration formula
                    sensor_df[column_name] = sensor_df[column_name] * sensor_df[f'{column_name}_Coefficient'] + sensor_df[f'{column_name}_Intercept']
                    
                    # Check the first few calibrated values
                    print(f"\nCalibrated values for '{column_name}' in '{file_name}':")
                    print(sensor_df[[column_name]].head())
                
                # Drop coefficient and intercept columns to clean up
                sensor_df.drop(columns=[f'{column_name}_Coefficient', f'{column_name}_Intercept'], inplace=True)
            else:
                print(f"Warning: No coefficients file found for {coeff_file_name}")
        
        # Save the harmonized data to the harmonized directory
        harmonized_file_path = os.path.join(harmonized_dir, file_name)
        sensor_df.to_csv(harmonized_file_path, index=False)

print("All harmonized datasets have been saved.")


Processing file: Indoor14.csv with Sensor ID: Indoor14

Before calibration for 'CO2':
   CO2  CO2_Coefficient  CO2_Intercept
0  489          0.95968      20.720851
1  482          0.95968      20.720851
2  476          0.95968      20.720851
3  468          0.95968      20.720851
4  465          0.95968      20.720851

Calibrated values for 'CO2' in 'Indoor14.csv':
          CO2
0  490.004411
1  483.286650
2  477.528569
3  469.851129
4  466.972089

Before calibration for 'Barometric Pressure':
   Barometric Pressure  Barometric Pressure_Coefficient   
0                23.65                         0.987145  \
1                23.64                         0.987145   
2                23.64                         0.987145   
3                23.63                         0.987145   
4                23.63                         0.987145   

   Barometric Pressure_Intercept  
0                       0.292873  
1                       0.292873  
2                       0.292873  
3    

## Outdoor Sensors

In [None]:
# Directories
raw_dir = '00_data/raw_data/outdoor_raw'  
cleaned_dir = '00_data/raw_data/outdoor_cleaned'  

# Ensure the cleaned directory exists
os.makedirs(cleaned_dir, exist_ok=True)

# Sensor names for mapping
sensor_names = ['outdoor01', 'outdoor02', 'outdoor03']

# Define the cleaning function for outdoor sensors
def clean_sensor_dataframes_outdoor(sensor_dataframes):
    numeric_columns = ['PM1.0', 'PM2.5', 'PM4.0', 'PM10',
                       'NC0.5', 'NC1.0', 'NC2.5', 'NC10',
                       'Temperature', 'Relative Humidity']
    cleaned_dataframes = []
    
    # Determine which columns are present and which are missing
    columns_not_present = list(set(numeric_columns) - set(sensor_dataframes[0].columns.to_list()))
    columns_present = [x for x in numeric_columns if x not in columns_not_present]

    print(f'Columns not present in the Dataframe: {columns_not_present}')

    # Loop through each sensor DataFrame for cleaning
    for sensor_df in sensor_dataframes:
        # Remove header
        sensor_df = sensor_df.iloc[1:]
        # Sort by 'Timestamp'
        sensor_df = sensor_df.sort_values(by=['Timestamp']).reset_index(drop=True)
        # Change 'Timestamp' to date format
        sensor_df['Timestamp'] = pd.to_datetime(sensor_df['Timestamp'], errors='coerce')
        sensor_df['Timestamp_Local'] = sensor_df['Timestamp'] + pd.Timedelta(hours=3)
        # Convert specified columns to numeric
        sensor_df[columns_present] = sensor_df[columns_present].apply(pd.to_numeric, errors='coerce')
        # Append the cleaned DataFrame to the list
        cleaned_dataframes.append(sensor_df)
    
    return cleaned_dataframes

# List to hold raw dataframes for cleaning
sensor_dataframes_outdoor = []

# Load each file in the raw outdoor directory
for file_name in os.listdir(raw_dir):
    file_path = os.path.join(raw_dir, file_name)
    if file_name.endswith('.csv'):
        # Read the raw data
        df = pd.read_csv(file_path)
        sensor_dataframes_outdoor.append(df)

# Apply cleaning function
cleaned_sensor_dataframes_outdoor = clean_sensor_dataframes_outdoor(sensor_dataframes_outdoor)

# Save each cleaned DataFrame to the cleaned directory
for i, cleaned_df in enumerate(cleaned_sensor_dataframes_outdoor):
    # Create filename based on the original sensor name
    cleaned_file_name = f"{sensor_names[i]}_cleaned.csv"
    cleaned_file_path = os.path.join(cleaned_dir, cleaned_file_name)
    cleaned_df.to_csv(cleaned_file_path, index=False)

print("All outdoor sensor datasets have been cleaned and saved.")

Columns not present in the Dataframe: ['NC2.5', 'NC10', 'NC1.0', 'NC0.5']
All outdoor sensor datasets have been cleaned and saved.


In [4]:
# Directory containing filtered sensor data files
filtered_dir = '00_data/raw_data/outdoor_cleaned'  
harmonized_dir = '00_data/calibrated_data/outdoor_cal'
parameters_dir = "02_parameters"  # Directory containing individual parameter files

# Ensure the harmonized directory exists
os.makedirs(harmonized_dir, exist_ok=True)

# Define the columns to calibrate
working_columns = ['PM2.5', 'Temperature', 'Relative Humidity']

In [11]:
# Loop through each dataset in the filtered directory
for file_name in os.listdir(filtered_dir):
    file_path = os.path.join(filtered_dir, file_name)
    
    # Process only CSV files
    if file_name.endswith('.csv'):
        # Read the filtered dataset
        sensor_df = pd.read_csv(file_path)
        
        # Use the filename (without extension) as the Sensor ID
        sensor_id = file_name.replace("_cleaned.csv", "")  
        sensor_df['Sensor'] = sensor_id  
        
        print(f"\nProcessing file: {file_name} with Sensor ID: {sensor_id}")

        # Loop through each column in working_columns to apply calibration
        for column_name in working_columns:
            # Prepare the path to the coefficients file
            coeff_file_name = f"outdoor_{column_name.lower().replace(' ', '_')}_parameters.csv"
            coefficients_file = os.path.join(parameters_dir, coeff_file_name)
            
            # Check if the coefficient file exists
            if os.path.exists(coefficients_file):
                # Read the coefficients data
                coefficients_df = pd.read_csv(coefficients_file)
                
                # Rename columns dynamically to avoid conflicts
                coefficients_df.rename(columns={'Coefficient': f'{column_name}_Coefficient',
                                                'Intercept': f'{column_name}_Intercept'}, inplace=True)
                
                # Merge the sensor DataFrame with the coefficients DataFrame
                sensor_df = pd.merge(sensor_df, coefficients_df, on='Sensor', how='left')
                
                # Check if the merge was successful by inspecting NaNs in coefficient columns
                if sensor_df[f'{column_name}_Coefficient'].isnull().all():
                    print(f"Warning: All coefficients missing for '{column_name}' in sensor '{sensor_id}'")
                else:
                    # Print out the first few rows to verify calibration before applying it
                    print(f"\nBefore calibration for '{column_name}':")
                    print(sensor_df[[column_name, f'{column_name}_Coefficient', f'{column_name}_Intercept']].head())

                    # Apply calibration formula
                    sensor_df[column_name] = sensor_df[column_name] * sensor_df[f'{column_name}_Coefficient'] + sensor_df[f'{column_name}_Intercept']
                    
                    # Check the first few calibrated values
                    print(f"\nCalibrated values for '{column_name}' in '{file_name}':")
                    print(sensor_df[[column_name]].head())
                
                # Drop coefficient and intercept columns to clean up
                sensor_df.drop(columns=[f'{column_name}_Coefficient', f'{column_name}_Intercept'], inplace=True)
            else:
                print(f"Warning: No coefficients file found for {coeff_file_name}")
        
        # Save the harmonized data to the harmonized directory
        harmonized_file_path = os.path.join(harmonized_dir, file_name)
        sensor_df.to_csv(harmonized_file_path, index=False)

print("All harmonized datasets have been saved.")


Processing file: outdoor03_cleaned.csv with Sensor ID: outdoor03

Before calibration for 'PM2.5':
   PM2.5  PM2.5_Coefficient  PM2.5_Intercept
0      1           0.690358        18.614914
1      1           0.690358        18.614914
2      1           0.690358        18.614914
3      1           0.690358        18.614914
4      1           0.690358        18.614914

Calibrated values for 'PM2.5' in 'outdoor03_cleaned.csv':
       PM2.5
0  19.305271
1  19.305271
2  19.305271
3  19.305271
4  19.305271

Before calibration for 'Temperature':
   Temperature  Temperature_Coefficient  Temperature_Intercept
0         73.0                 0.972709               1.840858
1         73.0                 0.972709               1.840858
2         73.2                 0.972709               1.840858
3         73.2                 0.972709               1.840858
4         73.2                 0.972709               1.840858

Calibrated values for 'Temperature' in 'outdoor03_cleaned.csv':
   Temperatu