## Combine Data

In [2]:
import os
import pandas as pd

# Root folder that is searched recursively for raw sensor exports,
# and the folder that receives one merged CSV per sensor
base_dir = 'data/raw'
combined_dir = 'data/combined'

# Make sure the output folder is there before anything is written to it
os.makedirs(combined_dir, exist_ok=True)

# Maps a sensor file name (e.g. 'Indoor01.csv') to every DataFrame read under that name
sensor_data = {}

# Walk the whole tree: files that share a name in different subfolders
# are treated as pieces of the same sensor
for folder, _, names in os.walk(base_dir):
    for file_name in names:
        # Only files named like 'Indoor*.csv' take part in the merge
        if not (file_name.startswith('Indoor') and file_name.endswith('.csv')):
            continue

        frame = pd.read_csv(os.path.join(folder, file_name))
        sensor_data.setdefault(file_name, []).append(frame)

# Stack each sensor's pieces into one table and write it under the sensor's file name
for sensor, dfs in sensor_data.items():
    if not dfs:
        continue

    merged = pd.concat(dfs, ignore_index=True)
    merged.to_csv(os.path.join(combined_dir, sensor), index=False)

## Clean Data

In [3]:
# Input: merged per-sensor CSVs. Output: the same files after cleaning and date filtering.
data_dir = 'data/combined'
output_dir = 'data/filtered_raw'

# Make sure the output folder exists
os.makedirs(output_dir, exist_ok=True)

# One entry per indoor sensor: (sensor number, first date, last date)
_sensor_periods = [
    (1, '2024-02-14', '2024-04-16'),
    (2, '2024-02-20', '2024-07-23'),
    (3, '2024-02-27', '2024-07-27'),
    (4, '2024-03-04', '2024-07-14'),
    (5, '2024-03-08', '2024-08-26'),
    (6, '2024-03-23', '2024-09-06'),
    (7, '2024-04-04', '2024-08-11'),
    (8, '2024-04-23', '2024-08-20'),
    (9, '2024-04-25', '2024-08-31'),
    (10, '2024-04-30', '2024-08-30'),
    (11, '2024-05-04', '2024-09-07'),
    (12, '2024-05-04', '2024-09-15'),
    (13, '2024-05-14', '2024-08-09'),
    (14, '2024-05-25', '2024-10-09'),
    (15, '2024-07-14', '2024-10-09'),
]

# Keyed by the sensor's file name ('Indoor01.csv', ...) -> (start date, end date)
sensor_time_frames = {
    f'Indoor{number:02d}.csv': (first_day, last_day)
    for number, first_day, last_day in _sensor_periods
}

In [None]:
# Measurement columns that are converted to numeric dtype during cleaning
numeric_columns = ['PM 1.0', 'PM 2.5', 'PM 4.0', 'PM 10',
                   'NC 0.5', 'NC 1.0', 'NC 2.5', 'NC 10',
                   'CO2', 'Barometric Pressure', 'VOC tVOC measurement',
                   'VOC Ethanol', 'Temperature', 'Relative Humidity']


def clean_sensor_dataframe(sensor_df):
    """Return a cleaned, chronologically ordered copy of one sensor table.

    Steps:
      1. Drop the first row (presumably a units/sub-header row in the raw
         export -- TODO confirm against the raw files).
      2. Parse 'Timestamp' to datetime; unparseable values become NaT.
      3. Convert every column in ``numeric_columns`` to numeric; unparseable
         values become NaN.
      4. Sort by the parsed 'Timestamp' and reset the index.

    Parameters
    ----------
    sensor_df : pandas.DataFrame
        Must contain 'Timestamp' and all columns listed in ``numeric_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. Rows with NaT timestamps
        are placed last.

    Raises
    ------
    KeyError
        If 'Timestamp' or any of ``numeric_columns`` is missing.
    """
    # Copy so the column assignments below never write into a view of the caller's frame
    cleaned = sensor_df.iloc[1:].copy()

    # Parse BEFORE sorting: sorting the raw text orders rows lexicographically,
    # which is not chronological for formats such as 'MM/DD/YYYY'
    cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'], errors='coerce')
    cleaned[numeric_columns] = cleaned[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Stable sort keeps the original relative order of rows with equal timestamps
    return cleaned.sort_values(by='Timestamp', kind='mergesort').reset_index(drop=True)

# Clean every combined sensor file that has a configured date window,
# trim it to that window, and write the result to the output directory
for file_name in os.listdir(data_dir):
    # Skip anything that is not a CSV with an entry in sensor_time_frames
    if not (file_name.endswith('.csv') and file_name in sensor_time_frames):
        continue

    file_path = os.path.join(data_dir, file_name)

    # Read and clean the sensor table
    df = clean_sensor_dataframe(pd.read_csv(file_path))

    # Look up the window configured for this sensor
    start_date, end_date = sensor_time_frames[file_name]
    start_date = pd.to_datetime(start_date)
    in_window = df['Timestamp'] >= start_date

    # A falsy end date (None / '') means "no upper bound"
    if end_date:
        end_date = pd.to_datetime(end_date)
        if end_date == end_date.normalize():
            # A date-only bound parses to midnight, so `<= end_date` would keep a
            # single midnight reading and drop the rest of that day. Compare against
            # the start of the following day so the whole end date is included,
            # mirroring how the start date is already included in full.
            in_window = in_window & (df['Timestamp'] < end_date + pd.Timedelta(days=1))
        else:
            # The bound carries an explicit time of day: honour it exactly
            in_window = in_window & (df['Timestamp'] <= end_date)

    # Rows whose timestamp is NaT fail every comparison and are dropped here
    filtered_df = df[in_window]

    # Save the cleaned and filtered data under the same file name
    output_file_path = os.path.join(output_dir, file_name)
    filtered_df.to_csv(output_file_path, index=False)

## Harmonization

In [21]:
# Folders used by the harmonization step:
#   filtered_dir   - cleaned and date-filtered sensor CSVs (input)
#   harmonized_dir - calibrated sensor CSVs (output)
#   parameters_dir - one coefficients CSV per calibrated parameter
filtered_dir = 'data/filtered_raw'
harmonized_dir = 'data/harmonized'
parameters_dir = "parameters"

# Make sure the output folder exists
os.makedirs(harmonized_dir, exist_ok=True)

# Measurement columns that get calibrated
working_columns = [
    'PM 2.5',
    'CO2',
    'Barometric Pressure',
    'VOC tVOC measurement',
    'Temperature',
    'Relative Humidity',
]

In [22]:
# Load every coefficient table once, up front. The tables do not depend on the
# sensor file being processed, so re-reading them inside the per-file loop is
# wasted I/O. Each entry is (coefficients file name, table or None if missing).
calibration_tables = {}
for column_name in working_columns:
    coeff_file_name = f"{column_name.lower().replace(' ', '_')}_coefficients_df.csv"
    coefficients_file = os.path.join(parameters_dir, coeff_file_name)
    if os.path.exists(coefficients_file):
        # Keep only the columns that are used below, so no other column of the
        # coefficients file can end up in the harmonized output
        coefficients_df = pd.read_csv(coefficients_file)[['Sensor', 'Coefficient', 'Intercept']]
    else:
        coefficients_df = None
    calibration_tables[column_name] = (coeff_file_name, coefficients_df)

# Calibrate every CSV in the filtered directory and save it to the harmonized directory
for file_name in os.listdir(filtered_dir):
    # Process only CSV files
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(filtered_dir, file_name)
    sensor_df = pd.read_csv(file_path)

    # Use the file name without its extension as the Sensor ID,
    # e.g. 'Indoor01.csv' -> 'Indoor01'
    sensor_id = os.path.splitext(file_name)[0]
    sensor_df['Sensor'] = sensor_id

    print(f"\nProcessing file: {file_name} with Sensor ID: {sensor_id}")

    for column_name in working_columns:
        coeff_file_name, coefficients_df = calibration_tables[column_name]
        if coefficients_df is None:
            print(f"Warning: No coefficients file found for {coeff_file_name}")
            continue

        # Every row of this file belongs to one sensor, so a scalar lookup replaces
        # the row-wise merge (which would multiply data rows if the coefficients
        # file listed the sensor more than once)
        sensor_rows = coefficients_df[coefficients_df['Sensor'] == sensor_id]
        if len(sensor_rows) > 1:
            print(f"Warning: Multiple coefficient rows for '{column_name}' in sensor '{sensor_id}'; "
                  f"column left uncalibrated")
            continue

        if len(sensor_rows) == 1:
            coefficient = sensor_rows['Coefficient'].iloc[0]
            intercept = sensor_rows['Intercept'].iloc[0]
        else:
            coefficient = intercept = float('nan')

        if pd.isna(coefficient):
            print(f"Warning: All coefficients missing for '{column_name}' in sensor '{sensor_id}'")
            continue
        if pd.isna(intercept):
            # Applying a NaN intercept would silently turn the whole column into NaN
            print(f"Warning: Intercept missing for '{column_name}' in sensor '{sensor_id}'; "
                  f"column left uncalibrated")
            continue

        # Show the first few raw values next to the parameters about to be applied
        preview = sensor_df[[column_name]].head().copy()
        preview[f'{column_name}_Coefficient'] = coefficient
        preview[f'{column_name}_Intercept'] = intercept
        print(f"\nBefore calibration for '{column_name}':")
        print(preview)

        # Linear calibration: calibrated = raw * coefficient + intercept
        sensor_df[column_name] = sensor_df[column_name] * coefficient + intercept

        # Show the first few calibrated values
        print(f"\nCalibrated values for '{column_name}' in '{file_name}':")
        print(sensor_df[[column_name]].head())

    # Save the harmonized data under the same file name
    harmonized_file_path = os.path.join(harmonized_dir, file_name)
    sensor_df.to_csv(harmonized_file_path, index=False)

print("All harmonized datasets have been saved.")


Processing file: Indoor14.csv with Sensor ID: Indoor14

Before calibration for 'PM 2.5':
   PM 2.5  PM 2.5_Coefficient  PM 2.5_Intercept
0     6.0            0.949117         -0.426889
1     6.0            0.949117         -0.426889
2     5.0            0.949117         -0.426889
3     4.0            0.949117         -0.426889
4     4.0            0.949117         -0.426889

Calibrated values for 'PM 2.5' in 'Indoor14.csv':
     PM 2.5
0  5.267815
1  5.267815
2  4.318698
3  3.369580
4  3.369580

Before calibration for 'CO2':
     CO2  CO2_Coefficient  CO2_Intercept
0  441.0          0.95968      20.720851
1  446.0          0.95968      20.720851
2  446.0          0.95968      20.720851
3  446.0          0.95968      20.720851
4  437.0          0.95968      20.720851

Calibrated values for 'CO2' in 'Indoor14.csv':
          CO2
0  443.939767
1  448.738167
2  448.738167
3  448.738167
4  440.101046

Before calibration for 'Barometric Pressure':
   Barometric Pressure  Barometric Pressure

## Outdoor Sensors

In [23]:
# Raw outdoor exports are read from raw_dir; cleaned copies are written to cleaned_dir
raw_dir = 'data/raw/outdoor'
cleaned_dir = 'data/outdoor_cleaned'

# Make sure the output folder exists
os.makedirs(cleaned_dir, exist_ok=True)

# Names used for the cleaned output files: 'outdoor01' .. 'outdoor03'
sensor_names = [f'outdoor{number:02d}' for number in range(1, 4)]

# Define the cleaning function for outdoor sensors
def clean_sensor_dataframes_outdoor(sensor_dataframes):
    """Clean a list of outdoor sensor tables and return the cleaned copies.

    For each input frame:
      1. Drop the first row (presumably a units/sub-header row in the raw
         export -- TODO confirm against the raw files).
      2. Parse 'Timestamp' to datetime; unparseable values become NaT.
      3. Convert every expected measurement column that the frame actually
         contains to numeric; unparseable values become NaN.
      4. Sort by the parsed 'Timestamp' and reset the index.

    Expected columns that are missing from at least one frame are printed
    once, in declaration order.

    Parameters
    ----------
    sensor_dataframes : list of pandas.DataFrame
        Each frame must contain a 'Timestamp' column. May be empty.

    Returns
    -------
    list of pandas.DataFrame
        New frames in the same order as the input; inputs are not modified.
    """
    numeric_columns = ['PM 1.0', 'PM 2.5', 'PM 4.0', 'PM 10',
                       'NC 0.5', 'NC 1.0', 'NC 2.5', 'NC 10',
                       'Temperature', 'Relative Humidity']

    # Checked against every frame (not just the first), so an empty list is
    # fine and frames with differing columns are reported correctly
    columns_not_present = [
        column for column in numeric_columns
        if any(column not in frame.columns for frame in sensor_dataframes)
    ]
    print(f'Columns not present in the Dataframe: {columns_not_present}')

    cleaned_dataframes = []
    for sensor_df in sensor_dataframes:
        # Copy so the assignments below never write into a view of the caller's frame
        cleaned = sensor_df.iloc[1:].copy()

        # Parse BEFORE sorting: sorting the raw text orders rows lexicographically,
        # which is not chronological for formats such as 'MM/DD/YYYY'
        cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'], errors='coerce')

        # Column presence is decided per frame
        for column in numeric_columns:
            if column in cleaned.columns:
                cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')

        # Stable sort keeps the original relative order of rows with equal timestamps
        cleaned = cleaned.sort_values(by='Timestamp', kind='mergesort').reset_index(drop=True)
        cleaned_dataframes.append(cleaned)

    return cleaned_dataframes

# Collect the raw outdoor CSVs in name-sorted order. os.listdir returns entries
# in arbitrary order, and the files are matched to sensor_names by position,
# so an unsorted listing could save a file under the wrong sensor name.
# NOTE(review): assumes the raw file names sort in the same order as
# sensor_names -- confirm against the contents of raw_dir.
outdoor_file_names = sorted(
    name for name in os.listdir(raw_dir) if name.endswith('.csv')
)

# Fail before doing any work if there are more files than output names
if len(outdoor_file_names) > len(sensor_names):
    raise ValueError(
        f"Found {len(outdoor_file_names)} CSV files in '{raw_dir}' "
        f"but only {len(sensor_names)} sensor names are defined"
    )

# List to hold raw dataframes for cleaning
sensor_dataframes_outdoor = []
for file_name in outdoor_file_names:
    file_path = os.path.join(raw_dir, file_name)
    df = pd.read_csv(file_path)
    sensor_dataframes_outdoor.append(df)

# Apply cleaning function
cleaned_sensor_dataframes_outdoor = clean_sensor_dataframes_outdoor(sensor_dataframes_outdoor)

# Save each cleaned DataFrame under the sensor name at the same position
for sensor_name, cleaned_df in zip(sensor_names, cleaned_sensor_dataframes_outdoor):
    cleaned_file_name = f"{sensor_name}_cleaned.csv"
    cleaned_file_path = os.path.join(cleaned_dir, cleaned_file_name)
    cleaned_df.to_csv(cleaned_file_path, index=False)

print("All outdoor sensor datasets have been cleaned and saved.")


Columns not present in the Dataframe: []
All outdoor sensor datasets have been cleaned and saved.
