# Data preparation
Import the libraries and set the paths needed for preparing the data. The data preparation combines the separate CSV files into a single combined CSV file.

In [13]:
import pandas as pd

# Folder that holds the NO2 CSV files; point this at your local copy of the dataset
csv_directory = './Air Quality dataset/NO2'

# Where the merged table is written at the end of the notebook
output_file = './combined_data_wide.csv'

# Empty frame that the following cells fill with a date column and measurement columns
combined_df = pd.DataFrame()


The following code creates a `Date` column that stretches from the earliest recorded date to the latest, with one row per day.

In [14]:
import os

# Every valid timestamp found in any of the source files
all_dates = []

# Scan each CSV file in the directory for its timestamps
for file in os.listdir(csv_directory):
    if not file.endswith('.csv'):
        continue
    file_path = os.path.join(csv_directory, file)
    df = pd.read_csv(file_path)
    # Files without the 'DatoMaerke' timestamp column contribute no dates
    if 'DatoMaerke' not in df.columns:
        continue
    # Drop missing timestamps first: NaT compares False against everything, so
    # leaving it in would make the min()/max() below depend on list order
    file_dates = pd.to_datetime(df['DatoMaerke']).dropna()
    all_dates.extend(file_dates.tolist())

# Without any dates there is no axis to build. Stop with a clear message instead
# of leaving `date_range` undefined for the cells that follow.
if not all_dates:
    raise ValueError(
        f"No 'DatoMaerke' dates found in any CSV file under {csv_directory!r}"
    )

# Daily range (the pandas default frequency) from the earliest to the latest date
min_date = min(all_dates)
max_date = max(all_dates)
date_range = pd.date_range(start=min_date, end=max_date)

# Rebuild the frame from scratch so that re-running this cell always yields a
# clean table holding only the 'Date' column, whatever later cells added before
combined_df = pd.DataFrame({'Date': date_range})

# Print the first few rows of the combined DataFrame
print("Combined DataFrame with Date Range:")
print(combined_df.head())

Combined DataFrame with Date Range:
        Date
0 2018-01-01
1 2018-01-02
2 2018-01-03
3 2018-01-04
4 2018-01-05


Now the NO2 values are added. In each file they are stored in the column called 'NO2 ppb'; every file contributes one new column, and the columns end up named `NO2_HCAB`, `NO2_JAGT`, `NO2_HCØ` and `NO2_HVID`.

In [15]:
# Final column name for each NO2 file, assigned in the order the files are listed.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this positional
# pairing is only right if that order matches the labels below. Verify it against
# the "file -> column" lines printed by this cell, or switch to an explicit
# filename-to-label mapping.
no2_labels = ['NO2_HCAB', 'NO2_JAGT', 'NO2_HCØ', 'NO2_HVID']

no2_files = [name for name in os.listdir(csv_directory) if name.endswith('.csv')]
if len(no2_files) < len(no2_labels):
    raise ValueError(
        f"Expected at least {len(no2_labels)} CSV files in {csv_directory!r}, "
        f"found {len(no2_files)}"
    )

for position, file in enumerate(no2_files):
    file_path = os.path.join(csv_directory, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Parse the timestamps and use them as the index so rows can be aligned by date
    df['DatoMaerke'] = pd.to_datetime(df['DatoMaerke'])
    # Align to the shared date range; dates missing from this file become NaN
    df = df.set_index('DatoMaerke').reindex(date_range)
    # Files beyond the known labels keep their file name (without extension)
    if position < len(no2_labels):
        column_name = no2_labels[position]
    else:
        column_name = os.path.splitext(file)[0]
    # Write straight to the final name instead of renaming by position afterwards,
    # so a re-run overwrites the same columns rather than appending duplicates
    combined_df[column_name] = df['NO2 ppb'].values
    print(f"{file} -> {column_name}")

# Print the first few rows of the updated combined DataFrame
print("Updated Combined DataFrame with NO2 Values:")
print(combined_df.head())

Updated Combined DataFrame with NO2 Values:
        Date   NO2_HCAB   NO2_JAGT   NO2_HCØ  NO2_HVID
0 2018-01-01   9.631655   8.225644  3.639999       NaN
1 2018-01-02  23.751219  16.441554  8.368438       NaN
2 2018-01-03  19.279095   6.512324  7.596542       NaN
3 2018-01-04  27.729076        NaN  9.598309       NaN
4 2018-01-05  26.783944        NaN  9.386578       NaN


The NOx values (stored in the column called 'NOx ppb') are added in the same way as the NO2 data.

In [19]:
# Folder that holds the NOx CSV files
csv_directory_NOx = './Air Quality dataset/NOx'

# Final column name for each NOx file, assigned in the order the files are listed.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this positional
# pairing is only right if that order matches the labels below. Verify it against
# the "file -> column" lines printed by this cell, or switch to an explicit
# filename-to-label mapping.
nox_labels = ['NOx_HCAB', 'NOx_JAGT', 'NOx_HCØ', 'NOx_HVID']

nox_files = [name for name in os.listdir(csv_directory_NOx) if name.endswith('.csv')]
if len(nox_files) < len(nox_labels):
    raise ValueError(
        f"Expected at least {len(nox_labels)} CSV files in {csv_directory_NOx!r}, "
        f"found {len(nox_files)}"
    )

for position, file in enumerate(nox_files):
    file_path = os.path.join(csv_directory_NOx, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Parse the timestamps and use them as the index so rows can be aligned by date
    df['DatoMaerke'] = pd.to_datetime(df['DatoMaerke'])
    # Align to the shared date range; dates missing from this file become NaN
    df = df.set_index('DatoMaerke').reindex(date_range)
    # Files beyond the known labels keep their file name (without extension)
    if position < len(nox_labels):
        column_name = nox_labels[position]
    else:
        column_name = os.path.splitext(file)[0]
    # Write the NOx values straight to the final name instead of renaming by
    # absolute column position afterwards, so a re-run overwrites the same
    # columns rather than appending duplicates
    combined_df[column_name] = df['NOx ppb'].values
    print(f"{file} -> {column_name}")

# Print the first few rows of the updated combined DataFrame
print("Updated Combined DataFrame with NOx Values:")
print(combined_df.head())

Updated Combined DataFrame with NOx Values:
        Date   NO2_HCAB   NO2_JAGT   NO2_HCØ  NO2_HVID   NOx_HCAB   NOx_JAGT  \
0 2018-01-01   9.631655   8.225644  3.639999       NaN  14.694634  14.820226   
1 2018-01-02  23.751219  16.441554  8.368438       NaN  53.248540  30.552647   
2 2018-01-03  19.279095   6.512324  7.596542       NaN  40.296651  10.973654   
3 2018-01-04  27.729076        NaN  9.598309       NaN  67.604004        NaN   
4 2018-01-05  26.783944        NaN  9.386578       NaN  78.055996        NaN   

     NOx_HCØ  NOx_HVID  
0   3.418484       NaN  
1   8.825228       NaN  
2   7.976738       NaN  
3  10.184947       NaN  
4  10.441514       NaN  


## Saving the data
Now the combined data is saved as a new CSV file called combined_data_wide.csv.

In [20]:
# Write the merged table to disk; index=False leaves the row index out of the file
combined_df.to_csv(path_or_buf=output_file, index=False)

# Confirm where the file was written
print(f"Data has been saved to {output_file}")

Data has been saved to ./combined_data_wide.csv
