# Preprocessing the data

### Loading required packages

In [3]:
import pandas as pd
import numpy as np
import boto3
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

## Meteo data

### Load the meteo data

In [None]:
# Meteo exports are addressed by a year and a quarter label in the file name
years = ['2022']
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
base_url_meteo = 'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_{}{}.csv'

# Download one DataFrame per (year, quarter) file, keeping chronological order
dfs = []
for year in years:
    for quarter in quarters:
        url = base_url_meteo.format(year, quarter)
        df = pd.read_csv(url)
        dfs.append(df)

### Build pipeline

In [58]:
## Building preprocessing pipeline
# Step 1: Concatenate datasets
def concatenate_datasets(dfs):
    """Stack a sequence of DataFrames row-wise and renumber the index from 0."""
    combined = pd.concat(dfs, ignore_index=True)
    return combined

# Step 2: Convert UTC to CEST by adding 2 hours
def convert_utc_to_cest(df, offset_hours=2):
    """Parse ``DATEUTC`` and add a shifted timestamp column ``DATECEST``.

    The frame is modified in place and also returned, so the function can be
    used directly as a pipeline step.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DATEUTC`` column that ``pd.to_datetime`` can parse.
    offset_hours : int or float, default 2
        Fixed number of hours added to ``DATEUTC``. The default reproduces the
        original UTC+2 shift.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``DATEUTC`` converted to datetimes and a new
        ``DATECEST`` column.

    Notes
    -----
    NOTE(review): a fixed offset does not follow daylight-saving changes; if
    the target is local wall-clock time for the whole year, confirm whether
    the winter months should use a different offset.
    """
    df['DATEUTC'] = pd.to_datetime(df['DATEUTC'])
    df['DATECEST'] = df['DATEUTC'] + pd.Timedelta(hours=offset_hours)
    return df

# Step 3: Update the month, day and hour columns to CEST
def convert_time(df):
    """Write ``Month``, ``Day`` and ``Hour`` columns derived from ``DATECEST``.

    The frame is modified in place and returned.
    """
    shifted = df['DATECEST'].dt
    for column, part in (('Month', 'month'), ('Day', 'day'), ('Hour', 'hour')):
        df[column] = getattr(shifted, part)
    return df

# Step 4: Drop columns
def drop_columns(df):
    """Return a copy of ``df`` without any column outside the meteo keep-list.

    The keep-list is shorter than the list of columns to remove, so it is the
    one spelled out here.
    """
    keep = {
        'DATECEST', 'LC_RAININ', 'LC_DAILYRAIN', 'LC_WINDDIR', 'LC_WINDSPEED',
        'LC_TEMP_QCL3', 'Month', 'Day', 'Hour',
    }
    unwanted = [column for column in df.columns if column not in keep]
    return df.drop(columns=unwanted)

# Step 5: Check for percentage of missing values in each column
def print_null_percentage(df):
    """Print the share of missing values per column and pass ``df`` through.

    The printed numbers are fractions between 0 and 1 (missing count divided
    by the number of rows), even though the heading says "percentage".
    """
    missing_per_column = df.isnull().sum()
    print('The percentage of missing values in each column')
    print(missing_per_column / len(df))
    return df

# Step 6: Forward fill missing values
def forward_fill(df):
    """Return a new frame in which each gap is filled with the previous row's value."""
    filled = df.ffill()
    return filled

# Step 7: Check whether there are missing values left
def check_missing_values(df):
    """Print the number of missing values left in each column and pass ``df`` through."""
    remaining = df.isnull().sum()
    print('Check whether there are missing values left')
    print(remaining)
    return df

# Step 8: Calculate summary statistics for daily rain sum
def daily_rain_sum(df):
    """Print ``describe()`` statistics for ``LC_DAILYRAIN`` and pass ``df`` through."""
    rain_stats = df['LC_DAILYRAIN'].describe()
    print('Summary statistics for daily rain sum')
    print(rain_stats)
    return df

# Step 9: Calculate fraction of non-zero values in the 'LC_DAILYRAIN' column
def non_zero_fraction(df):
    """Print the fraction of rows whose ``LC_DAILYRAIN`` is a real non-zero value.

    Missing values are not counted as rain (a plain non-zero count would
    include NaN), and an empty frame reports ``nan`` instead of raising a
    division-by-zero error.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``LC_DAILYRAIN`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` unchanged, so the function can be used as a pipeline step.
    """
    rain = df['LC_DAILYRAIN']
    total_rows = len(df)
    if total_rows == 0:
        non_zero_frac = float('nan')
    else:
        # NaN != 0 evaluates to True, so missing entries are masked out explicitly
        nonzero_count = int((rain.notna() & (rain != 0)).sum())
        non_zero_frac = nonzero_count / total_rows
    print("Fraction of non-zero values:", non_zero_frac)
    return df

# Define the pipeline
# Steps run in the order listed: concatenate, shift and split timestamps,
# trim columns, report and fill gaps, then print rain diagnostics.
pipeline_meteo = Pipeline(
    steps=[
        ('concatenate_datasets', FunctionTransformer(concatenate_datasets)),
        ('convert_utc_to_cest', FunctionTransformer(convert_utc_to_cest)),
        ('convert_time', FunctionTransformer(convert_time)),
        ('drop_columns', FunctionTransformer(drop_columns)),
        ('print_null_percentage', FunctionTransformer(print_null_percentage)),
        ('forward_fill', FunctionTransformer(forward_fill)),
        ('check_missing_values', FunctionTransformer(check_missing_values)),
        ('daily_rain_sum', FunctionTransformer(daily_rain_sum)),
        ('non_zero_fraction', FunctionTransformer(non_zero_fraction)),
    ]
)

### Apply the pipeline and generate hourly, daily, and monthly meteo data

In [59]:
# Apply the pipeline to the list of quarterly meteo frames.
# The diagnostic steps print their reports as a side effect of this call.
meteo_combined_df = pipeline_meteo.fit_transform(dfs)
meteo_combined_df.head()

The percentage of missing values in each column
LC_RAININ       0.056770
LC_DAILYRAIN    0.056770
LC_WINDDIR      0.056770
LC_WINDSPEED    0.056770
Month           0.000000
Day             0.000000
Hour            0.000000
LC_TEMP_QCL3    0.062285
DATECEST        0.000000
dtype: float64
Check whether there are missing values left
LC_RAININ       0
LC_DAILYRAIN    0
LC_WINDDIR      0
LC_WINDSPEED    0
Month           0
Day             0
Hour            0
LC_TEMP_QCL3    0
DATECEST        0
dtype: int64
Summary statistics for daily rain sum
count    5.546880e+06
mean     1.319783e-03
std      6.177559e-03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.540000e-01
Name: LC_DAILYRAIN, dtype: float64
Fraction of non-zero values: 0.17391488548517364


Unnamed: 0,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,Month,Day,Hour,LC_TEMP_QCL3,DATECEST
0,0.0,0.0,-169.0,0.43,1,1,2,13.048027,2022-01-01 02:10:00
1,0.0,0.0,-170.0,0.33,1,1,2,12.985849,2022-01-01 02:20:00
2,0.0,0.0,-167.0,0.46,1,1,2,12.950322,2022-01-01 02:30:00
3,0.0,0.0,-160.0,0.52,1,1,2,12.94955,2022-01-01 02:40:00
4,0.0,0.0,-166.0,0.51,1,1,2,12.952268,2022-01-01 02:50:00


In [60]:
# Create dataframe per hour

# Aggregation applied to each measurement column within a (Month, Day, Hour) group.
# TODO: LC_DAILYRAIN is cumulative, so 'last' would be the natural choice; 'mean'
# is kept for now because it makes the graphs look OK.
# NOTE(review): LC_WINDDIR is averaged arithmetically; confirm that is acceptable
# for a direction measured in degrees.
aggregations = {
    'LC_RAININ': 'mean',
    'LC_DAILYRAIN': 'mean',
    'LC_WINDDIR': 'mean',
    'LC_WINDSPEED': 'mean',
    'LC_TEMP_QCL3': 'mean',
}

# Aggregate only the columns listed above, so the DATECEST timestamp column
# never takes part in the mean computation.
meteo_per_hour = (
    meteo_combined_df
    .groupby(['Month', 'Day', 'Hour'])
    .agg(aggregations)
    .reset_index()
)
meteo_per_hour.head()

Unnamed: 0,Month,Day,Hour,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_TEMP_QCL3
0,1,1,0,2.3e-05,0.002997,-33.566358,1.487099,15.513391
1,1,1,1,1.9e-05,0.002174,-29.188272,1.465571,15.770757
2,1,1,2,3e-06,0.00036,-18.197324,0.389565,13.100358
3,1,1,3,7e-06,0.0,-16.227891,0.222602,12.669197
4,1,1,4,9e-06,0.0,-13.710884,0.217194,12.520271


In [61]:
# Create dataframe per day

# Mean of every remaining column within each (Month, Day) group.
# This calls .mean() directly; the `aggregations` mapping is not applied here.
meteo_per_day = (
    meteo_combined_df
    .groupby(['Month', 'Day'])
    .mean()
    .reset_index()
)
meteo_per_day.head()

Unnamed: 0,Month,Day,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,Hour,LC_TEMP_QCL3
0,1,1,4e-06,0.000275,-7.446286,0.41486,11.400646,12.524393
1,1,2,0.000654,0.002608,-25.975694,0.649436,11.5,12.004777
2,1,3,0.000675,0.0066,-37.386338,0.711017,11.5,9.769569
3,1,4,0.000506,0.003867,-23.273101,0.344787,11.5,7.15832
4,1,5,8.9e-05,0.000738,-44.45316,0.603273,11.5,3.790048


In [62]:
# Create dataframe per month

# Mean of every remaining column within each Month group.
# This calls .mean() directly; the `aggregations` mapping is not applied here.
meteo_per_month = (
    meteo_combined_df
    .groupby(['Month'])
    .mean()
    .reset_index()
)
meteo_per_month.head()

Unnamed: 0,Month,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,Day,Hour,LC_TEMP_QCL3
0,1,0.000112,0.001034,-16.3077,0.339932,15.995544,11.496766,4.733596
1,2,0.000131,0.001263,-25.317653,0.74151,14.5,11.5,6.929743
2,3,1e-05,0.000104,12.215986,0.250748,16.0,11.5,8.108503
3,4,5.4e-05,0.000504,3.631846,0.369361,15.504041,11.503034,10.690818
4,5,7.6e-05,0.000653,-9.01314,0.240605,16.0,11.5,15.568973


In [63]:
# Export the aggregated meteo frames to CSV.
# The files only need to be written once, so the export is switched off by
# default; set EXPORT_METEO_CSV = True to (re)generate them.
EXPORT_METEO_CSV = False

if EXPORT_METEO_CSV:
    meteo_per_hour.to_csv('hourly_weatherdata_2022.csv', index=False)
    meteo_per_day.to_csv('daily_weatherdata_2022.csv', index=False)
    meteo_per_month.to_csv('monthly_weatherdata_2022.csv', index=False)

In [64]:
# Drop the references to the meteo frames to reduce memory use
del dfs, meteo_combined_df, meteo_per_hour, meteo_per_day, meteo_per_month

## Noise level data

### Loading datasets

- January 

In [4]:
# January noise exports: one CSV per `mp-*` location
urls_jan = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_jan = []
for url_jan in urls_jan:
    df_jan = pd.read_csv(url_jan, sep=';', header=0)
    dfs_jan.append(df_jan)

# Now we have a list of DataFrames for each URL called dfs_jan

- February

In [5]:
# February noise exports: one CSV per `mp-*` location
urls_feb = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_feb = []
for url_feb in urls_feb:
    df_feb = pd.read_csv(url_feb, sep=';', header=0)
    dfs_feb.append(df_feb)

# Now we have a list of DataFrames for each URL called dfs_feb

- March

In [6]:
# March noise exports: one CSV per `mp-*` location
urls_mar = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_mar = []
for url_mar in urls_mar:
    df_mar = pd.read_csv(url_mar, sep=';', header=0)
    dfs_mar.append(df_mar)

- April

In [7]:
# April noise exports: one CSV per `mp-*` location
urls_apr = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_apr = []
for url_apr in urls_apr:
    df_apr = pd.read_csv(url_apr, sep=';', header=0)
    dfs_apr.append(df_apr)

- May

In [8]:
# May noise exports: one CSV per `mp-*` location
urls_may = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_may = []
for url_may in urls_may:
    df_may = pd.read_csv(url_may, sep=';', header=0)
    dfs_may.append(df_may)


- June

In [9]:
# June noise exports: one CSV per `mp-*` location.
# The list is named `urls_jun` (plural), like the other months, so the loop
# variable `url_jun` no longer overwrites the list it iterates over and the
# loop can safely be re-run.
urls_jun = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_303910_mp-04-his-hears.csv',
]

# Create an empty list to store the DataFrames
dfs_jun = []

# Read each semicolon-separated file (first row is the header), in list order
for url_jun in urls_jun:
    df_jun = pd.read_csv(url_jun, header=0, sep=';')
    dfs_jun.append(df_jun)

- July

In [10]:
# July noise exports: one CSV per `mp-*` location
urls_jul = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_jul = []
for url_jul in urls_jul:
    df_jul = pd.read_csv(url_jul, sep=';', header=0)
    dfs_jul.append(df_jul)

- August

In [11]:
# August noise exports: one CSV per `mp-*` location
urls_aug = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_aug = []
for url_aug in urls_aug:
    df_aug = pd.read_csv(url_aug, sep=';', header=0)
    dfs_aug.append(df_aug)

- September

In [12]:
# September noise exports: one CSV per `mp-*` location
urls_sep = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_sep = []
for url_sep in urls_sep:
    df_sep = pd.read_csv(url_sep, sep=';', header=0)
    dfs_sep.append(df_sep)

- October

In [13]:
# October noise exports: one CSV per `mp-*` location
urls_oct = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_oct = []
for url_oct in urls_oct:
    df_oct = pd.read_csv(url_oct, sep=';', header=0)
    dfs_oct.append(df_oct)

- November

In [14]:
# November noise exports: one CSV per `mp-*` location
urls_nov = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_nov = []
for url_nov in urls_nov:
    df_nov = pd.read_csv(url_nov, sep=';', header=0)
    dfs_nov.append(df_nov)

- December

In [15]:
# December noise exports: one CSV per `mp-*` location
urls_dec = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_303910_mp-04-his-hears.csv',
]

# Read each semicolon-separated file (first row is the header), in list order
dfs_dec = []
for url_dec in urls_dec:
    df_dec = pd.read_csv(url_dec, sep=';', header=0)
    dfs_dec.append(df_dec)

In [16]:
# Monthly lists of noise DataFrames, ordered January through December
dfs_2022 = [
    dfs_jan, dfs_feb, dfs_mar, dfs_apr, dfs_may, dfs_jun,
    dfs_jul, dfs_aug, dfs_sep, dfs_oct, dfs_nov, dfs_dec,
]

### Preprocessing for modelling purpose

##### Build pipeline

In [17]:
## A pipeline to generate data for modelling purpose
# Step 1: Concatenate datasets
def concatenate_datasets(dfs):
    """Stack a sequence of DataFrames row-wise and renumber the index from 0."""
    combined = pd.concat(dfs, ignore_index=True)
    return combined

# Step 2: Convert timestamps to datetime
def convert_to_datetime(df):
    """Parse ``result_timestamp`` (day/month/year with fractional seconds) in place.

    Returns the same frame so the function can be used as a pipeline step.
    """
    timestamp_format = '%d/%m/%Y %H:%M:%S.%f'
    raw_timestamps = df['result_timestamp']
    df['result_timestamp'] = pd.to_datetime(raw_timestamps, format=timestamp_format)
    return df

# Step 3: Extract month, day and hour from timestamps (minute extraction is currently disabled)
def extract_time(df):
    """Add ``month``, ``day`` and ``hour`` columns taken from ``result_timestamp``.

    The frame is modified in place and returned. A ``minute`` column is not
    produced.
    """
    stamps = df['result_timestamp'].dt
    for part in ('month', 'day', 'hour'):
        df[part] = getattr(stamps, part)
    return df

# Step 4: Drop columns
def drop_columns(df):
    """Return a copy of ``df`` without any column outside the noise keep-list."""
    keep = {'description', 'lamax', 'laeq', 'month', 'day', 'hour'}  # add 'minute' if it is ever extracted
    unwanted = [column for column in df.columns if column not in keep]
    return df.drop(columns=unwanted)

# Define the pipeline
# Modelling pipeline: concatenate, parse timestamps, split out calendar
# fields, then keep only the modelling columns (no gap filling here).
pipeline_modelling = Pipeline(
    steps=[
        ('concatenate_datasets', FunctionTransformer(concatenate_datasets)),
        ('convert_to_datetime', FunctionTransformer(convert_to_datetime)),
        ('extract_time', FunctionTransformer(extract_time)),
        ('drop_columns', FunctionTransformer(drop_columns)),
    ]
)

##### Apply the pipeline

In [None]:
# Apply the pipeline to get modelling dataset.
# Each element of dfs_2022 is one month's list of DataFrames; the pipeline's
# first step concatenates that list, so the result is one frame per month.
transformed_modelling_datasets = []
for df_2022 in dfs_2022:
    transformed_modelling_dataset = pipeline_modelling.fit_transform(df_2022)
    transformed_modelling_datasets.append(transformed_modelling_dataset)

In [25]:
# Combine the per-month frames into one modelling dataset with a fresh 0..n-1 index
data_modelling = pd.concat(transformed_modelling_datasets, ignore_index=True)

: 

In [24]:
# Preview only the first rows; rendering 100000 rows bloats the notebook output
data_modelling.head()

NameError: name 'data_modelling' is not defined

In [21]:
# Count the missing entries per column of the modelling dataset
missing_values = data_modelling.isna().sum()
print(missing_values)

description       0
lamax          1454
laeq             87
month             0
day               0
hour              0
dtype: int64


In [22]:
# Export the modelling dataset to CSV without the index column.
# NOTE(review): the original note said to comment this out after the first run,
# but the line is active, so every run rewrites 'noise_for_modelling.csv'.
data_modelling.to_csv('noise_for_modelling.csv', index=False)  

In [23]:
# Drop the reference to the combined modelling frame to reduce memory use.
# NOTE(review): transformed_modelling_datasets still references the per-month
# frames; delete it as well if nothing later needs it.
del data_modelling

### For visualization purpose

##### Build pipelines

- Overall dataset preprocessing pipeline

In [None]:
# Step 1: Concatenate datasets
def concatenate_datasets(dfs):
    """Stack a sequence of DataFrames row-wise and renumber the index from 0."""
    combined = pd.concat(dfs, ignore_index=True)
    return combined

# Step 2: Convert timestamps to datetime
def convert_to_datetime(df):
    """Parse ``result_timestamp`` (day/month/year with fractional seconds) in place.

    Returns the same frame so the function can be used as a pipeline step.
    """
    timestamp_format = '%d/%m/%Y %H:%M:%S.%f'
    raw_timestamps = df['result_timestamp']
    df['result_timestamp'] = pd.to_datetime(raw_timestamps, format=timestamp_format)
    return df

# Step 3: Extract month, day and hour from timestamps (minute extraction is currently disabled)
def extract_time(df):
    """Add ``month``, ``day`` and ``hour`` columns taken from ``result_timestamp``.

    The frame is modified in place and returned. A ``minute`` column is not
    produced.
    """
    stamps = df['result_timestamp'].dt
    for part in ('month', 'day', 'hour'):
        df[part] = getattr(stamps, part)
    return df

# Step 4: Drop columns
def drop_columns(df):
    """Return a copy of ``df`` without any column outside the noise keep-list."""
    keep = {'description', 'lamax', 'laeq', 'month', 'day', 'hour'}  # add 'minute' if it is ever extracted
    unwanted = [column for column in df.columns if column not in keep]
    return df.drop(columns=unwanted)

# Step 5: Forward fill missing values
def forward_fill(df):
    """Return a new frame in which each gap is filled with the previous row's value."""
    filled = df.ffill()
    return filled

# Define the pipeline
# General pipeline: the modelling steps followed by a forward fill of the gaps
pipeline_general = Pipeline(
    steps=[
        ('concatenate_datasets', FunctionTransformer(concatenate_datasets)),
        ('convert_to_datetime', FunctionTransformer(convert_to_datetime)),
        ('extract_time', FunctionTransformer(extract_time)),
        ('drop_columns', FunctionTransformer(drop_columns)),
        ('forward_fill', FunctionTransformer(forward_fill)),
    ]
)

- Continue to get hourly, daily, and monthly aggregated data

In [None]:
## Hourly pipeline
# Step 6: Perform groupby to create dataframe per hour
def perform_groupby(df):
    """Average every other column per (month, day, hour, description) group."""
    group_keys = ['month', 'day', 'hour', 'description']
    return df.groupby(group_keys).mean()

# Step 7: Reset index
def reset_index_func(df):
    """Move the index levels back into ordinary columns."""
    flattened = df.reset_index()
    return flattened

# Step 8: Standardize the data
def standardize_columns(df, columns_to_standardize):
    """Append ``<column>_standardized`` columns produced by ``StandardScaler``.

    The frame is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to scale.
    columns_to_standardize : list of str
        Names of the columns passed to the scaler.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra ``*_standardized`` column per requested column.
    """
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    new_columns = [column + '_standardized' for column in columns_to_standardize]
    # Carry df's own index: a fresh 0..n-1 index would be aligned by label on
    # assignment and produce NaN whenever df is not indexed 0..n-1.
    df[new_columns] = pd.DataFrame(
        np.asarray(standardized_values), columns=new_columns, index=df.index
    )
    return df

# Step 9: Define a custom transformer to create the new column
class DateTransformer:
    """Pipeline step that adds a formatted ``date`` string column to hourly data.

    The year is fixed to 2022. A helper ``year`` column is left on the frame.
    """

    def transform(self, df):
        """Add ``year`` and ``date`` columns to ``df`` in place and return it.

        ``date`` is formatted as ``'%H:%M %d-%m-%Y'`` and built from the
        ``day``, ``month`` and ``hour`` columns, which are cast to integers.
        """
        df['year'] = 2022
        # Build all timestamps in one vectorised call instead of formatting
        # and parsing a string per row.
        components = df[['year', 'month', 'day', 'hour']].astype(int)
        df['date'] = pd.to_datetime(components).dt.strftime('%H:%M %d-%m-%Y')
        return df

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the object works inside a Pipeline."""
        return self
    
# Step 10: Drop the year column
def drop_year_column(df):
    """Return a copy of ``df`` without the helper ``year`` column."""
    return df.drop(columns=['year'])

# Define the pipeline
# Hourly pipeline: aggregate per hour, flatten the index, scale the noise
# levels, then attach the formatted date label and drop the helper year.
pipeline_hourly = Pipeline(
    steps=[
        ('groupby', FunctionTransformer(perform_groupby)),
        ('reset_index', FunctionTransformer(reset_index_func)),
        (
            'standardize_columns',
            FunctionTransformer(
                standardize_columns,
                kw_args={'columns_to_standardize': ['lamax', 'laeq']},
            ),
        ),
        ('date_transformer', DateTransformer()),
        ('drop_year_column', FunctionTransformer(drop_year_column)),
    ]
)

In [None]:
## Daily pipeline
# Step 6: Perform groupby to create dataframe per day
def perform_groupby(df):
    """Average every other column per (month, day, description) group."""
    group_keys = ['month', 'day', 'description']
    return df.groupby(group_keys).mean()

# Step 7: Reset index
def reset_index_func(df):
    """Move the index levels back into ordinary columns."""
    flattened = df.reset_index()
    return flattened

# Step 8: Standardize the data
def standardize_columns(df, columns_to_standardize):
    """Append ``<column>_standardized`` columns produced by ``StandardScaler``.

    The frame is modified in place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to scale.
    columns_to_standardize : list of str
        Names of the columns passed to the scaler.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra ``*_standardized`` column per requested column.
    """
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    new_columns = [column + '_standardized' for column in columns_to_standardize]
    # Carry df's own index: a fresh 0..n-1 index would be aligned by label on
    # assignment and produce NaN whenever df is not indexed 0..n-1.
    df[new_columns] = pd.DataFrame(
        np.asarray(standardized_values), columns=new_columns, index=df.index
    )
    return df

# Step 9: Drop unwanted columns
def drop_columns(df):
    """Return a copy of ``df`` without the ``hour`` column.

    Note: this rebinds the name ``drop_columns`` that earlier cells of this
    notebook also define with a different body.
    """
    return df.drop(columns=['hour'])

# Step 10: Define a custom transformer to create the new column
class DateTransformer:
    """Pipeline step that adds a formatted ``date`` string column to daily data.

    The year is fixed to 2022. A helper ``year`` column is left on the frame.
    """

    def transform(self, df):
        """Add ``year`` and ``date`` columns to ``df`` in place and return it.

        ``date`` is formatted as ``'%d-%m-%Y'`` and built from the ``day`` and
        ``month`` columns, which are cast to integers.
        """
        df['year'] = 2022
        # Build all dates in one vectorised call instead of formatting and
        # parsing a string per row.
        components = df[['year', 'month', 'day']].astype(int)
        df['date'] = pd.to_datetime(components).dt.strftime('%d-%m-%Y')
        return df

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the object works inside a Pipeline."""
        return self
    
# Step 11: Remove the 'year' column
def drop_year_column(df):
    """Return a new frame equal to `df` minus its 'year' column."""
    without_year = df.drop(labels='year', axis='columns')
    return without_year

# Assemble the daily pipeline; the steps run in the order listed.
# Each step keeps a reference to the function object that exists when this
# cell runs, so redefining those function names later does not change it.
pipeline_daily = Pipeline(steps=[
    ('groupby', FunctionTransformer(func=perform_groupby)),
    ('reset_index', FunctionTransformer(func=reset_index_func)),
    (
        'standardize_columns',
        FunctionTransformer(
            func=standardize_columns,
            kw_args={'columns_to_standardize': ['lamax', 'laeq']},
        ),
    ),
    ('drop_columns', FunctionTransformer(func=drop_columns)),
    ('date_transformer', DateTransformer()),
    ('drop_year_column', FunctionTransformer(func=drop_year_column)),
])

In [None]:
## Monthly pipeline
# Step 6: Aggregate to one row per (month, description)
def perform_groupby(df):
    """Average every non-key column within each (month, description) group.

    The group keys become the index of the returned frame.
    """
    group_keys = ['month', 'description']
    grouped = df.groupby(by=group_keys)
    return grouped.mean()

# Step 7: Turn the index levels back into ordinary columns
def reset_index_func(df):
    """Return `df` with its index moved into columns and a fresh 0..n-1 index."""
    flattened = df.reset_index(drop=False)
    return flattened

# Step 8: Standardize the data
def standardize_columns(df, columns_to_standardize):
    """Add a '<name>_standardized' column for each requested column.

    A new StandardScaler is fitted on `df[columns_to_standardize]` and applied to
    the same data. `df` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to scale.
    columns_to_standardize : list of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the extra columns appended.
    """
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    new_columns = [f'{column}_standardized' for column in columns_to_standardize]
    # Reuse df's own index: assignment aligns on index labels, so a default
    # RangeIndex here would yield NaN/misplaced values whenever df is not
    # indexed 0..n-1.
    df[new_columns] = pd.DataFrame(standardized_values, columns=new_columns, index=df.index)
    return df

# Step 9: Drop the 'day' and 'hour' columns
def drop_columns(df):
    """Return a new frame equal to `df` minus its 'day' and 'hour' columns."""
    return df.drop(labels=['day', 'hour'], axis='columns')

# Step 10: Define a custom transformer to create the new column
class DateTransformer:
    """Pipeline step that adds a '%b %Y' string column named 'date' (e.g. 'Jan 2022').

    Every row is stamped with the year 2022 before the date is built.
    """

    def transform(self, df):
        """Add 'year' and 'date' columns to `df` in place and return `df`.

        'month' is truncated to an integer before the date is built. The date is
        assembled for the whole frame at once instead of parsing one string per
        row, which also makes an empty frame work.

        Raises
        ------
        KeyError
            If `df` has no 'month' column.
        ValueError
            If a month value is not a valid calendar month.
        """
        df['year'] = 2022
        date_parts = pd.DataFrame({
            'year': df['year'],
            'month': df['month'].astype(int),
            # The first of the month stands in for the missing day.
            'day': 1,
        })
        df['date'] = pd.to_datetime(date_parts).dt.strftime('%b %Y')
        return df

    def fit(self, df, y=None):
        """No-op fit: nothing is learned; returns this transformer."""
        return self
    
# Step 11: Remove the 'year' column
def drop_year_column(df):
    """Return a new frame equal to `df` minus its 'year' column."""
    without_year = df.drop(labels='year', axis='columns')
    return without_year

# Assemble the monthly pipeline; the steps run in the order listed.
pipeline_monthly = Pipeline(steps=[
    ('groupby', FunctionTransformer(func=perform_groupby)),
    ('reset_index', FunctionTransformer(func=reset_index_func)),
    (
        'standardize_columns',
        FunctionTransformer(
            func=standardize_columns,
            kw_args={'columns_to_standardize': ['lamax', 'laeq']},
        ),
    ),
    ('drop_columns', FunctionTransformer(func=drop_columns)),
    ('date_transformer', DateTransformer()),
    ('drop_year_column', FunctionTransformer(func=drop_year_column)),
])

##### Apply the pipelines

In [None]:
# Apply the general pipeline to each dataframe in dfs_2022 separately,
# collecting the transformed frames in a list.
transformed_overall_datasets = []
for df_2022 in dfs_2022:
    transformed_overall_dataset = pipeline_general.fit_transform(df_2022)
    transformed_overall_datasets.append(transformed_overall_dataset)

# Stack the transformed frames into one frame with a fresh 0..n-1 index.
combined = pd.concat(transformed_overall_datasets, ignore_index=True)
# Preview the result (at most the first 100000 rows).
combined.head(100000)

In [None]:
# Run the hourly pipeline on the combined frame.
combined_hourly = pipeline_hourly.fit_transform(combined)

# Preview the result (at most the first 100000 rows).
combined_hourly.head(100000)

In [None]:
# Run the daily pipeline on the combined frame.
combined_daily = pipeline_daily.fit_transform(combined)

# Preview the result (at most the first 100000 rows).
combined_daily.head(100000)

In [None]:
# Run the monthly pipeline on the combined frame.
combined_monthly = pipeline_monthly.fit_transform(combined)

# Preview the result (at most the first 100000 rows).
combined_monthly.head(100000)

In [None]:
# Print the number of missing values per column for each aggregated frame.
print(combined_hourly.isnull().sum())
print(combined_daily.isnull().sum())
print(combined_monthly.isnull().sum())

In [None]:
# The export below sits inside a string literal, so it does not run.
# Remove the surrounding triple quotes to write the three CSV files.
"""
# exporting file (only needs to be run one time so comment it out)
combined_hourly.to_csv('hourly_noisedata_2022.csv', index=False)  
combined_daily.to_csv('daily_noisedata_2022.csv', index=False) 
combined_monthly.to_csv('monthly_noisedata_2022.csv', index=False) 
"""

In [None]:
# The cleanup below sits inside a string literal, so it does not run.
# Remove the surrounding triple quotes to delete the three aggregated frames.
"""
# Delete the separate dataframes to minimize memory usage
del combined_hourly
del combined_daily
del combined_monthly
"""

## Noise events

#### Load events data

In [30]:
# Load the events data
# Directory that holds the exported event CSV files. This is the only value
# that needs changing to run the notebook on another machine.
EVENT_DATA_DIR = '/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data'


def _read_event_csv(filename):
    """Read one semicolon-delimited event export located in EVENT_DATA_DIR."""
    return pd.read_csv(f'{EVENT_DATA_DIR}/{filename}', delimiter=';')


event_mp01 = _read_event_csv('csv_results_41_255439_mp-01-naamsestraat-35-maxim.csv')
event_mp02 = _read_event_csv('csv_results_41_255440_mp-02-naamsestraat-57-xior.csv')
event_mp03 = _read_event_csv('csv_results_41_255441_mp-03-naamsestraat-62-taste.csv')
event_mp04 = _read_event_csv('csv_results_41_303910_mp-04-his-hears.csv')
event_mp05 = _read_event_csv('csv_results_41_255442_mp-05-calvariekapel-ku-leuven.csv')
event_mp06 = _read_event_csv('csv_results_41_255443_mp-06-parkstraat-2-la-filosovia.csv')
event_mp07 = _read_event_csv('csv_results_41_255444_mp-07-naamsestraat-81.csv')
# The kiosk-stadspark export is deliberately not loaded:
#event_mp08stadspark = _read_event_csv('csv_results_41_255445_mp-08-kiosk-stadspark.csv')
event_mp08Vrijthof = _read_event_csv('csv_results_41_280324_mp08bis---vrijthof.csv')

#### Preprocess the events data for merging

- Build pipeline

In [85]:
# Step 1: Stack the individual dataframes
def concatenate_datasets(dfs):
    """Concatenate the frames in `dfs` row-wise, renumbering the index 0..n-1."""
    stacked = pd.concat(objs=dfs, axis=0, ignore_index=True)
    return stacked

# Step 2: Keep only the columns needed for the merge
def drop_columns(df):
    """Return a new frame holding only the event columns listed in `wanted`.

    Columns from `wanted` that are absent in `df` are silently ignored.
    """
    wanted = {
        'description',
        'result_timestamp',
        'noise_event_laeq_primary_detected_certainty',
        'noise_event_laeq_primary_detected_class',
    }
    unwanted = [name for name in df.columns if name not in wanted]
    return df.drop(columns=unwanted)

# Step 3: Add the certainty as a percentage string
def percentage_column(df):
    """Add 'certainty_percentage' to `df` in place and return `df`.

    Each non-missing certainty value becomes the string '<value>%'; missing
    values stay NaN.
    """
    def _as_percent(value):
        if pd.isnull(value):
            return np.nan
        return f"{value}%"

    certainty = df['noise_event_laeq_primary_detected_certainty']
    df['certainty_percentage'] = certainty.apply(_as_percent)
    return df

# Step 4: Parse 'result_timestamp' and split it into its components
def extract_time(df):
    """Parse 'result_timestamp' and add one column per time component.

    The timestamp column is replaced by its parsed datetime values, and
    'month', 'day', 'hour', 'minute', 'second' and 'milliseconds' columns are
    added. `df` is modified in place and returned.
    """
    timestamps = pd.to_datetime(df['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
    df['result_timestamp'] = timestamps
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        df[part] = getattr(timestamps.dt, part)
    # Whole milliseconds: integer-divide the microsecond component by 1000.
    df['milliseconds'] = timestamps.dt.microsecond // 1000
    return df

# Assemble the event pipeline; the steps run in the order listed.
# Each step keeps a reference to the function object that exists when this
# cell runs, so redefining those function names later does not change it.
pipeline_merge_event = Pipeline(steps=[
    ('concatenate_datasets', FunctionTransformer(func=concatenate_datasets)),
    ('drop_columns', FunctionTransformer(func=drop_columns)),
    ('percentage_column', FunctionTransformer(func=percentage_column)),
    ('extract_time', FunctionTransformer(func=extract_time)),
])

- Apply pipeline

In [86]:
# Collect the event dataframes to feed into the event pipeline.
# mp08stadspark is left out because it is not used in the noise data.
events = [event_mp01,event_mp02,event_mp03,event_mp04,event_mp05,event_mp06,event_mp07,event_mp08Vrijthof]

In [89]:
# Run the event pipeline on the list of event dataframes.
combined_event = pipeline_merge_event.fit_transform(events)
# Preview the result (at most the first 1000000 rows).
combined_event.head(1000000)

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds
0,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:27:21.737,,,,2,28,8,27,21,737
1,MP 01: Naamsestraat 35 Maxim,2022-02-28 13:58:21.356,,,,2,28,13,58,21,356
2,MP 01: Naamsestraat 35 Maxim,2022-02-28 16:43:15.393,,,,2,28,16,43,15,393
3,MP 01: Naamsestraat 35 Maxim,2022-02-28 19:22:48.428,,,,2,28,19,22,48,428
4,MP 01: Naamsestraat 35 Maxim,2022-02-28 20:32:20.440,,,,2,28,20,32,20,440
...,...,...,...,...,...,...,...,...,...,...,...
81051,MP08bis - Vrijthof,2022-12-29 09:08:11.171,99.0,Human voice - Shouting,99.0%,12,29,9,8,11,171
81052,MP08bis - Vrijthof,2022-12-30 13:54:27.224,99.0,Nature elements - Wind,99.0%,12,30,13,54,27,224
81053,MP08bis - Vrijthof,2022-12-30 13:56:57.225,0.0,Unsupported,0.0%,12,30,13,56,57,225
81054,MP08bis - Vrijthof,2022-12-30 15:09:33.233,100.0,Nature elements - Wind,100.0%,12,30,15,9,33,233


#### Preprocess the noise data for merging

In [36]:
# Pipeline on noise data for merging purpose
# Step 1: Parse the timestamp strings
def convert_to_datetime(df):
    """Replace 'result_timestamp' with parsed datetimes, in place, and return `df`."""
    parsed = pd.to_datetime(df['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
    df['result_timestamp'] = parsed
    return df

# Step 2: Split the parsed timestamps into their components
def extract_time(df):
    """Add one column per time component of 'result_timestamp'.

    Expects 'result_timestamp' to already hold datetimes. Adds 'month', 'day',
    'hour', 'minute', 'second' and 'milliseconds' to `df` in place and returns it.
    """
    stamp = df['result_timestamp'].dt
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        df[part] = getattr(stamp, part)
    # Whole milliseconds: integer-divide the microsecond component by 1000.
    df['milliseconds'] = stamp.microsecond // 1000
    return df

# Step 3: Keep only the columns needed for the merge
def drop_columns(df):
    """Return a new frame holding only the noise columns listed in `wanted`.

    Columns from `wanted` that are absent in `df` are silently ignored.
    """
    wanted = {
        'description', 'lamax', 'laeq',
        'month', 'day', 'hour', 'minute', 'second', 'milliseconds',
    }
    unwanted = [name for name in df.columns if name not in wanted]
    return df.drop(columns=unwanted)

# Step 4: Fill gaps with the previous row's value
def forward_fill(df):
    """Return a new frame in which each missing value takes the last valid value above it."""
    filled = df.ffill(axis=0)
    return filled

# Assemble the noise pipeline; the steps run in the order listed.
pipeline_merge_noise = Pipeline(steps=[
    ('convert_to_datetime', FunctionTransformer(func=convert_to_datetime)),
    ('extract_time', FunctionTransformer(func=extract_time)),
    ('drop_columns', FunctionTransformer(func=drop_columns)),
    ('forward_fill', FunctionTransformer(func=forward_fill)),
])

#### Apply the pipeline and merge the data month by month (to keep memory use low and avoid crashing the kernel)

In [None]:
# Join keys shared by the event and noise frames: the description plus the
# timestamp components down to the millisecond.
merge_columns = ['description','month','day','hour','minute','second','milliseconds']

- Jan

In [None]:
# Stack the January noise dataframes (dfs_jan) into one frame with a fresh 0..n-1 index
jan = pd.concat(dfs_jan, ignore_index=True)

In [22]:
# Merge the data
# Run the noise pipeline; the result replaces the raw frame under the same name
jan = pipeline_merge_noise.fit_transform(jan)
# Keep only the events whose month is 1
event_jan = combined_event[combined_event['month'] == 1]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_jan = pd.merge(event_jan, jan, on=merge_columns,  how='left')

In [None]:
# Preview the first rows of the merged January frame
merged_event_jan.head()
# One-off export, left disabled:
#merged_event_jan.to_csv('merged_event_jan.csv', index=False)

In [None]:
# Delete the January frames so their memory can be reclaimed before the next month
del jan
del event_jan
del merged_event_jan

- Feb

In [None]:
# Stack the February noise dataframes (dfs_feb) into one frame with a fresh 0..n-1 index
feb = pd.concat(dfs_feb, ignore_index=True)

In [17]:
# Merge the data
# Run the noise pipeline; the result replaces the raw frame under the same name
feb = pipeline_merge_noise.fit_transform(feb)
# Keep only the events whose month is 2
event_feb = combined_event[combined_event['month'] == 2]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_feb = pd.merge(event_feb, feb, on=merge_columns,  how='left')


Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-02-28 08:27:21.737,,,,2,28,8,27,21,737,69.4,68.1
1,MP 01: Naamsestraat 35 Maxim,2022-02-28 13:58:21.356,,,,2,28,13,58,21,356,73.8,72.9
2,MP 01: Naamsestraat 35 Maxim,2022-02-28 16:43:15.393,,,,2,28,16,43,15,393,71.0,70.1
3,MP 01: Naamsestraat 35 Maxim,2022-02-28 19:22:48.428,,,,2,28,19,22,48,428,91.6,87.5
4,MP 01: Naamsestraat 35 Maxim,2022-02-28 20:32:20.440,,,,2,28,20,32,20,440,71.9,69.8


In [None]:
# Preview the first rows of the merged February frame
merged_event_feb.head()
# One-off export, left disabled:
#merged_event_feb.to_csv('merged_event_feb.csv', index=False)

In [None]:
# Delete the February frames so their memory can be reclaimed before the next month
del feb
del event_feb
del merged_event_feb

- Mar

In [None]:
# Stack the March noise dataframes (dfs_mar) into one frame with a fresh 0..n-1 index
mar = pd.concat(dfs_mar, ignore_index=True)

In [None]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as mar_
mar_ = pipeline_merge_noise.fit_transform(mar)
# Keep only the events whose month is 3
event_mar = combined_event[combined_event['month'] == 3]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_mar = pd.merge(event_mar, mar_, on=merge_columns,  how='left')

In [23]:
# Preview the first rows of the merged March frame
merged_event_mar.head()
# One-off export, left disabled:
#merged_event_mar.to_csv('merged_event_mar.csv', index=False)

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:07:59.463,,,,3,1,0,7,59,463,74.1,70.8
1,MP 01: Naamsestraat 35 Maxim,2022-03-01 01:24:17.470,,,,3,1,1,24,17,470,82.1,78.7
2,MP 01: Naamsestraat 35 Maxim,2022-03-01 01:33:16.470,,,,3,1,1,33,16,470,82.4,78.8
3,MP 01: Naamsestraat 35 Maxim,2022-03-01 02:31:01.476,,,,3,1,2,31,1,476,,
4,MP 01: Naamsestraat 35 Maxim,2022-03-01 03:45:02.482,,,,3,1,3,45,2,482,74.8,72.6


In [None]:
# Delete the March frames so their memory can be reclaimed before the next month.
# mar_ (the noise-pipeline output) is a separate frame from mar, so it must be
# deleted too or the processed data stays in memory.
del mar
del mar_
del event_mar
del merged_event_mar

- Apr

In [None]:
# Stack the April noise dataframes (dfs_apr) into one frame with a fresh 0..n-1 index
apr = pd.concat(dfs_apr, ignore_index=True)

In [25]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as apr_
apr_ = pipeline_merge_noise.fit_transform(apr)
# Keep only the events whose month is 4
event_apr = combined_event[combined_event['month'] == 4]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_apr = pd.merge(event_apr, apr_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:24:52.570,97.0,Human voice - Shouting,97.0%,4,1,0,24,52,570,,
1,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:32:15.550,78.0,Transport road - Siren,78.0%,4,1,0,32,15,550,68.8,67.1
2,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:43:03.570,98.0,Human voice - Shouting,98.0%,4,1,0,43,3,570,79.8,74.9
3,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:49:51.570,100.0,Human voice - Shouting,100.0%,4,1,0,49,51,570,72.5,70.0
4,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:57:07.590,100.0,Human voice - Shouting,100.0%,4,1,0,57,7,590,,


In [None]:
# Preview the first rows of the merged April frame
merged_event_apr.head()
# One-off export, left disabled:
#merged_event_apr.to_csv('merged_event_apr.csv', index=False)

In [None]:
# Delete the April frames so their memory can be reclaimed before the next month.
# apr_ (the noise-pipeline output) is a separate frame from apr, so it must be
# deleted too or the processed data stays in memory.
del apr
del apr_
del event_apr
del merged_event_apr

- May

In [None]:
# Stack the May noise dataframes (dfs_may) into one frame with a fresh 0..n-1 index
may = pd.concat(dfs_may, ignore_index=True)

In [38]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as may_
may_ = pipeline_merge_noise.fit_transform(may)
# Keep only the events whose month is 5
event_may = combined_event[combined_event['month'] == 5]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_may = pd.merge(event_may, may_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-05-01 01:14:38.441,0.0,Unsupported,0.0%,5,1,1,14,38,441,,
1,MP 01: Naamsestraat 35 Maxim,2022-05-01 03:52:50.467,97.0,Transport road - Passenger car,97.0%,5,1,3,52,50,467,71.4,68.2
2,MP 01: Naamsestraat 35 Maxim,2022-05-01 08:23:25.636,98.0,Transport road - Passenger car,98.0%,5,1,8,23,25,636,70.4,69.2
3,MP 01: Naamsestraat 35 Maxim,2022-05-01 09:33:58.648,0.0,Unsupported,0.0%,5,1,9,33,58,648,72.7,71.0
4,MP 01: Naamsestraat 35 Maxim,2022-05-01 13:04:54.694,0.0,Unsupported,0.0%,5,1,13,4,54,694,69.7,68.6


In [None]:
# Preview the first rows of the merged May frame
merged_event_may.head()
# One-off export, left disabled:
#merged_event_may.to_csv('merged_event_may.csv', index=False)

In [None]:
# Delete the May frames so their memory can be reclaimed before the next month.
# may_ (the noise-pipeline output) is a separate frame from may, so it must be
# deleted too or the processed data stays in memory.
del may
del may_
del event_may
del merged_event_may

- Jun

In [None]:
# Stack the June noise dataframes (dfs_jun) into one frame with a fresh 0..n-1 index
jun = pd.concat(dfs_jun, ignore_index=True)

In [48]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as jun_
jun_ = pipeline_merge_noise.fit_transform(jun)
# Keep only the events whose month is 6
event_jun = combined_event[combined_event['month'] == 6]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_jun = pd.merge(event_jun, jun_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-06-01 00:03:43.915,,,,6,1,0,3,43,915,65.5,64.7
1,MP 01: Naamsestraat 35 Maxim,2022-06-01 00:34:04.922,,,,6,1,0,34,4,922,,
2,MP 01: Naamsestraat 35 Maxim,2022-06-01 01:23:42.931,,,,6,1,1,23,42,931,,
3,MP 01: Naamsestraat 35 Maxim,2022-06-01 01:45:27.935,,,,6,1,1,45,27,935,77.5,75.2
4,MP 01: Naamsestraat 35 Maxim,2022-06-01 01:46:02.935,,,,6,1,1,46,2,935,69.5,65.8


In [None]:
# Preview the first rows of the merged June frame
merged_event_jun.head()
# One-off export, left disabled:
#merged_event_jun.to_csv('merged_event_jun.csv', index=False)

In [None]:
# Delete the June frames so their memory can be reclaimed before the next month.
# jun_ (the noise-pipeline output) is a separate frame from jun, so it must be
# deleted too or the processed data stays in memory.
del jun
del jun_
del event_jun
del merged_event_jun

- Jul

In [None]:
# Stack the July noise dataframes (dfs_jul) into one frame with a fresh 0..n-1 index
jul = pd.concat(dfs_jul, ignore_index=True)

In [53]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as jul_
jul_ = pipeline_merge_noise.fit_transform(jul)
# Keep only the events whose month is 7
event_jul = combined_event[combined_event['month'] == 7]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_jul = pd.merge(event_jul, jul_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-07-01 00:23:46.162,76.0,Transport road - Passenger car,76.0%,7,1,0,23,46,162,75.5,69.3
1,MP 01: Naamsestraat 35 Maxim,2022-07-01 00:45:47.166,99.0,Transport road - Passenger car,99.0%,7,1,0,45,47,166,,
2,MP 01: Naamsestraat 35 Maxim,2022-07-01 00:54:58.168,99.0,Transport road - Passenger car,99.0%,7,1,0,54,58,168,64.8,64.0
3,MP 01: Naamsestraat 35 Maxim,2022-07-01 01:16:09.174,100.0,Transport road - Passenger car,100.0%,7,1,1,16,9,174,,
4,MP 01: Naamsestraat 35 Maxim,2022-07-01 01:54:53.183,100.0,Transport road - Passenger car,100.0%,7,1,1,54,53,183,66.1,65.4


In [None]:
# Preview the first rows of the merged July frame
merged_event_jul.head()
# One-off export, left disabled:
#merged_event_jul.to_csv('merged_event_jul.csv', index=False)

In [None]:
# Delete the July frames so their memory can be reclaimed before the next month.
# jul_ (the noise-pipeline output) is a separate frame from jul, so it must be
# deleted too or the processed data stays in memory.
del jul
del jul_
del event_jul
del merged_event_jul

- Aug

In [None]:
# Stack the August noise dataframes (dfs_aug) into one frame with a fresh 0..n-1 index
aug = pd.concat(dfs_aug, ignore_index=True)

In [59]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as aug_
aug_ = pipeline_merge_noise.fit_transform(aug)
# Keep only the events whose month is 8
event_aug = combined_event[combined_event['month'] == 8]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_aug = pd.merge(event_aug, aug_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-08-01 00:46:10.988,100.0,Transport road - Passenger car,100.0%,8,1,0,46,10,988,66.6,64.8
1,MP 01: Naamsestraat 35 Maxim,2022-08-01 00:51:07.989,0.0,Unsupported,0.0%,8,1,0,51,7,989,,
2,MP 01: Naamsestraat 35 Maxim,2022-08-01 00:54:51.990,99.0,Transport road - Passenger car,99.0%,8,1,0,54,51,990,,
3,MP 01: Naamsestraat 35 Maxim,2022-08-01 01:25:17.000,100.0,Transport road - Passenger car,100.0%,8,1,1,25,17,0,65.1,64.1
4,MP 01: Naamsestraat 35 Maxim,2022-08-01 01:32:56.200,93.0,Human voice - Shouting,93.0%,8,1,1,32,56,200,73.6,71.1


In [None]:
# Preview the first rows of the merged August frame
merged_event_aug.head()
# One-off export, left disabled:
#merged_event_aug.to_csv('merged_event_aug.csv', index=False)

In [None]:
# Delete the August frames so their memory can be reclaimed before the next month.
# aug_ (the noise-pipeline output) is a separate frame from aug, so it must be
# deleted too or the processed data stays in memory.
del aug
del aug_
del event_aug
del merged_event_aug

- Sep

In [None]:
# Stack the September noise dataframes (dfs_sep) into one frame with a fresh 0..n-1 index
sep = pd.concat(dfs_sep, ignore_index=True)

In [62]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as sep_
sep_ = pipeline_merge_noise.fit_transform(sep)
# Keep only the events whose month is 9
event_sep = combined_event[combined_event['month'] == 9]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_sep = pd.merge(event_sep, sep_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-09-01 00:25:42.664,100.0,Transport road - Passenger car,100.0%,9,1,0,25,42,664,67.2,65.9
1,MP 01: Naamsestraat 35 Maxim,2022-09-01 00:38:50.668,96.0,Transport road - Passenger car,96.0%,9,1,0,38,50,668,67.9,66.0
2,MP 01: Naamsestraat 35 Maxim,2022-09-01 00:39:32.668,100.0,Transport road - Passenger car,100.0%,9,1,0,39,32,668,65.3,64.4
3,MP 01: Naamsestraat 35 Maxim,2022-09-01 01:57:50.692,100.0,Transport road - Passenger car,100.0%,9,1,1,57,50,692,67.7,65.8
4,MP 01: Naamsestraat 35 Maxim,2022-09-01 02:03:06.694,100.0,Transport road - Passenger car,100.0%,9,1,2,3,6,694,66.6,64.1


In [None]:
# Preview the first rows of the merged September frame
merged_event_sep.head()
# One-off export, left disabled:
#merged_event_sep.to_csv('merged_event_sep.csv', index=False)

In [None]:
# Delete the September frames so their memory can be reclaimed before the next month.
# sep_ (the noise-pipeline output) is a separate frame from sep, so it must be
# deleted too or the processed data stays in memory.
del sep
del sep_
del event_sep
del merged_event_sep

- Oct

In [None]:
# Stack the October noise dataframes (dfs_oct) into one frame with a fresh 0..n-1 index
# NOTE: the name `oct` shadows Python's builtin oct() for as long as this variable exists.
oct = pd.concat(dfs_oct, ignore_index=True)

In [65]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as oct_
oct_ = pipeline_merge_noise.fit_transform(oct)
# Keep only the events whose month is 10
event_oct = combined_event[combined_event['month'] == 10]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_oct = pd.merge(event_oct, oct_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-10-01 00:13:05.578,100.0,Transport road - Passenger car,100.0%,10,1,0,13,5,578,66.0,65.4
1,MP 01: Naamsestraat 35 Maxim,2022-10-01 01:00:04.589,100.0,Human voice - Shouting,100.0%,10,1,1,0,4,589,72.3,68.0
2,MP 01: Naamsestraat 35 Maxim,2022-10-01 01:01:19.590,100.0,Human voice - Shouting,100.0%,10,1,1,1,19,590,74.8,69.9
3,MP 01: Naamsestraat 35 Maxim,2022-10-01 01:01:27.590,0.0,Unsupported,0.0%,10,1,1,1,27,590,77.9,74.2
4,MP 01: Naamsestraat 35 Maxim,2022-10-01 01:03:06.590,87.0,Human voice - Shouting,87.0%,10,1,1,3,6,590,70.5,67.9


In [None]:
# Preview the first rows of the merged October frame
merged_event_oct.head()
# One-off export, left disabled:
#merged_event_oct.to_csv('merged_event_oct.csv', index=False)

In [None]:
# Delete the October frames so their memory can be reclaimed before the next month.
# oct_ (the noise-pipeline output) is a separate frame from oct, so it must be
# deleted too or the processed data stays in memory.
del oct
del oct_
del event_oct
del merged_event_oct

- Nov

In [None]:
# Stack the November noise dataframes (dfs_nov) into one frame with a fresh 0..n-1 index
nov = pd.concat(dfs_nov, ignore_index=True)

In [77]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as nov_
nov_ = pipeline_merge_noise.fit_transform(nov)
# Keep only the events whose month is 11
event_nov = combined_event[combined_event['month'] == 11]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_nov = pd.merge(event_nov, nov_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-11-02 10:14:52.835,100.0,Transport road - Passenger car,100.0%,11,2,10,14,52,835,64.8,64.1
1,MP 01: Naamsestraat 35 Maxim,2022-11-02 10:20:30.837,0.0,Unsupported,0.0%,11,2,10,20,30,837,74.5,67.2
2,MP 01: Naamsestraat 35 Maxim,2022-11-02 10:33:00.840,79.0,Transport road - Passenger car,79.0%,11,2,10,33,0,840,68.8,66.5
3,MP 01: Naamsestraat 35 Maxim,2022-11-02 10:36:26.841,95.0,Transport road - Passenger car,95.0%,11,2,10,36,26,841,65.5,64.6
4,MP 01: Naamsestraat 35 Maxim,2022-11-02 10:42:15.843,0.0,Unsupported,0.0%,11,2,10,42,15,843,82.8,77.0


In [None]:
# Preview the first rows of the merged November frame
merged_event_nov.head()
# One-off export, left disabled:
#merged_event_nov.to_csv('merged_event_nov.csv', index=False)

In [None]:
# Delete the November frames so their memory can be reclaimed before the next month.
# nov_ (the noise-pipeline output) is a separate frame from nov, so it must be
# deleted too or the processed data stays in memory.
del nov
del nov_
del event_nov
del merged_event_nov

- Dec

In [None]:
# Stack the December noise dataframes (dfs_dec) into one frame with a fresh 0..n-1 index
dec = pd.concat(dfs_dec, ignore_index=True)

In [79]:
# Merge the data
# Run the noise pipeline; the processed frame is stored as dec_
dec_ = pipeline_merge_noise.fit_transform(dec)
# Keep only the events whose month is 12
event_dec = combined_event[combined_event['month'] == 12]
# Left join: every event row is kept; lamax/laeq are NaN where no noise row matches all merge_columns
merged_event_dec = pd.merge(event_dec, dec_, on=merge_columns,  how='left')

Unnamed: 0,description,result_timestamp,noise_event_laeq_primary_detected_certainty,noise_event_laeq_primary_detected_class,certainty_percentage,month,day,hour,minute,second,milliseconds,lamax,laeq
0,MP 01: Naamsestraat 35 Maxim,2022-12-01 00:00:55.398,0.0,Unsupported,0.0%,12,1,0,0,55,398,82.3,77.9
1,MP 01: Naamsestraat 35 Maxim,2022-12-01 00:01:24.398,98.0,Transport road - Passenger car,98.0%,12,1,0,1,24,398,69.6,66.7
2,MP 01: Naamsestraat 35 Maxim,2022-12-01 00:07:42.399,79.0,Transport road - Passenger car,79.0%,12,1,0,7,42,399,67.8,66.7
3,MP 01: Naamsestraat 35 Maxim,2022-12-01 00:08:30.399,100.0,Human voice - Shouting,100.0%,12,1,0,8,30,399,,
4,MP 01: Naamsestraat 35 Maxim,2022-12-01 00:23:42.402,99.0,Human voice - Shouting,99.0%,12,1,0,23,42,402,76.7,72.0


In [None]:
# Preview the first rows of the merged December frame
merged_event_dec.head()
# One-off export, left disabled:
#merged_event_dec.to_csv('merged_event_dec.csv', index=False)

In [None]:
# Delete the December frames so their memory can be reclaimed.
# dec_ (the noise-pipeline output) is a separate frame from dec, so it must be
# deleted too or the processed data stays in memory.
del dec
del dec_
del event_dec
del merged_event_dec

#### Old preprocessing

- Jan

In [1]:
# Combining the datasets for January
#combined_jan = pd.concat(dfs, ignore_index=True)
#print(combined_jan.head())

#del dfs # deleting the separate dataframes to minimize memory usage


# extract the month, day, hour, minute of "result_timestamp"
#combined_jan['result_timestamp'] = pd.to_datetime(combined_jan['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_jan['month'] = combined_jan['result_timestamp'].dt.month
#combined_jan['day'] = combined_jan['result_timestamp'].dt.day
#combined_jan['hour'] = combined_jan['result_timestamp'].dt.hour
#combined_jan['minute'] = combined_jan['result_timestamp'].dt.minute

#combined_jan.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_jan.columns) - set(columns_to_keep)
#combined_jan.drop(columns=columns_to_drop, inplace=True)
#combined_jan.head()


# check for missing values in each column
#print(combined_jan.isnull().sum())


# forward fill missing values 
#combined_jan.ffill(inplace=True)

# check whether there are missing values left
#print(combined_jan.isnull().sum())


# Create dataframe per hour
#jan_per_hour = combined_jan.groupby(['month', 'day', 'hour', 'description']).mean()
#jan_per_hour = jan_per_hour.reset_index()
#jan_per_hour.head()


# Create dataframe per day
#combined_jan.drop('hour', axis=1, inplace=True)
#jan_per_day= combined_jan.groupby(['month', 'day', 'description']).mean()
#jan_per_day = jan_per_day.reset_index()
#jan_per_day.head()


# Create dataframe per month
#combined_jan.drop('day', axis=1, inplace=True)
#jan_per_month = combined_jan.groupby(['month', 'description']).mean()
#jan_per_month = jan_per_month.reset_index()
#jan_per_month.head()


#del combined_jan

- Feb

In [2]:
# Combining the datasets for February
#combined_feb = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_feb['result_timestamp'] = pd.to_datetime(combined_feb['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_feb['month'] = combined_feb['result_timestamp'].dt.month
#combined_feb['day'] = combined_feb['result_timestamp'].dt.day
#combined_feb['hour'] = combined_feb['result_timestamp'].dt.hour
#combined_feb['minute'] = combined_feb['result_timestamp'].dt.minute

#combined_feb.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_feb.columns) - set(columns_to_keep)
#combined_feb.drop(columns=columns_to_drop, inplace=True)
#combined_feb.head()


# check for missing values in each column
#print(combined_feb.isnull().sum())


# Create dataframe per hour
#feb_per_hour = combined_feb.groupby(['month', 'day', 'hour', 'description']).mean()
#feb_per_hour = feb_per_hour.reset_index()
#print(feb_per_hour.head())

# Create dataframe per day
#combined_feb.drop('hour', axis=1, inplace=True)
#feb_per_day = combined_feb.groupby(['month', 'day', 'description']).mean()
#feb_per_day = feb_per_day.reset_index()
#print(feb_per_day.head())

# Create dataframe per month
#combined_feb.drop('day', axis=1, inplace=True)
#feb_per_month = combined_feb.groupby(['month', 'description']).mean()
#feb_per_month = feb_per_month.reset_index()
#print(feb_per_month.head())

#del combined_feb

- March

In [20]:
# Combining the datasets for March
#combined_mar = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_mar['result_timestamp'] = pd.to_datetime(combined_mar['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_mar['month'] = combined_mar['result_timestamp'].dt.month
#combined_mar['day'] = combined_mar['result_timestamp'].dt.day
#combined_mar['hour'] = combined_mar['result_timestamp'].dt.hour
#combined_mar['minute'] = combined_mar['result_timestamp'].dt.minute

#combined_mar.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_mar.columns) - set(columns_to_keep)
#combined_mar.drop(columns=columns_to_drop, inplace=True)
#combined_mar.head()


# check for missing values in each column
#print(combined_mar.isnull().sum())


# forward fill missing values 
#combined_mar.ffill(inplace=True)

# check whether there are missing values left
#print(combined_mar.isnull().sum())


# Create dataframe per hour
#mar_per_hour = combined_mar.groupby(['month', 'day', 'hour', 'description']).mean()
#mar_per_hour = mar_per_hour.reset_index()
#print(mar_per_hour.head())

# Create dataframe per day
#combined_mar.drop('hour', axis=1, inplace=True)
#mar_per_day = combined_mar.groupby(['month', 'day', 'description']).mean()
#mar_per_day = mar_per_day.reset_index()
#print(mar_per_day.head())

# Create dataframe per month
#combined_mar.drop('day', axis=1, inplace=True)
#mar_per_month = combined_mar.groupby(['month', 'description']).mean()
#mar_per_month = mar_per_month.reset_index()
#print(mar_per_month.head())

#del combined_mar

Unnamed: 0,#object_id,description,result_timestamp,lamax,lamax_unit,laeq,laeq_unit,lceq,lceq_unit,lcpeak,lcpeak_unit,month,day,hour
0,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:00.462,60.5,dB(A),57.9,dB(A),63.36,dB(C),76.57,dB(C),3,1,0
1,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:01.462,54.1,dB(A),53.2,dB(A),61.86,dB(C),74.52,dB(C),3,1,0
2,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:02.462,61.4,dB(A),57.5,dB(A),64.12,dB(C),76.46,dB(C),3,1,0
3,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:03.462,61.3,dB(A),59.1,dB(A),64.68,dB(C),76.67,dB(C),3,1,0
4,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:04.462,61.1,dB(A),58.4,dB(A),64.53,dB(C),77.07,dB(C),3,1,0


- April

In [26]:
# Combining the datasets 
#combined_apr = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_apr['result_timestamp'] = pd.to_datetime(combined_apr['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_apr['month'] = combined_apr['result_timestamp'].dt.month
#combined_apr['day'] = combined_apr['result_timestamp'].dt.day
#combined_apr['hour'] = combined_apr['result_timestamp'].dt.hour
#combined_apr['minute'] = combined_apr['result_timestamp'].dt.minute

#combined_apr.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_apr.columns) - set(columns_to_keep)
#combined_apr.drop(columns=columns_to_drop, inplace=True)
#combined_apr.head()


# check for missing values in each column
#print(combined_apr.isnull().sum() / len(combined_apr))


# forward fill missing values 
#combined_apr.ffill(inplace=True)

# check whether there are missing values left
#print(combined_apr.isnull().sum())


# Create dataframe per hour
#apr_per_hour = combined_apr.groupby(['month', 'day', 'hour', 'description']).mean()
#apr_per_hour = apr_per_hour.reset_index()
#print(apr_per_hour.head())

# Create dataframe per day
#combined_apr.drop('hour', axis=1, inplace=True)
#apr_per_day = combined_apr.groupby(['month', 'day', 'description']).mean()
#apr_per_day = apr_per_day.reset_index()
#print(apr_per_day.head())

# Create dataframe per month
#combined_apr.drop('day', axis=1, inplace=True)
#apr_per_month = combined_apr.groupby(['month', 'description']).mean()
#apr_per_month = apr_per_month.reset_index()
#print(apr_per_month.head())

#del combined_apr

Unnamed: 0,#object_id,description,result_timestamp,lamax,lamax_unit,laeq,laeq_unit,lceq,lceq_unit,lcpeak,lcpeak_unit,month,day,hour
0,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:00.520,62.5,dB(A),59.3,dB(A),64.56,dB(C),78.13,dB(C),4,1,0
1,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:01.520,63.3,dB(A),61.3,dB(A),65.35,dB(C),78.09,dB(C),4,1,0
2,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:02.520,61.4,dB(A),59.1,dB(A),64.18,dB(C),76.63,dB(C),4,1,0
3,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:03.510,58.9,dB(A),56.6,dB(A),63.58,dB(C),75.74,dB(C),4,1,0
4,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:04.510,59.6,dB(A),57.2,dB(A),63.32,dB(C),77.86,dB(C),4,1,0


- May

In [2]:
# Combining the datasets 
#combined_may = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_may['result_timestamp'] = pd.to_datetime(combined_may['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_may['month'] = combined_may['result_timestamp'].dt.month
#combined_may['day'] = combined_may['result_timestamp'].dt.day
#combined_may['hour'] = combined_may['result_timestamp'].dt.hour
#combined_may['minute'] = combined_may['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_may.columns) - set(columns_to_keep)
#combined_may.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_may.isnull().sum())


# forward fill missing values 
#combined_may.ffill(inplace=True)

# check whether there are missing values left
#print(combined_may.isnull().sum())


# Create dataframe per hour
#may_per_hour = combined_may.groupby(['month', 'day', 'hour', 'description']).mean()
#may_per_hour = may_per_hour.reset_index()
#print(may_per_hour.head())

# Create dataframe per day
#combined_may.drop('hour', axis=1, inplace=True)
#may_per_day = combined_may.groupby(['month', 'day', 'description']).mean()
#may_per_day = may_per_day.reset_index()
#print(may_per_day.head())

# Create dataframe per month
#combined_may.drop('day', axis=1, inplace=True)
#may_per_month = combined_may.groupby(['month', 'description']).mean()
#may_per_month = may_per_month.reset_index()
#print(may_per_month.head())

#del combined_may

description     0
lamax          11
laeq           11
month           0
day             0
hour            0
dtype: int64


- June

In [5]:
# Combining the datasets 
#combined_jun = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_jun['result_timestamp'] = pd.to_datetime(combined_jun['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_jun['month'] = combined_jun['result_timestamp'].dt.month
#combined_jun['day'] = combined_jun['result_timestamp'].dt.day
#combined_jun['hour'] = combined_jun['result_timestamp'].dt.hour
#combined_jun['minute'] = combined_jun['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_jun.columns) - set(columns_to_keep)
#combined_jun.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_jun.isnull().sum())


# Create dataframe per hour
#jun_per_hour = combined_jun.groupby(['month', 'day', 'hour', 'description']).mean()
#jun_per_hour = jun_per_hour.reset_index()
#print(jun_per_hour.head())

# Create dataframe per day
#combined_jun.drop('hour', axis=1, inplace=True)
#jun_per_day = combined_jun.groupby(['month', 'day', 'description']).mean()
#jun_per_day = jun_per_day.reset_index()
#print(jun_per_day.head())

# Create dataframe per month
#combined_jun.drop('day', axis=1, inplace=True)
#jun_per_month = combined_jun.groupby(['month', 'description']).mean()
#jun_per_month = jun_per_month.reset_index()
#print(jun_per_month.head())

#del combined_jun

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


- July

In [7]:
# Combining the datasets for July
#combined_jul = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_jul['result_timestamp'] = pd.to_datetime(combined_jul['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_jul['month'] = combined_jul['result_timestamp'].dt.month
#combined_jul['day'] = combined_jul['result_timestamp'].dt.day
#combined_jul['hour'] = combined_jul['result_timestamp'].dt.hour
#combined_jul['minute'] = combined_jul['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_jul.columns) - set(columns_to_keep)
#combined_jul.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_jul.isnull().sum())


# forward fill missing values 
#combined_jul.ffill(inplace=True)

# check whether there are missing values left
#print(combined_jul.isnull().sum())


# Create dataframe per hour
#jul_per_hour = combined_jul.groupby(['month', 'day', 'hour', 'description']).mean()
#jul_per_hour = jul_per_hour.reset_index()
#print(jul_per_hour.head())

# Create dataframe per day
#combined_jul.drop('hour', axis=1, inplace=True)
#jul_per_day = combined_jul.groupby(['month', 'day', 'description']).mean()
#jul_per_day = jul_per_day.reset_index()
#print(jul_per_day.head())

# Create dataframe per month
#combined_jul.drop('day', axis=1, inplace=True)
#jul_per_month = combined_jul.groupby(['month', 'description']).mean()
#jul_per_month = jul_per_month.reset_index()
#print(jul_per_month.head())

#del combined_jul

description    0
lamax          5
laeq           5
month          0
day            0
hour           0
dtype: int64


- August

In [10]:
# Combining the datasets for August
#combined_aug = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_aug['result_timestamp'] = pd.to_datetime(combined_aug['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_aug['month'] = combined_aug['result_timestamp'].dt.month
#combined_aug['day'] = combined_aug['result_timestamp'].dt.day
#combined_aug['hour'] = combined_aug['result_timestamp'].dt.hour
#combined_aug['minute'] = combined_aug['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_aug.columns) - set(columns_to_keep)
#combined_aug.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_aug.isnull().sum())


# Create dataframe per hour
#aug_per_hour = combined_aug.groupby(['month', 'day', 'hour', 'description']).mean()
#aug_per_hour = aug_per_hour.reset_index()
#print(aug_per_hour.head())

# Create dataframe per day
#combined_aug.drop('hour', axis=1, inplace=True)
#aug_per_day = combined_aug.groupby(['month', 'day', 'description']).mean()
#aug_per_day = aug_per_day.reset_index()
#print(aug_per_day.head())

# Create dataframe per month
#combined_aug.drop('day', axis=1, inplace=True)
#aug_per_month = combined_aug.groupby(['month', 'description']).mean()
#aug_per_month = aug_per_month.reset_index()
#print(aug_per_month.head())

#del combined_aug

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


- September

In [12]:
'''
# Combining the datasets 
combined_sep = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_sep['result_timestamp'] = pd.to_datetime(combined_sep['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_sep['month'] = combined_sep['result_timestamp'].dt.month
combined_sep['day'] = combined_sep['result_timestamp'].dt.day
combined_sep['hour'] = combined_sep['result_timestamp'].dt.hour
#combined_sep['minute'] = combined_sep['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_sep.columns) - set(columns_to_keep)
combined_sep.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_sep.isnull().sum())


# forward fill missing values 
combined_sep.ffill(inplace=True)

# check whether there are missing values left
print(combined_sep.isnull().sum())


# Create dataframe per hour
sep_per_hour = combined_sep.groupby(['month', 'day', 'hour', 'description']).mean()
sep_per_hour = sep_per_hour.reset_index()
print(sep_per_hour.head())

# Create dataframe per day
combined_sep.drop('hour', axis=1, inplace=True)
sep_per_day = combined_sep.groupby(['month', 'day', 'description']).mean()
sep_per_day = sep_per_day.reset_index()
print(sep_per_day.head())

# Create dataframe per month
combined_sep.drop('day', axis=1, inplace=True)
sep_per_month = combined_sep.groupby(['month', 'description']).mean()
sep_per_month = sep_per_month.reset_index()
print(sep_per_month.head())

del combined_sep
'''

description     0
lamax           2
laeq           12
month           0
day             0
hour            0
dtype: int64


- October

In [15]:
'''
# Combining the datasets 
combined_oct = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_oct['result_timestamp'] = pd.to_datetime(combined_oct['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_oct['month'] = combined_oct['result_timestamp'].dt.month
combined_oct['day'] = combined_oct['result_timestamp'].dt.day
combined_oct['hour'] = combined_oct['result_timestamp'].dt.hour
#combined_oct['minute'] = combined_oct['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_oct.columns) - set(columns_to_keep)
combined_oct.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_oct.isnull().sum())


# forward fill missing values 
combined_oct.ffill(inplace=True)

# check whether there are missing values left
print(combined_oct.isnull().sum())


# Create dataframe per hour
oct_per_hour = combined_oct.groupby(['month', 'day', 'hour', 'description']).mean()
oct_per_hour = oct_per_hour.reset_index()
print(oct_per_hour.head())

# Create dataframe per day
combined_oct.drop('hour', axis=1, inplace=True)
oct_per_day = combined_oct.groupby(['month', 'day', 'description']).mean()
oct_per_day = oct_per_day.reset_index()
print(oct_per_day.head())

# Create dataframe per month
combined_oct.drop('day', axis=1, inplace=True)
oct_per_month = combined_oct.groupby(['month', 'description']).mean()
oct_per_month = oct_per_month.reset_index()
print(oct_per_month.head())

del combined_oct
'''

description     0
lamax          10
laeq           10
month           0
day             0
hour            0
dtype: int64


- November

In [18]:
'''
# Combining the datasets 
combined_nov = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_nov['result_timestamp'] = pd.to_datetime(combined_nov['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_nov['month'] = combined_nov['result_timestamp'].dt.month
combined_nov['day'] = combined_nov['result_timestamp'].dt.day
combined_nov['hour'] = combined_nov['result_timestamp'].dt.hour
#combined_nov['minute'] = combined_nov['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_nov.columns) - set(columns_to_keep)
combined_nov.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_nov.isnull().sum())


# Create dataframe per hour
nov_per_hour = combined_nov.groupby(['month', 'day', 'hour', 'description']).mean()
nov_per_hour = nov_per_hour.reset_index()
print(nov_per_hour.head())

# Create dataframe per day
combined_nov.drop('hour', axis=1, inplace=True)
nov_per_day = combined_nov.groupby(['month', 'day', 'description']).mean()
nov_per_day = nov_per_day.reset_index()
print(nov_per_day.head())

# Create dataframe per month
combined_nov.drop('day', axis=1, inplace=True)
nov_per_month = combined_nov.groupby(['month', 'description']).mean()
nov_per_month = nov_per_month.reset_index()
print(nov_per_month.head())

del combined_nov
'''

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


- December

In [20]:
'''

# Combining the datasets 
combined_dec = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_dec['result_timestamp'] = pd.to_datetime(combined_dec['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_dec['month'] = combined_dec['result_timestamp'].dt.month
combined_dec['day'] = combined_dec['result_timestamp'].dt.day
combined_dec['hour'] = combined_dec['result_timestamp'].dt.hour
#combined_dec['minute'] = combined_dec['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_dec.columns) - set(columns_to_keep)
combined_dec.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_dec.isnull().sum())


# Create dataframe per hour
dec_per_hour = combined_dec.groupby(['month', 'day', 'hour', 'description']).mean()
dec_per_hour = dec_per_hour.reset_index()
print(dec_per_hour.head())

# Create dataframe per day
combined_dec.drop('hour', axis=1, inplace=True)
dec_per_day = combined_dec.groupby(['month', 'day', 'description']).mean()
dec_per_day = dec_per_day.reset_index()
print(dec_per_day.head())

# Create dataframe per month
combined_dec.drop('day', axis=1, inplace=True)
dec_per_month = combined_dec.groupby(['month', 'description']).mean()
dec_per_month = dec_per_month.reset_index()
print(dec_per_month.head())

del combined_dec
'''

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


Now that the preprocessed noise and weather dataframes for 2022 have been exported, we can load those exported files directly instead of re-loading all 112 files from the S3 bucket on every run, which takes a long time.

## OLD PREPROCESSING

### Reading in the data from the S3 bucket (don't forget to pip install boto3)

In [None]:
# # meteo data
# Q1_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q1.csv')
# Q2_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q2.csv')
# Q3_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q3.csv')
# Q4_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q4.csv')

In [None]:
# REMARK: this is the 'old' noise data, don't run this

# noise data
# exp40_naamse35 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# exp40_naamse57 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# exp40_naamse62 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# exp40_calvarie = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# exp40_naamse81 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# exp40_park = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# exp40_kiosk = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# exp40_vrijt = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# exp40_his = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_303910_mp-04-his-hears.csv', header=0, sep=';')

# exp41_naamse35 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# exp41_naamse57 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# exp41_naamse62 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# exp41_calvarie = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# exp41_naamse81 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# exp41_park = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# exp41_kiosk = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# exp41_vrijt = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# exp41_his = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_303910_mp-04-his-hears.csv', header=0, sep=';')

# exp42_naamse35 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# exp42_naamse57 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# exp42_naamse62 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# exp42_calvarie = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# exp42_naamse81 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# exp42_park = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# exp42_kiosk = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# exp42_vrijt = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# exp42_his = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_303910_mp-04-his-hears.csv', header=0, sep=';')


In [None]:
# noise_columns = ["#object_id", "description", "result_timestamp", "lamax", "laeq"]
# naamse35_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';', usecols=noise_columns)

In [None]:
# # updated noise data - January
# naamse35_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - February
# naamse35_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - March
# naamse35_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - April
# naamse35_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - May
# naamse35_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - June
# naamse35_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - July
# naamse35_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - August
# naamse35_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - September
# naamse35_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - October
# naamse35_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - November
# naamse35_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - December
# naamse35_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_303910_mp-04-his-hears.csv', header=0, sep=';')

### Combining and aggregating the meteo data

In [None]:
# # combine meteo dataset 
# meteocombined = pd.concat([Q1_2022, Q2_2022, Q3_2022, Q4_2022], axis=0)
# meteocombined.head()

In [None]:
# check for missing values in each column
# print(meteocombined.isnull().sum())

In [None]:
# # aggregate meteo data by day
# avg_meteo_combined = meteocombined.groupby(['Year','Month', 'Day']).mean()
# avg_meteo_combined = avg_meteo_combined.reset_index()
# avg_meteo_combined.head()


In [None]:
# month_max_value = avg_meteo_combined['Month'].max()
# print(f"This combined meteo dataset contains the weather data for all {month_max_value} months.")

### Combining and aggregating the noise data

- January

In [None]:
# # combine noise data for January together
# noise_jan_combined = pd.concat([naamse35_jan, naamse57_jan, naamse62_jan, calvarie_jan, park_jan, naamse81_jan, kiosk_jan, vrijt_jan, his_jan], axis=0)
# noise_jan_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_jan_combined['result_timestamp'] = pd.to_datetime(noise_jan_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_jan_combined['result_date'] = noise_jan_combined['result_timestamp'].dt.date
# noise_jan_combined['result_month'] = noise_jan_combined['result_timestamp'].dt.month
# noise_jan_combined['result_day'] = noise_jan_combined['result_timestamp'].dt.day
# noise_jan_combined['result_hour'] = noise_jan_combined['result_timestamp'].dt.hour
# noise_jan_combined['result_minute'] = noise_jan_combined['result_timestamp'].dt.minute

# noise_jan_combined.head()

In [None]:
# # aggregate the data by day
# avg_jan_combined = noise_jan_combined.groupby(['result_date','description']).mean()
# avg_jan_combined = avg_jan_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_jan_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_jan_combined.head(150)

In [None]:
# check for missing values in each column
# print(avg_jan_combined.isnull().sum())

- February

In [None]:
# # combine noise data for February together
# noise_feb_combined = pd.concat([naamse35_feb, naamse57_feb, naamse62_feb, calvarie_feb, park_feb, naamse81_feb, kiosk_feb, vrijt_feb, his_feb], axis=0)
# noise_feb_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_feb_combined['result_timestamp'] = pd.to_datetime(noise_feb_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_feb_combined['result_date'] = noise_feb_combined['result_timestamp'].dt.date
# noise_feb_combined['result_month'] = noise_feb_combined['result_timestamp'].dt.month
# noise_feb_combined['result_day'] = noise_feb_combined['result_timestamp'].dt.day
# noise_feb_combined['result_hour'] = noise_feb_combined['result_timestamp'].dt.hour
# noise_feb_combined['result_minute'] = noise_feb_combined['result_timestamp'].dt.minute

# noise_feb_combined.head()

In [None]:
# # aggregate the data by day
# avg_feb_combined = noise_feb_combined.groupby(['result_date','description']).mean()
# avg_feb_combined = avg_feb_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_feb_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_feb_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_feb_combined.isnull().sum())

- March

In [None]:
# # combine noise data for March together
# noise_mar_combined = pd.concat([naamse35_mar, naamse57_mar, naamse62_mar, calvarie_mar, park_mar, naamse81_mar, kiosk_mar, vrijt_mar, his_mar], axis=0)
# noise_mar_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_mar_combined['result_timestamp'] = pd.to_datetime(noise_mar_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_mar_combined['result_date'] = noise_mar_combined['result_timestamp'].dt.date
# noise_mar_combined['result_month'] = noise_mar_combined['result_timestamp'].dt.month
# noise_mar_combined['result_day'] = noise_mar_combined['result_timestamp'].dt.day
# noise_mar_combined['result_hour'] = noise_mar_combined['result_timestamp'].dt.hour
# noise_mar_combined['result_minute'] = noise_mar_combined['result_timestamp'].dt.minute

# noise_mar_combined.head()

In [None]:
# # aggregate the data by day
# avg_mar_combined = noise_mar_combined.groupby(['result_date','description']).mean()
# avg_mar_combined = avg_mar_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_mar_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_mar_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_mar_combined.isnull().sum())

- April 

In [None]:
# # combine noise data for April together
# noise_apr_combined = pd.concat([naamse35_apr, naamse57_apr, naamse62_apr, calvarie_apr, park_apr, naamse81_apr, kiosk_apr, vrijt_apr, his_apr], axis=0)
# noise_apr_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_apr_combined['result_timestamp'] = pd.to_datetime(noise_apr_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_apr_combined['result_date'] = noise_apr_combined['result_timestamp'].dt.date
# noise_apr_combined['result_month'] = noise_apr_combined['result_timestamp'].dt.month
# noise_apr_combined['result_day'] = noise_apr_combined['result_timestamp'].dt.day
# noise_apr_combined['result_hour'] = noise_apr_combined['result_timestamp'].dt.hour
# noise_apr_combined['result_minute'] = noise_apr_combined['result_timestamp'].dt.minute

# noise_apr_combined.head()

In [None]:
# # aggregate the data by day
# avg_apr_combined = noise_apr_combined.groupby(['result_date','description']).mean()
# avg_apr_combined = avg_apr_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_apr_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_apr_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_apr_combined.isnull().sum())

- May 

In [None]:
# # combine noise data for May together
# noise_may_combined = pd.concat([naamse35_may, naamse57_may, naamse62_may, calvarie_may, park_may, naamse81_may, kiosk_may, vrijt_may, his_may], axis=0)
# noise_may_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_may_combined['result_timestamp'] = pd.to_datetime(noise_may_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_may_combined['result_date'] = noise_may_combined['result_timestamp'].dt.date
# noise_may_combined['result_month'] = noise_may_combined['result_timestamp'].dt.month
# noise_may_combined['result_day'] = noise_may_combined['result_timestamp'].dt.day
# noise_may_combined['result_hour'] = noise_may_combined['result_timestamp'].dt.hour
# noise_may_combined['result_minute'] = noise_may_combined['result_timestamp'].dt.minute

# noise_may_combined.head()

In [None]:
# # aggregate the data by day
# avg_may_combined = noise_may_combined.groupby(['result_date','description']).mean()
# avg_may_combined = avg_may_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_may_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_may_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_may_combined.isnull().sum())

- June

In [None]:
# # combine noise data for June together
# noise_jun_combined = pd.concat([naamse35_jun, naamse57_jun, naamse62_jun, calvarie_jun, park_jun, naamse81_jun, kiosk_jun, vrijt_jun, his_jun], axis=0)
# noise_jun_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_jun_combined['result_timestamp'] = pd.to_datetime(noise_jun_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_jun_combined['result_date'] = noise_jun_combined['result_timestamp'].dt.date
# noise_jun_combined['result_month'] = noise_jun_combined['result_timestamp'].dt.month
# noise_jun_combined['result_day'] = noise_jun_combined['result_timestamp'].dt.day
# noise_jun_combined['result_hour'] = noise_jun_combined['result_timestamp'].dt.hour
# noise_jun_combined['result_minute'] = noise_jun_combined['result_timestamp'].dt.minute

# noise_jun_combined.head()

In [None]:
# aggregate the data by day
# avg_jun_combined = noise_jun_combined.groupby(['result_date','description']).mean()
# avg_jun_combined = avg_jun_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_jun_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_jun_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_jun_combined.isnull().sum())

- July

In [None]:
# # combine noise data for July together
# noise_jul_combined = pd.concat([naamse35_jul, naamse57_jul, naamse62_jul, calvarie_jul, park_jul, naamse81_jul, kiosk_jul, vrijt_jul, his_jul], axis=0)
# noise_jul_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_jul_combined['result_timestamp'] = pd.to_datetime(noise_jul_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_jul_combined['result_date'] = noise_jul_combined['result_timestamp'].dt.date
# noise_jul_combined['result_month'] = noise_jul_combined['result_timestamp'].dt.month
# noise_jul_combined['result_day'] = noise_jul_combined['result_timestamp'].dt.day
# noise_jul_combined['result_hour'] = noise_jul_combined['result_timestamp'].dt.hour
# noise_jul_combined['result_minute'] = noise_jul_combined['result_timestamp'].dt.minute

# noise_jul_combined.head()

In [None]:
# # aggregate the data by day
# avg_jul_combined = noise_jul_combined.groupby(['result_date','description']).mean()
# avg_jul_combined = avg_jul_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_jul_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_jul_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_jul_combined.isnull().sum())

- August

In [None]:
# # combine noise data for August together
# noise_aug_combined = pd.concat([naamse35_aug, naamse57_aug, naamse62_aug, calvarie_aug, park_aug, naamse81_aug, kiosk_aug, vrijt_aug, his_aug], axis=0)
# noise_aug_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_aug_combined['result_timestamp'] = pd.to_datetime(noise_aug_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_aug_combined['result_date'] = noise_aug_combined['result_timestamp'].dt.date
# noise_aug_combined['result_month'] = noise_aug_combined['result_timestamp'].dt.month
# noise_aug_combined['result_day'] = noise_aug_combined['result_timestamp'].dt.day
# noise_aug_combined['result_hour'] = noise_aug_combined['result_timestamp'].dt.hour
# noise_aug_combined['result_minute'] = noise_aug_combined['result_timestamp'].dt.minute

# noise_aug_combined.head()

In [None]:
# # aggregate the data by day
# avg_aug_combined = noise_aug_combined.groupby(['result_date','description']).mean()
# avg_aug_combined = avg_aug_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_aug_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_aug_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_aug_combined.isnull().sum())

- September

In [None]:
# # combine noise data for September together
# noise_sep_combined = pd.concat([naamse35_sep, naamse57_sep, naamse62_sep, calvarie_sep, park_sep, naamse81_sep, kiosk_sep, vrijt_sep, his_sep], axis=0)
# noise_sep_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_sep_combined['result_timestamp'] = pd.to_datetime(noise_sep_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_sep_combined['result_date'] = noise_sep_combined['result_timestamp'].dt.date
# noise_sep_combined['result_month'] = noise_sep_combined['result_timestamp'].dt.month
# noise_sep_combined['result_day'] = noise_sep_combined['result_timestamp'].dt.day
# noise_sep_combined['result_hour'] = noise_sep_combined['result_timestamp'].dt.hour
# noise_sep_combined['result_minute'] = noise_sep_combined['result_timestamp'].dt.minute

# noise_sep_combined.head()

In [None]:
# # aggregate the data by day
# avg_sep_combined = noise_sep_combined.groupby(['result_date','description']).mean()
# avg_sep_combined = avg_sep_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_sep_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_sep_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_sep_combined.isnull().sum())

- October

In [None]:
# # combine noise data for October together
# noise_oct_combined = pd.concat([naamse35_oct, naamse57_oct, naamse62_oct, calvarie_oct, park_oct, naamse81_oct, kiosk_oct, vrijt_oct, his_oct], axis=0)
# noise_oct_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_oct_combined['result_timestamp'] = pd.to_datetime(noise_oct_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_oct_combined['result_date'] = noise_oct_combined['result_timestamp'].dt.date
# noise_oct_combined['result_month'] = noise_oct_combined['result_timestamp'].dt.month
# noise_oct_combined['result_day'] = noise_oct_combined['result_timestamp'].dt.day
# noise_oct_combined['result_hour'] = noise_oct_combined['result_timestamp'].dt.hour
# noise_oct_combined['result_minute'] = noise_oct_combined['result_timestamp'].dt.minute

# noise_oct_combined.head()

In [None]:
# # aggregate the data by day
# avg_oct_combined = noise_oct_combined.groupby(['result_date','description']).mean()
# avg_oct_combined = avg_oct_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_oct_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_oct_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_oct_combined.isnull().sum())

- November

In [None]:
# # combine noise data for November together
# noise_nov_combined = pd.concat([naamse35_nov, naamse57_nov, naamse62_nov, calvarie_nov, park_nov, naamse81_nov, kiosk_nov, vrijt_nov, his_nov], axis=0)
# noise_nov_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_nov_combined['result_timestamp'] = pd.to_datetime(noise_nov_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_nov_combined['result_date'] = noise_nov_combined['result_timestamp'].dt.date
# noise_nov_combined['result_month'] = noise_nov_combined['result_timestamp'].dt.month
# noise_nov_combined['result_day'] = noise_nov_combined['result_timestamp'].dt.day
# noise_nov_combined['result_hour'] = noise_nov_combined['result_timestamp'].dt.hour
# noise_nov_combined['result_minute'] = noise_nov_combined['result_timestamp'].dt.minute

# noise_nov_combined.head()

In [None]:
# # aggregate the data by day
# avg_nov_combined = noise_nov_combined.groupby(['result_date','description']).mean()
# avg_nov_combined = avg_nov_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_nov_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_nov_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_nov_combined.isnull().sum())

- December

In [None]:
# # combine noise data for December together
# noise_dec_combined = pd.concat([naamse35_dec, naamse57_dec, naamse62_dec, calvarie_dec, park_dec, naamse81_dec, kiosk_dec, vrijt_dec, his_dec], axis=0)
# noise_dec_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_dec_combined['result_timestamp'] = pd.to_datetime(noise_dec_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_dec_combined['result_date'] = noise_dec_combined['result_timestamp'].dt.date
# noise_dec_combined['result_month'] = noise_dec_combined['result_timestamp'].dt.month
# noise_dec_combined['result_day'] = noise_dec_combined['result_timestamp'].dt.day
# noise_dec_combined['result_hour'] = noise_dec_combined['result_timestamp'].dt.hour
# noise_dec_combined['result_minute'] = noise_dec_combined['result_timestamp'].dt.minute

# noise_dec_combined.head()

In [None]:
# # aggregate the data by day
# avg_dec_combined = noise_dec_combined.groupby(['result_date','description']).mean()
# avg_dec_combined = avg_dec_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_dec_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_dec_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_dec_combined.isnull().sum())

Combining monthly noise level datasets into a yearly dataset

In [None]:
# The twelve per-month frames, listed in calendar order.
# NOTE(review): each avg_<month>_combined frame is assigned only in the
# per-month aggregation cells of this notebook, which are currently commented
# out -- confirm they are defined before running this cell on a fresh kernel.
datasets = [
    avg_jan_combined, avg_feb_combined, avg_mar_combined, avg_apr_combined,
    avg_may_combined, avg_jun_combined, avg_jul_combined, avg_aug_combined,
    avg_sep_combined, avg_oct_combined, avg_nov_combined, avg_dec_combined,
]

# Stack the months vertically, order the rows by 'result_date' (ascending),
# then renumber the index from 0 so it follows the sorted order.
avg_year_combined = (
    pd.concat(datasets, ignore_index=True)
    .sort_values(by='result_date')
    .reset_index(drop=True)
)

# Show the first 2000 rows of the combined, sorted yearly dataset
avg_year_combined.head(2000)
