# Preprocessing the data

### Loading required packages

In [1]:
import pandas as pd
import numpy as np
import boto3
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
import pytz

## Loading and combining data
### Meteo data

In [36]:
## Building preprocessing pipeline
# Step 1: Concatenate datasets
def concatenate_datasets(dfs):
    return pd.concat(dfs, ignore_index=True)

# Step 2: Convert UTC to CEST
def convert_utc_to_cest(df):
    cest_timezone = pytz.timezone('Europe/Brussels')
    df['DATEUTC'] = df['DATEUTC'].apply(lambda x: pytz.utc.localize(x).astimezone(cest_timezone))
    return df

# Step 2: Drop columns
def drop_columns(df):
    columns_to_keep = ['LC_RAININ', 'LC_DAILYRAIN', 'LC_WINDDIR', 'LC_WINDSPEED', 'LC_TEMP_QCL3', 'Month', 'Day', 'Hour']  #there's less columns we keep than drop
    columns_to_drop = set(df.columns) - set(columns_to_keep)
    return df.drop(columns=columns_to_drop)

# Step 3: Check for percentage of missing values in each column
def print_null_percentage(df):
    null_percentage = df.isnull().sum() / len(df)
    print('The percentage of missing values in each column')
    print(null_percentage)
    return df

# Step 4: Forward fill missing values
def forward_fill(df):
    return df.ffill()

# Step 5: Check whether there are missing values left
def check_missing_values(df):
    missing_values = df.isnull().sum()
    print('Check whether there are missing values left')
    print(missing_values)
    return df

# Step 6: Calculate summary statistics for daily rain sum
def daily_rain_sum(df):
    summary_stats = df['LC_DAILYRAIN'].describe()
    print('Summary statistics for daily rain sum')
    print(summary_stats)
    return df

# Step 7: Calculate fraction of non-zero values in the 'LC_DAILYRAIN' column
def non_zero_fraction(df):
    nonzero_count = np.count_nonzero(df['LC_DAILYRAIN'])
    non_zero_frac = nonzero_count/len(df)
    print("Fraction of non-zero values:", non_zero_frac)
    return df

# Define the pipeline
pipeline_meteo = Pipeline([
    ('concatenate_datasets', FunctionTransformer(concatenate_datasets)),
    ('convert_utc_to_cest', FunctionTransformer(convert_utc_to_cest)),
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('print_null_percentage', FunctionTransformer(print_null_percentage)),
    ('forward_fill', FunctionTransformer(forward_fill)),
    ('check_missing_values', FunctionTransformer(check_missing_values)),
    ('daily_rain_sum', FunctionTransformer(daily_rain_sum)),
    ('non_zero_fraction', FunctionTransformer(non_zero_fraction))
])

In [32]:
# loading meteo data
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
years = ['2022']
base_url_meteo = 'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_{}{}.csv'

dfs = []

for year in years:
    for quarter in quarters:
        url = base_url_meteo.format(year, quarter)
        df = pd.read_csv(url)
        dfs.append(df)

In [38]:
# Apply the pipeline
meteo_combined_df = pipeline_meteo.fit_transform(dfs)
meteo_combined_df.head()

The percentage of missing values in each column
LC_RAININ       0.056770
LC_DAILYRAIN    0.056770
LC_WINDDIR      0.056770
LC_WINDSPEED    0.056770
Month           0.000000
Day             0.000000
Hour            0.000000
LC_TEMP_QCL3    0.062285
dtype: float64
Check whether there are missing values left
LC_RAININ       0
LC_DAILYRAIN    0
LC_WINDDIR      0
LC_WINDSPEED    0
Month           0
Day             0
Hour            0
LC_TEMP_QCL3    0
dtype: int64
Summary statistics for daily rain sum
count    5.546880e+06
mean     1.319783e-03
std      6.177559e-03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.540000e-01
Name: LC_DAILYRAIN, dtype: float64
Fraction of non-zero values: 0.17391488548517364


Unnamed: 0,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,Month,Day,Hour,LC_TEMP_QCL3
0,0.0,0.0,-169.0,0.43,1,1,0,13.048027
1,0.0,0.0,-170.0,0.33,1,1,0,12.985849
2,0.0,0.0,-167.0,0.46,1,1,0,12.950322
3,0.0,0.0,-160.0,0.52,1,1,0,12.94955
4,0.0,0.0,-166.0,0.51,1,1,0,12.952268


In [None]:
'''
# Concatenate all the dataframes into a single dataframe
meteo_combined_df = pd.concat(dfs, ignore_index=True)
meteo_combined_df.head()

del dfs # deleting the individual meteo datasets

# Drop the columns we won't use
columns_to_keep = ['LC_RAININ', 'LC_DAILYRAIN', 'LC_WINDDIR', 'LC_WINDSPEED', 'LC_TEMP_QCL3', 'Month', 'Day', 'Hour']  #there's less columns we keep than drop
columns_to_drop = set(meteo_combined_df.columns) - set(columns_to_keep)
meteo_combined_df.drop(columns=columns_to_drop, inplace=True)
meteo_combined_df.head()

# check for percentage of missing values in each column
print(meteo_combined_df.isnull().sum() / len(meteo_combined_df))

# Forward fill missing values in the original DataFrame
meteo_combined_df.ffill(inplace=True)

# check whether there are missing values left
print(meteo_combined_df.isnull().sum())

# calculate summary statistics for daily rain sum
summary_stats = meteo_combined_df['LC_DAILYRAIN'].describe()
print(summary_stats)

# Count the number of non-zero values in the 'LC_DAILYRAIN' column
nonzero_count = np.count_nonzero(meteo_combined_df['LC_DAILYRAIN'])

# Display the fraction of non-zero values
print("Fraction of non-zero values:", nonzero_count/len(meteo_combined_df))
'''

In [39]:
# Create dataframe per hour

# Specify the aggregation function for each column
  # for LC_DAILYRAIN we take the last value because it's cumulative, for other columns the mean
aggregations = {
    'LC_DAILYRAIN': 'mean',  # Select the last value for 'LC_DAILYRAIN' ###TAKE MEAN FOR NOW TO MAKE THE GRAPHS LOOK OK
    'LC_RAININ': 'mean',  
    'LC_WINDDIR': 'mean',
    'LC_WINDDIR': 'mean', 
    'LC_WINDSPEED': 'mean', 
    'LC_TEMP_QCL3': 'mean'
}

# Perform the groupby aggregation
meteo_per_hour = meteo_combined_df.groupby(['Month', 'Day', 'Hour']).mean()
meteo_per_hour = meteo_per_hour.reset_index()
meteo_per_hour.head()

Unnamed: 0,Month,Day,Hour,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_TEMP_QCL3
0,1,1,0,3e-06,0.00036,-18.197324,0.389565,13.100358
1,1,1,1,7e-06,0.0,-16.227891,0.222602,12.669197
2,1,1,2,9e-06,0.0,-13.710884,0.217194,12.520271
3,1,1,3,0.0,0.0,-16.401361,0.178248,12.386194
4,1,1,4,0.0,0.0,-10.268707,0.23767,12.080706


In [40]:
# Create dataframe per day

# still the same "aggregations" as before
meteo_per_day = meteo_combined_df.groupby(['Month', 'Day']).mean()
meteo_per_day = meteo_per_day.reset_index()
meteo_per_day.head()

Unnamed: 0,Month,Day,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,Hour,LC_TEMP_QCL3
0,1,1,2e-06,4.3e-05,-6.263419,0.324702,11.491857,12.194145
1,1,2,0.000671,0.003387,-28.223994,0.707815,11.5,11.893159
2,1,3,0.000729,0.006724,-34.624291,0.65043,11.5,9.53962
3,1,4,0.000454,0.003578,-25.189413,0.326204,11.5,6.768886
4,1,5,0.000127,0.000255,-43.943098,0.602983,11.5,3.889968


In [41]:
# Create dataframe per month

# still the same "aggregations" as before
meteo_per_month = meteo_combined_df.groupby(['Month']).mean()
meteo_per_month = meteo_per_month.reset_index()
meteo_per_month.head()

Unnamed: 0,Month,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,Day,Hour,LC_TEMP_QCL3
0,1,0.000112,0.001033,-16.334089,0.336535,15.999657,11.499737,4.698144
1,2,0.000131,0.001257,-25.171683,0.740747,14.5,11.5,6.923138
2,3,1e-05,0.000105,12.190264,0.250732,16.0,11.5,8.113201
3,4,5.4e-05,0.000503,3.643447,0.369271,15.500311,11.500246,10.704171
4,5,7.6e-05,0.000653,-9.02224,0.240475,16.0,11.5,15.572697


In [42]:
# delete combined dataframe
del dfs
del meteo_combined_df

In [30]:
# export dataframes (only needs to be ran once so comment it out)
#meteo_per_hour.to_csv('hourly_weatherdata_2022.csv', index=False)
#meteo_per_day.to_csv('daily_weatherdata_2022.csv', index=False)
#meteo_per_month.to_csv('monthly_weatherdata_2022.csv', index=False)

### Noise level data

#### Building the pipelines

In [None]:
## Modelling data pipeline
# Step 1: Concatenate datasets
def concatenate_datasets(dfs):
    return pd.concat(dfs, ignore_index=True)

# Step 2: Convert timestamps to datetime
def convert_to_datetime(df):
    df['result_timestamp'] = pd.to_datetime(df['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
    return df

# Step 3: Extract month, day, hour, minute from timestamps
def extract_month(df):
    df['month'] = df['result_timestamp'].dt.month
    return df
def extract_day(df):
    df['day'] = df['result_timestamp'].dt.day
    return df
def extract_hour(df):
    df['hour'] = df['result_timestamp'].dt.hour
    return df
#def extract_minute(df):
#   df['minute'] = df['result_timestamp'].dt.minute
#   return df

# Step 4: Drop columns
def drop_columns(df):
    columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
    columns_to_drop = set(df.columns) - set(columns_to_keep)
    return df.drop(columns=columns_to_drop)

# Define the pipeline
pipeline_modelling = Pipeline([
    ('concatenate_datasets', FunctionTransformer(concatenate_datasets)),
    ('convert_to_datetime', FunctionTransformer(convert_to_datetime)),
    ('extract_month', FunctionTransformer(extract_month)),
    ('extract_day', FunctionTransformer(extract_day)),
    ('extract_hour', FunctionTransformer(extract_hour)),
    ('drop_columns', FunctionTransformer(drop_columns))
])

In [2]:
## Overall dataset preprocessing pipeline
# Step 1: Concatenate datasets
def concatenate_datasets(dfs):
    return pd.concat(dfs, ignore_index=True)

# Step 2: Convert timestamps to datetime
def convert_to_datetime(df):
    df['result_timestamp'] = pd.to_datetime(df['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
    return df

# Step 3: Extract month, day, hour, minute from timestamps
def extract_month(df):
    df['month'] = df['result_timestamp'].dt.month
    return df
def extract_day(df):
    df['day'] = df['result_timestamp'].dt.day
    return df
def extract_hour(df):
    df['hour'] = df['result_timestamp'].dt.hour
    return df
#def extract_minute(df):
#   df['minute'] = df['result_timestamp'].dt.minute
#   return df

# Step 4: Drop columns
def drop_columns(df):
    columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
    columns_to_drop = set(df.columns) - set(columns_to_keep)
    return df.drop(columns=columns_to_drop)

# Step 5: Forward fill missing values
def forward_fill(df):
    return df.ffill()

# Define the pipeline
pipeline_general = Pipeline([
    ('concatenate_datasets', FunctionTransformer(concatenate_datasets)),
    ('convert_to_datetime', FunctionTransformer(convert_to_datetime)),
    ('extract_month', FunctionTransformer(extract_month)),
    ('extract_day', FunctionTransformer(extract_day)),
    ('extract_hour', FunctionTransformer(extract_hour)),
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('forward_fill', FunctionTransformer(forward_fill))
])

In [3]:
## Hourly pipeline
# Step 6: Perform groupby to create dataframe per hour
def perform_groupby(df):
    return df.groupby(['month', 'day', 'hour', 'description']).mean()

# Step 7: Reset index
def reset_index_func(df):
    return df.reset_index()

# Step 8: Standardize the data
def standardize_columns(df, columns_to_standardize):
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    new_columns = [column + '_standardized' for column in columns_to_standardize]
    df[new_columns] = pd.DataFrame(standardized_values, columns=new_columns)
    return df

# Step 9: Define a custom transformer to create the new column
class DateTransformer:
    def transform(self, df):
        df['year'] = 2022
        df['date'] = df.apply(lambda row: pd.to_datetime(f"{int(row['day']):02d}-{int(row['month']):02d}-{int(row['year']):04d}-{int(row['hour']):02d}", format='%d-%m-%Y-%H'), axis=1)
        df['date'] = df['date'].dt.strftime('%H:%M %d-%m-%Y')
        return df

    def fit(self, df, y=None):
        return self
    
# Step 10: Drop the year column
def drop_year_column(df):
    return df.drop(columns='year')

# Define the pipeline
pipeline_hourly = Pipeline([
    ('groupby', FunctionTransformer(perform_groupby)),
    ('reset_index', FunctionTransformer(reset_index_func)),
    ('standardize_columns', FunctionTransformer(standardize_columns, kw_args={'columns_to_standardize': ['lamax', 'laeq']})),
      ('date_transformer', DateTransformer()),
    ('drop_year_column', FunctionTransformer(drop_year_column))
])

In [4]:
## Daily pipeline
# Step 6: Perform groupby to create dataframe per hour
def perform_groupby(df):
    return df.groupby(['month', 'day', 'description']).mean()

# Step 7: Reset index
def reset_index_func(df):
    return df.reset_index()

# Step 8: Standardize the data
def standardize_columns(df, columns_to_standardize):
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    new_columns = [column + '_standardized' for column in columns_to_standardize]
    df[new_columns] = pd.DataFrame(standardized_values, columns=new_columns)
    return df

# Step 9: Drop unwanted columns
def drop_columns(df):
    return df.drop(columns='hour')

# Step 10: Define a custom transformer to create the new column
class DateTransformer:
    def transform(self, df):
        df['year'] = 2022
        df['date'] = df.apply(lambda row: pd.to_datetime(f"{int(row['day']):02d}-{int(row['month']):02d}-{int(row['year']):04d}", format='%d-%m-%Y'), axis=1)
        df['date'] = df['date'].dt.strftime('%d-%m-%Y')
        return df

    def fit(self, df, y=None):
        return self
    
# Step 11: Drop the year column
def drop_year_column(df):
    return df.drop(columns='year')

# Define the pipeline
pipeline_daily = Pipeline([
    ('groupby', FunctionTransformer(perform_groupby)),
    ('reset_index', FunctionTransformer(reset_index_func)),
    ('standardize_columns', FunctionTransformer(standardize_columns, kw_args={'columns_to_standardize': ['lamax', 'laeq']})),
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('date_transformer', DateTransformer()),
    ('drop_year_column', FunctionTransformer(drop_year_column))
])

In [5]:
## Monthly pipeline
# Step 6: Perform groupby to create dataframe per hour
def perform_groupby(df):
    return df.groupby(['month', 'description']).mean()

# Step 7: Reset index
def reset_index_func(df):
    return df.reset_index()

# Step 8: Standardize the data
def standardize_columns(df, columns_to_standardize):
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    new_columns = [column + '_standardized' for column in columns_to_standardize]
    df[new_columns] = pd.DataFrame(standardized_values, columns=new_columns)
    return df

# Step 9: Drop unwanted columns
def drop_columns(df):
    columns_to_drop = ['day', 'hour']
    return df.drop(columns=columns_to_drop)

# Step 10: Define a custom transformer to create the new column
class DateTransformer:
    def transform(self, df):
        df['year'] = 2022
        df['date'] = df.apply(lambda row: pd.to_datetime(f"{int(row['month']):02d}-{int(row['year']):04d}", format='%m-%Y'), axis=1)
        df['date'] = df['date'].dt.strftime('%b %Y')
        return df

    def fit(self, df, y=None):
        return self
    
# Step 11: Drop the year column
def drop_year_column(df):
    return df.drop(columns='year')

# Define the pipeline
pipeline_monthly = Pipeline([
    ('groupby', FunctionTransformer(perform_groupby)),
    ('reset_index', FunctionTransformer(reset_index_func)),
    ('standardize_columns', FunctionTransformer(standardize_columns, kw_args={'columns_to_standardize': ['lamax', 'laeq']})),
    ('drop_columns', FunctionTransformer(drop_columns)),
    ('date_transformer', DateTransformer()),
    ('drop_year_column', FunctionTransformer(drop_year_column))
])

#### Loading datasets

- January 

In [6]:
# Define a list of URLs
urls_jan = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_jan = []

# Loop through each URL and read the CSV into a DataFrame
for url_jan in urls_jan:
    df_jan = pd.read_csv(url_jan, header=0, sep=';')
    dfs_jan.append(df_jan)

# Now we have a list of DataFrames for each URL called dfs_jan

- February

In [7]:
# Define a list of URLs 
urls_feb = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_feb = []

# Loop through each URL and read the CSV into a DataFrame
for url_feb in urls_feb:
    df_feb = pd.read_csv(url_feb, header=0, sep=';')
    dfs_feb.append(df_feb)

# Now we have a list of DataFrames for each URL called dfs_feb

- March

In [8]:
# Define a list of URLs 
urls_mar = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_mar = []

# Loop through each URL and read the CSV into a DataFrame
for url_mar in urls_mar:
    df_mar = pd.read_csv(url_mar, header=0, sep=';')
    dfs_mar.append(df_mar)

- April

In [9]:
# Define a list of URLs 
urls_apr = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_apr = []

# Loop through each URL and read the CSV into a DataFrame
for url_apr in urls_apr:
    df_apr = pd.read_csv(url_apr, header=0, sep=';')
    dfs_apr.append(df_apr)

- May

In [10]:
# Define a list of URLs 
urls_may = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_may = []

# Loop through each URL and read the CSV into a DataFrame
for url_may in urls_may:
    df_may = pd.read_csv(url_may, header=0, sep=';')
    dfs_may.append(df_may)


- June

In [11]:
# Define a list of URLs 
url_jun = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_jun = []

# Loop through each URL and read the CSV into a DataFrame
for url_jun in url_jun:
    df_jun = pd.read_csv(url_jun, header=0, sep=';')
    dfs_jun.append(df_jun)

- July

In [12]:
# Define a list of URLs 
urls_jul = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_jul = []

# Loop through each URL and read the CSV into a DataFrame
for url_jul in urls_jul:
    df_jul = pd.read_csv(url_jul, header=0, sep=';')
    dfs_jul.append(df_jul)

- August

In [13]:
# Define a list of URLs 
urls_aug = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_aug = []

# Loop through each URL and read the CSV into a DataFrame
for url_aug in urls_aug:
    df_aug = pd.read_csv(url_aug, header=0, sep=';')
    dfs_aug.append(df_aug)

- September

In [14]:
# Define a list of URLs 
urls_sep = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_sep = []

# Loop through each URL and read the CSV into a DataFrame
for url_sep in urls_sep:
    df_sep = pd.read_csv(url_sep, header=0, sep=';')
    dfs_sep.append(df_sep)

- October

In [15]:
# Define a list of URLs 
urls_oct = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_oct = []

# Loop through each URL and read the CSV into a DataFrame
for url_oct in urls_oct:
    df_oct = pd.read_csv(url_oct, header=0, sep=';')
    dfs_oct.append(df_oct)

- November

In [16]:
# Define a list of URLs 
urls_nov = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_nov = []

# Loop through each URL and read the CSV into a DataFrame
for url_nov in urls_nov:
    df_nov = pd.read_csv(url_nov, header=0, sep=';')
    dfs_nov.append(df_nov)

- December

In [17]:
# Define a list of URLs 
urls_dec = [
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255439_mp-01-naamsestraat-35-maxim.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255440_mp-02-naamsestraat-57-xior.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255441_mp-03-naamsestraat-62-taste.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255442_mp-05-calvariekapel-ku-leuven.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255444_mp-07-naamsestraat-81.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255445_mp-08-kiosk-stadspark.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_280324_mp08bis---vrijthof.csv',
    'https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_303910_mp-04-his-hears.csv'
   ]

# Create an empty list to store the DataFrames
dfs_dec = []

# Loop through each URL and read the CSV into a DataFrame
for url_dec in urls_dec:
    df_dec = pd.read_csv(url_dec, header=0, sep=';')
    dfs_dec.append(df_dec)

#### Apply the pipelines

In [19]:
# List of datasets
dfs_2022 = [dfs_jan,dfs_feb,dfs_mar,dfs_apr,dfs_may,dfs_jun,dfs_jul,dfs_aug,dfs_sep,dfs_oct,dfs_nov,dfs_dec]

In [20]:
########### in case kernal crashes again
jan = pd.concat(dfs_jan, ignore_index=True)
feb = pd.concat(dfs_feb, ignore_index=True)
mar = pd.concat(dfs_mar, ignore_index=True)
apr = pd.concat(dfs_apr, ignore_index=True)
may = pd.concat(dfs_may, ignore_index=True)
jun = pd.concat(dfs_jun, ignore_index=True)
jul = pd.concat(dfs_jul, ignore_index=True)
aug = pd.concat(dfs_aug, ignore_index=True)
sep = pd.concat(dfs_sep, ignore_index=True)
oct = pd.concat(dfs_oct, ignore_index=True)
nov = pd.concat(dfs_nov, ignore_index=True)
dec = pd.concat(dfs_dec, ignore_index=True)

In [None]:
#############
dfs_noise = [jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec]
combined_noise = pd.concat(dfs_noise, ignore_index=True)
combined_noise.head(100000)

In [None]:
'''
## Create a full dataframe for later use
# Create an empty list to store the DataFrames
dfs_noise = []

# loop through each list and oncatenate noise data in each month
for df_2022 in dfs_2022:
    df_noise = pd.concat(df_2022, ignore_index=True)
    dfs_noise.append(df_noise)
'''

In [None]:
'''
combined_noise = pd.concat(dfs_noise, ignore_index=True)
combined_noise.head(100000)
'''

In [None]:
## To get data for modelling
data_modelling = pipeline_modelling(combined_noise)

In [21]:
## To get hourly, daily, monthly data
# Apply the pipeline to the overall dataset
transformed_overall_datasets = []
for df_2022 in dfs_2022:
    transformed_overall_dataset = pipeline_general.fit_transform(df_2022)
    transformed_overall_datasets.append(transformed_overall_dataset)

combined = pd.concat(transformed_overall_datasets, ignore_index=True)
combined.head(100000)

Unnamed: 0,description,lamax,laeq,month,day,hour
0,MP 03: Naamsestraat 62 Taste,87.6,82.7,1,1,0
1,MP 03: Naamsestraat 62 Taste,84.5,83.1,1,1,0
2,MP 03: Naamsestraat 62 Taste,84.8,82.7,1,1,0
3,MP 03: Naamsestraat 62 Taste,81.9,79.3,1,1,0
4,MP 03: Naamsestraat 62 Taste,78.3,76.0,1,1,0
...,...,...,...,...,...,...
99995,MP 03: Naamsestraat 62 Taste,45.9,44.7,1,2,3
99996,MP 03: Naamsestraat 62 Taste,46.2,45.4,1,2,3
99997,MP 03: Naamsestraat 62 Taste,46.0,44.6,1,2,3
99998,MP 03: Naamsestraat 62 Taste,45.5,44.7,1,2,3


In [46]:
# Apply the pipeline to the combined and collect hourly data
combined_hourly = pipeline_hourly.fit_transform(combined)

combined_hourly.head(100000)

Unnamed: 0,month,day,hour,description,lamax,laeq,lamax_standardized,laeq_standardized,date
0,1,1,0,MP 03: Naamsestraat 62 Taste,60.322528,57.126833,1.248969,1.044063,00:00 01-01-2022
1,1,1,0,MP 05: Calvariekapel KU Leuven,53.230972,49.987639,0.114661,-0.103638,00:00 01-01-2022
2,1,1,0,MP 06: Parkstraat 2 La Filosovia,53.666056,50.752000,0.184253,0.019241,00:00 01-01-2022
3,1,1,0,MP 07: Naamsestraat 81,50.056861,47.440222,-0.393044,-0.513162,00:00 01-01-2022
4,1,1,1,MP 03: Naamsestraat 62 Taste,53.033583,50.853806,0.083088,0.035608,01:00 01-01-2022
...,...,...,...,...,...,...,...,...,...
55464,12,31,23,MP 04: His & Hears,61.517005,58.995888,1.440028,1.344533,23:00 31-12-2022
55465,12,31,23,MP 05: Calvariekapel KU Leuven,59.659572,56.671964,1.142928,0.970938,23:00 31-12-2022
55466,12,31,23,MP 06: Parkstraat 2 La Filosovia,57.888775,55.366713,0.859685,0.761105,23:00 31-12-2022
55467,12,31,23,MP 07: Naamsestraat 81,55.636732,53.113476,0.499467,0.398874,23:00 31-12-2022


In [42]:
# Apply the pipeline to the combined and collect daily data
combined_daily = pipeline_daily.fit_transform(combined)

combined_daily.head(100000)

Unnamed: 0,month,day,description,lamax,laeq,lamax_standardized,laeq_standardized,date
0,1,1,MP 03: Naamsestraat 62 Taste,51.665242,49.992637,-0.247251,-0.193655,01-01-2022
1,1,1,MP 05: Calvariekapel KU Leuven,48.747476,46.504067,-1.093428,-1.245260,01-01-2022
2,1,1,MP 06: Parkstraat 2 La Filosovia,48.270005,46.007220,-1.231899,-1.395031,01-01-2022
3,1,1,MP 07: Naamsestraat 81,45.908501,44.373056,-1.916755,-1.887638,01-01-2022
4,1,2,MP 03: Naamsestraat 62 Taste,51.407297,50.094018,-0.322058,-0.163094,02-01-2022
...,...,...,...,...,...,...,...,...
2317,12,31,MP 04: His & Hears,55.134973,53.632457,0.759000,0.903544,31-12-2022
2318,12,31,MP 05: Calvariekapel KU Leuven,51.925297,50.133166,-0.171833,-0.151293,31-12-2022
2319,12,31,MP 06: Parkstraat 2 La Filosovia,50.342353,48.683298,-0.630900,-0.588346,31-12-2022
2320,12,31,MP 07: Naamsestraat 81,49.665543,47.970367,-0.827181,-0.803254,31-12-2022


In [40]:
# Apply the pipeline to the combined and collect monthly data
combined_monthly = pipeline_monthly.fit_transform(combined)

combined_monthly.head(100000)

Unnamed: 0,month,description,lamax,laeq,lamax_standardized,laeq_standardized,date
0,1,MP 03: Naamsestraat 62 Taste,53.239009,51.727544,0.238516,0.391739,Jan 2022
1,1,MP 05: Calvariekapel KU Leuven,50.374792,48.560197,-0.780260,-0.796668,Jan 2022
2,1,MP 06: Parkstraat 2 La Filosovia,50.086348,48.274795,-0.882857,-0.903753,Jan 2022
3,1,MP 07: Naamsestraat 81,48.800443,47.515371,-1.340242,-1.188693,Jan 2022
4,2,MP 01: Naamsestraat 35 Maxim,57.287668,55.245502,1.678588,1.711697,Feb 2022
...,...,...,...,...,...,...,...
77,12,MP 04: His & Hears,54.776651,53.133224,0.785442,0.919158,Dec 2022
78,12,MP 05: Calvariekapel KU Leuven,52.407326,50.394638,-0.057306,-0.108375,Dec 2022
79,12,MP 06: Parkstraat 2 La Filosovia,51.694696,49.915926,-0.310782,-0.287991,Dec 2022
80,12,MP 07: Naamsestraat 81,50.699741,49.103125,-0.664678,-0.592958,Dec 2022


In [47]:
# check whether there are missing values left
print(combined_hourly.isnull().sum())
print(combined_daily.isnull().sum())
print(combined_monthly.isnull().sum())

month                 0
day                   0
hour                  0
description           0
lamax                 0
laeq                  0
lamax_standardized    0
laeq_standardized     0
date                  0
dtype: int64
month                 0
day                   0
description           0
lamax                 0
laeq                  0
lamax_standardized    0
laeq_standardized     0
date                  0
dtype: int64
month                 0
description           0
lamax                 0
laeq                  0
lamax_standardized    0
laeq_standardized     0
date                  0
dtype: int64


In [49]:
# exporting file (only needs to be run one time so comment it out)
combined_hourly.to_csv('hourly_noisedata_2022.csv', index=False)  
combined_daily.to_csv('daily_noisedata_2022.csv', index=False) 
combined_monthly.to_csv('monthly_noisedata_2022.csv', index=False) 

In [None]:
# Apply the pipeline to get modelling dataset
data_modelling = pipeline_modelling.fit_transform(combined)

In [15]:
# Delete the separate dataframes to minimize memory usage
del combined_hourly
del combined_daily
del combined_monthly

NameError: name 'combined_hourly' is not defined

### Noise events

In [50]:
# Load the events data
event_mp01 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255439_mp-01-naamsestraat-35-maxim.csv',delimiter=';')
event_mp02 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255440_mp-02-naamsestraat-57-xior.csv',delimiter=';')
event_mp03 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255441_mp-03-naamsestraat-62-taste.csv',delimiter=';')
event_mp04 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_303910_mp-04-his-hears.csv',delimiter=';')
event_mp05 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255442_mp-05-calvariekapel-ku-leuven.csv',delimiter=';')
event_mp06 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255443_mp-06-parkstraat-2-la-filosovia.csv',delimiter=';')
event_mp07 = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255444_mp-07-naamsestraat-81.csv',delimiter=';')
event_mp08stadspark = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_255445_mp-08-kiosk-stadspark.csv',delimiter=';')
event_mp08Vrijthof = pd.read_csv('/Users/tianying/Documents/Statistics and Data Science/Course/Modern Data Analytics/MDA project/MDA-Georgia/Data/csv_results_41_280324_mp08bis---vrijthof.csv',delimiter=';')

In [51]:
# Concatenate the events data
events = [event_mp01,event_mp02,event_mp03,event_mp04,event_mp05,event_mp06,event_mp07,event_mp08Vrijthof]
combined_event = pd.concat(events, ignore_index=True)

combined_event.head()

In [None]:
# Join the event data with noise data
merged_noise_event = pd.merge(combined_noise, combined_event, on=['description','result_timestamp'],  how='left')

#### Old preprocessing

- Jan

In [1]:
# Combining the datasets for January
#combined_jan = pd.concat(dfs, ignore_index=True)
#print(combined_jan.head())

#del dfs # deleting the separate dataframes to minimize memory usage


# extract the month, day, hour, minute of "result_timestamp"
#combined_jan['result_timestamp'] = pd.to_datetime(combined_jan['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_jan['month'] = combined_jan['result_timestamp'].dt.month
#combined_jan['day'] = combined_jan['result_timestamp'].dt.day
#combined_jan['hour'] = combined_jan['result_timestamp'].dt.hour
#combined_jan['minute'] = combined_jan['result_timestamp'].dt.minute

#combined_jan.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_jan.columns) - set(columns_to_keep)
#combined_jan.drop(columns=columns_to_drop, inplace=True)
#combined_jan.head()


# check for missing values in each column
#print(combined_jan.isnull().sum())


# forward fill missing values 
#combined_jan.ffill(inplace=True)

# check whether there are missing values left
#print(combined_jan.isnull().sum())


# Create dataframe per hour
#jan_per_hour = combined_jan.groupby(['month', 'day', 'hour', 'description']).mean()
#jan_per_hour = jan_per_hour.reset_index()
#jan_per_hour.head()


# Create dataframe per day
#combined_jan.drop('hour', axis=1, inplace=True)
#jan_per_day= combined_jan.groupby(['month', 'day', 'description']).mean()
#jan_per_day = jan_per_day.reset_index()
#jan_per_day.head()


# Create dataframe per month
#combined_jan.drop('day', axis=1, inplace=True)
#jan_per_month = combined_jan.groupby(['month', 'description']).mean()
#jan_per_month = jan_per_month.reset_index()
#jan_per_month.head()


#del combined_jan

- Feb

In [2]:
# Combining the datasets for February
#combined_feb = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_feb['result_timestamp'] = pd.to_datetime(combined_feb['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_feb['month'] = combined_feb['result_timestamp'].dt.month
#combined_feb['day'] = combined_feb['result_timestamp'].dt.day
#combined_feb['hour'] = combined_feb['result_timestamp'].dt.hour
#combined_feb['minute'] = combined_feb['result_timestamp'].dt.minute

#combined_feb.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_feb.columns) - set(columns_to_keep)
#combined_feb.drop(columns=columns_to_drop, inplace=True)
#combined_feb.head()


# check for missing values in each column
#print(combined_feb.isnull().sum())


# Create dataframe per hour
#feb_per_hour = combined_feb.groupby(['month', 'day', 'hour', 'description']).mean()
#feb_per_hour = feb_per_hour.reset_index()
#print(feb_per_hour.head())

# Create dataframe per day
#combined_feb.drop('hour', axis=1, inplace=True)
#feb_per_day = combined_feb.groupby(['month', 'day', 'description']).mean()
#feb_per_day = feb_per_day.reset_index()
#print(feb_per_day.head())

# Create dataframe per month
#combined_feb.drop('day', axis=1, inplace=True)
#feb_per_month = combined_feb.groupby(['month', 'description']).mean()
#feb_per_month = feb_per_month.reset_index()
#print(feb_per_month.head())

#del combined_feb

- March

In [20]:
# Combining the datasets for March
#combined_mar = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_mar['result_timestamp'] = pd.to_datetime(combined_mar['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_mar['month'] = combined_mar['result_timestamp'].dt.month
#combined_mar['day'] = combined_mar['result_timestamp'].dt.day
#combined_mar['hour'] = combined_mar['result_timestamp'].dt.hour
#combined_mar['minute'] = combined_mar['result_timestamp'].dt.minute

#combined_mar.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_mar.columns) - set(columns_to_keep)
#combined_mar.drop(columns=columns_to_drop, inplace=True)
#combined_mar.head()


# check for missing values in each column
#print(combined_mar.isnull().sum())


# forward fill missing values 
#combined_mar.ffill(inplace=True)

# check whether there are missing values left
#print(combined_mar.isnull().sum())


# Create dataframe per hour
#mar_per_hour = combined_mar.groupby(['month', 'day', 'hour', 'description']).mean()
#mar_per_hour = mar_per_hour.reset_index()
#print(mar_per_hour.head())

# Create dataframe per day
#combined_mar.drop('hour', axis=1, inplace=True)
#mar_per_day = combined_mar.groupby(['month', 'day', 'description']).mean()
#mar_per_day = mar_per_day.reset_index()
#print(mar_per_day.head())

# Create dataframe per month
#combined_mar.drop('day', axis=1, inplace=True)
#mar_per_month = combined_mar.groupby(['month', 'description']).mean()
#mar_per_month = mar_per_month.reset_index()
#print(mar_per_month.head())

#del combined_mar

Unnamed: 0,#object_id,description,result_timestamp,lamax,lamax_unit,laeq,laeq_unit,lceq,lceq_unit,lcpeak,lcpeak_unit,month,day,hour
0,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:00.462,60.5,dB(A),57.9,dB(A),63.36,dB(C),76.57,dB(C),3,1,0
1,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:01.462,54.1,dB(A),53.2,dB(A),61.86,dB(C),74.52,dB(C),3,1,0
2,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:02.462,61.4,dB(A),57.5,dB(A),64.12,dB(C),76.46,dB(C),3,1,0
3,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:03.462,61.3,dB(A),59.1,dB(A),64.68,dB(C),76.67,dB(C),3,1,0
4,255439,MP 01: Naamsestraat 35 Maxim,2022-03-01 00:00:04.462,61.1,dB(A),58.4,dB(A),64.53,dB(C),77.07,dB(C),3,1,0


- April

In [26]:
# Combining the datasets 
#combined_apr = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_apr['result_timestamp'] = pd.to_datetime(combined_apr['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_apr['month'] = combined_apr['result_timestamp'].dt.month
#combined_apr['day'] = combined_apr['result_timestamp'].dt.day
#combined_apr['hour'] = combined_apr['result_timestamp'].dt.hour
#combined_apr['minute'] = combined_apr['result_timestamp'].dt.minute

#combined_apr.head()


# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_apr.columns) - set(columns_to_keep)
#combined_apr.drop(columns=columns_to_drop, inplace=True)
#combined_apr.head()


# check for missing values in each column
#print(combined_apr.isnull().sum() / len(combined_apr))


# forward fill missing values 
#combined_apr.ffill(inplace=True)

# check whether there are missing values left
#print(combined_apr.isnull().sum())


# Create dataframe per hour
#apr_per_hour = combined_apr.groupby(['month', 'day', 'hour', 'description']).mean()
#apr_per_hour = apr_per_hour.reset_index()
#print(apr_per_hour.head())

# Create dataframe per day
#combined_apr.drop('hour', axis=1, inplace=True)
#apr_per_day = combined_apr.groupby(['month', 'day', 'description']).mean()
#apr_per_day = apr_per_day.reset_index()
#print(apr_per_day.head())

# Create dataframe per month
#combined_apr.drop('day', axis=1, inplace=True)
#apr_per_month = combined_apr.groupby(['month', 'description']).mean()
#apr_per_month = apr_per_month.reset_index()
#print(apr_per_month.head())

#del combined_apr

Unnamed: 0,#object_id,description,result_timestamp,lamax,lamax_unit,laeq,laeq_unit,lceq,lceq_unit,lcpeak,lcpeak_unit,month,day,hour
0,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:00.520,62.5,dB(A),59.3,dB(A),64.56,dB(C),78.13,dB(C),4,1,0
1,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:01.520,63.3,dB(A),61.3,dB(A),65.35,dB(C),78.09,dB(C),4,1,0
2,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:02.520,61.4,dB(A),59.1,dB(A),64.18,dB(C),76.63,dB(C),4,1,0
3,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:03.510,58.9,dB(A),56.6,dB(A),63.58,dB(C),75.74,dB(C),4,1,0
4,255439,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:00:04.510,59.6,dB(A),57.2,dB(A),63.32,dB(C),77.86,dB(C),4,1,0


- May

In [2]:
# Combining the datasets 
#combined_may = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_may['result_timestamp'] = pd.to_datetime(combined_may['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_may['month'] = combined_may['result_timestamp'].dt.month
#combined_may['day'] = combined_may['result_timestamp'].dt.day
#combined_may['hour'] = combined_may['result_timestamp'].dt.hour
#combined_may['minute'] = combined_may['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_may.columns) - set(columns_to_keep)
#combined_may.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_may.isnull().sum())


# forward fill missing values 
#combined_may.ffill(inplace=True)

# check whether there are missing values left
#print(combined_may.isnull().sum())


# Create dataframe per hour
#may_per_hour = combined_may.groupby(['month', 'day', 'hour', 'description']).mean()
#may_per_hour = may_per_hour.reset_index()
#print(may_per_hour.head())

# Create dataframe per day
#combined_may.drop('hour', axis=1, inplace=True)
#may_per_day = combined_may.groupby(['month', 'day', 'description']).mean()
#may_per_day = may_per_day.reset_index()
#print(may_per_day.head())

# Create dataframe per month
#combined_may.drop('day', axis=1, inplace=True)
#may_per_month = combined_may.groupby(['month', 'description']).mean()
#may_per_month = may_per_month.reset_index()
#print(may_per_month.head())

#del combined_may

description     0
lamax          11
laeq           11
month           0
day             0
hour            0
dtype: int64


- June

In [5]:
# Combining the datasets 
#combined_jun = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_jun['result_timestamp'] = pd.to_datetime(combined_jun['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_jun['month'] = combined_jun['result_timestamp'].dt.month
#combined_jun['day'] = combined_jun['result_timestamp'].dt.day
#combined_jun['hour'] = combined_jun['result_timestamp'].dt.hour
#combined_jun['minute'] = combined_jun['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_jun.columns) - set(columns_to_keep)
#combined_jun.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_jun.isnull().sum())


# Create dataframe per hour
#jun_per_hour = combined_jun.groupby(['month', 'day', 'hour', 'description']).mean()
#jun_per_hour = jun_per_hour.reset_index()
#print(jun_per_hour.head())

# Create dataframe per day
#combined_jun.drop('hour', axis=1, inplace=True)
#jun_per_day = combined_jun.groupby(['month', 'day', 'description']).mean()
#jun_per_day = jun_per_day.reset_index()
#print(jun_per_day.head())

# Create dataframe per month
#combined_jun.drop('day', axis=1, inplace=True)
#jun_per_month = combined_jun.groupby(['month', 'description']).mean()
#jun_per_month = jun_per_month.reset_index()
#print(jun_per_month.head())

#del combined_jun

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


- July

In [7]:
# Combining the datasets for 
#combined_jul = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_jul['result_timestamp'] = pd.to_datetime(combined_jul['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_jul['month'] = combined_jul['result_timestamp'].dt.month
#combined_jul['day'] = combined_jul['result_timestamp'].dt.day
#combined_jul['hour'] = combined_jul['result_timestamp'].dt.hour
#combined_jul['minute'] = combined_jul['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_jul.columns) - set(columns_to_keep)
#combined_jul.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_jul.isnull().sum())


# forward fill missing values 
#combined_jul.ffill(inplace=True)

# check whether there are missing values left
#print(combined_jul.isnull().sum())


# Create dataframe per hour
#jul_per_hour = combined_jul.groupby(['month', 'day', 'hour', 'description']).mean()
#jul_per_hour = jul_per_hour.reset_index()
#print(jul_per_hour.head())

# Create dataframe per day
#combined_jul.drop('hour', axis=1, inplace=True)
#jul_per_day = combined_jul.groupby(['month', 'day', 'description']).mean()
#jul_per_day = jul_per_day.reset_index()
#print(jul_per_day.head())

# Create dataframe per month
#combined_jul.drop('day', axis=1, inplace=True)
#jul_per_month = combined_jul.groupby(['month', 'description']).mean()
#jul_per_month = jul_per_month.reset_index()
#print(jul_per_month.head())

#del combined_jul

description    0
lamax          5
laeq           5
month          0
day            0
hour           0
dtype: int64


- August

In [10]:
# Combining the datasets for
#combined_aug = pd.concat(dfs, ignore_index=True)
#del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
#combined_aug['result_timestamp'] = pd.to_datetime(combined_aug['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
#combined_aug['month'] = combined_aug['result_timestamp'].dt.month
#combined_aug['day'] = combined_aug['result_timestamp'].dt.day
#combined_aug['hour'] = combined_aug['result_timestamp'].dt.hour
#combined_aug['minute'] = combined_aug['result_timestamp'].dt.minute

# Drop the columns we won't use
#columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
#columns_to_drop = set(combined_aug.columns) - set(columns_to_keep)
#combined_aug.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
#print(combined_aug.isnull().sum())


# Create dataframe per hour
#aug_per_hour = combined_aug.groupby(['month', 'day', 'hour', 'description']).mean()
#aug_per_hour = aug_per_hour.reset_index()
#print(aug_per_hour.head())

# Create dataframe per day
#combined_aug.drop('hour', axis=1, inplace=True)
#aug_per_day = combined_aug.groupby(['month', 'day', 'description']).mean()
#aug_per_day = aug_per_day.reset_index()
#print(aug_per_day.head())

# Create dataframe per month
#combined_aug.drop('day', axis=1, inplace=True)
#aug_per_month = combined_aug.groupby(['month', 'description']).mean()
#aug_per_month = aug_per_month.reset_index()
#print(aug_per_month.head())

#del combined_aug

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


- September

In [12]:
'''
# Combining the datasets 
combined_sep = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_sep['result_timestamp'] = pd.to_datetime(combined_sep['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_sep['month'] = combined_sep['result_timestamp'].dt.month
combined_sep['day'] = combined_sep['result_timestamp'].dt.day
combined_sep['hour'] = combined_sep['result_timestamp'].dt.hour
#combined_sep['minute'] = combined_sep['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_sep.columns) - set(columns_to_keep)
combined_sep.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_sep.isnull().sum())


# forward fill missing values 
combined_sep.ffill(inplace=True)

# check whether there are missing values left
print(combined_sep.isnull().sum())


# Create dataframe per hour
sep_per_hour = combined_sep.groupby(['month', 'day', 'hour', 'description']).mean()
sep_per_hour = sep_per_hour.reset_index()
print(sep_per_hour.head())

# Create dataframe per day
combined_sep.drop('hour', axis=1, inplace=True)
sep_per_day = combined_sep.groupby(['month', 'day', 'description']).mean()
sep_per_day = sep_per_day.reset_index()
print(sep_per_day.head())

# Create dataframe per month
combined_sep.drop('day', axis=1, inplace=True)
sep_per_month = combined_sep.groupby(['month', 'description']).mean()
sep_per_month = sep_per_month.reset_index()
print(sep_per_month.head())

del combined_sep
'''

description     0
lamax           2
laeq           12
month           0
day             0
hour            0
dtype: int64


- October

In [15]:
'''
# Combining the datasets 
combined_oct = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_oct['result_timestamp'] = pd.to_datetime(combined_oct['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_oct['month'] = combined_oct['result_timestamp'].dt.month
combined_oct['day'] = combined_oct['result_timestamp'].dt.day
combined_oct['hour'] = combined_oct['result_timestamp'].dt.hour
#combined_oct['minute'] = combined_oct['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_oct.columns) - set(columns_to_keep)
combined_oct.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_oct.isnull().sum())


# forward fill missing values 
combined_oct.ffill(inplace=True)

# check whether there are missing values left
print(combined_oct.isnull().sum())


# Create dataframe per hour
oct_per_hour = combined_oct.groupby(['month', 'day', 'hour', 'description']).mean()
oct_per_hour = oct_per_hour.reset_index()
print(oct_per_hour.head())

# Create dataframe per day
combined_oct.drop('hour', axis=1, inplace=True)
oct_per_day = combined_oct.groupby(['month', 'day', 'description']).mean()
oct_per_day = oct_per_day.reset_index()
print(oct_per_day.head())

# Create dataframe per month
combined_oct.drop('day', axis=1, inplace=True)
oct_per_month = combined_oct.groupby(['month', 'description']).mean()
oct_per_month = oct_per_month.reset_index()
print(oct_per_month.head())

del combined_oct
'''

description     0
lamax          10
laeq           10
month           0
day             0
hour            0
dtype: int64


- November

In [18]:
''''
# Combining the datasets 
combined_nov = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_nov['result_timestamp'] = pd.to_datetime(combined_nov['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_nov['month'] = combined_nov['result_timestamp'].dt.month
combined_nov['day'] = combined_nov['result_timestamp'].dt.day
combined_nov['hour'] = combined_nov['result_timestamp'].dt.hour
#combined_nov['minute'] = combined_nov['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_nov.columns) - set(columns_to_keep)
combined_nov.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_nov.isnull().sum())


# Create dataframe per hour
nov_per_hour = combined_nov.groupby(['month', 'day', 'hour', 'description']).mean()
nov_per_hour = nov_per_hour.reset_index()
print(nov_per_hour.head())

# Create dataframe per day
combined_nov.drop('hour', axis=1, inplace=True)
nov_per_day = combined_nov.groupby(['month', 'day', 'description']).mean()
nov_per_day = nov_per_day.reset_index()
print(nov_per_day.head())

# Create dataframe per month
combined_nov.drop('day', axis=1, inplace=True)
nov_per_month = combined_nov.groupby(['month', 'description']).mean()
nov_per_month = nov_per_month.reset_index()
print(nov_per_month.head())

del combined_nov
'''

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


- December

In [20]:
'''

# Combining the datasets 
combined_dec = pd.concat(dfs, ignore_index=True)
del dfs # deleting the separate dataframes to minimize memory usage

# extract the month, day, hour, minute of "result_timestamp"
combined_dec['result_timestamp'] = pd.to_datetime(combined_dec['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')
combined_dec['month'] = combined_dec['result_timestamp'].dt.month
combined_dec['day'] = combined_dec['result_timestamp'].dt.day
combined_dec['hour'] = combined_dec['result_timestamp'].dt.hour
#combined_dec['minute'] = combined_dec['result_timestamp'].dt.minute

# Drop the columns we won't use
columns_to_keep = ['description', 'lamax', 'laeq', 'month', 'day', 'hour'] #also minute if we calculate it
columns_to_drop = set(combined_dec.columns) - set(columns_to_keep)
combined_dec.drop(columns=columns_to_drop, inplace=True)

# check for missing values in each column
print(combined_dec.isnull().sum())


# Create dataframe per hour
dec_per_hour = combined_dec.groupby(['month', 'day', 'hour', 'description']).mean()
dec_per_hour = dec_per_hour.reset_index()
print(dec_per_hour.head())

# Create dataframe per day
combined_dec.drop('hour', axis=1, inplace=True)
dec_per_day = combined_dec.groupby(['month', 'day', 'description']).mean()
dec_per_day = dec_per_day.reset_index()
print(dec_per_day.head())

# Create dataframe per month
combined_dec.drop('day', axis=1, inplace=True)
dec_per_month = combined_dec.groupby(['month', 'description']).mean()
dec_per_month = dec_per_month.reset_index()
print(dec_per_month.head())

del combined_dec
'''

description    0
lamax          0
laeq           0
month          0
day            0
hour           0
dtype: int64


Now that we have exported the preprocessed dataframes for the noise and weather data of 2022, we can just use these files instead of loading all 112 files from the S3 bucket each time, as this takes a lot of time.

## OLD PREPROCESSING

### Reading in the data from the S3 bucket (don't forget to pip install boto3)

In [None]:
# # meteo data
# Q1_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q1.csv')
# Q2_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q2.csv')
# Q3_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q3.csv')
# Q4_2022 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Meteo+data/LC_2022Q4.csv')

In [None]:
# REMARK: this is the 'old' noise data, don't run this

# noise data
# exp40_naamse35 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# exp40_naamse57 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# exp40_naamse62 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# exp40_calvarie = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# exp40_naamse81 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# exp40_park = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# exp40_kiosk = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# exp40_vrijt = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# exp40_his = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_40/csv_results_40_303910_mp-04-his-hears.csv', header=0, sep=';')

# exp41_naamse35 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# exp41_naamse57 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# exp41_naamse62 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# exp41_calvarie = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# exp41_naamse81 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# exp41_park = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# exp41_kiosk = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# exp41_vrijt = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# exp41_his = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_41/csv_results_41_303910_mp-04-his-hears.csv', header=0, sep=';')

# exp42_naamse35 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# exp42_naamse57 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# exp42_naamse62 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# exp42_calvarie = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# exp42_naamse81 = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# exp42_park = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# exp42_kiosk = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# exp42_vrijt = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# exp42_his = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/export_42/csv_results_42_303910_mp-04-his-hears.csv', header=0, sep=';')


In [None]:
# noise_columns = ["#object_id", "description", "result_timestamp", "lamax", "laeq"]
# naamse35_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';', usecols=noise_columns)

In [None]:
# # updated noise data - January
# naamse35_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_jan = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jan/Jan/csv_results_42_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - February
# naamse35_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_feb = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Feb/Feb/csv_results_42_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - March
# naamse35_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_mar = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/March/March/csv_results_44_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - April
# naamse35_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_apr = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/April/April/csv_results_45_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - May
# naamse35_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_may = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/May/May/csv_results_46_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - June
# naamse35_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_jun = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/June/June/csv_results_47_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - July
# naamse35_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_jul = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Jul/Jul/csv_results_48_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - August
# naamse35_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_aug = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Aug/Aug/csv_results_49_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - September
# naamse35_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_sep = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Sep/Sep/csv_results_50_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - October
# naamse35_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_oct = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Oct/Oct/csv_results_51_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - November
# naamse35_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_nov = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Nov/Nov/csv_results_52_303910_mp-04-his-hears.csv', header=0, sep=';')

In [None]:
# # updated noise data - December
# naamse35_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255439_mp-01-naamsestraat-35-maxim.csv', header=0, sep=';')
# naamse57_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255440_mp-02-naamsestraat-57-xior.csv', header=0, sep=';')
# naamse62_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255441_mp-03-naamsestraat-62-taste.csv', header=0, sep=';')
# calvarie_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255442_mp-05-calvariekapel-ku-leuven.csv', header=0, sep=';')
# park_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255443_mp-06-parkstraat-2-la-filosovia.csv', header=0, sep=';')
# naamse81_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255444_mp-07-naamsestraat-81.csv', header=0, sep=';')
# kiosk_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_255445_mp-08-kiosk-stadspark.csv', header=0, sep=';')
# vrijt_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_280324_mp08bis---vrijthof.csv', header=0, sep=';')
# his_dec = pd.read_csv('https://mda-georgia-bucket.s3.eu-central-1.amazonaws.com/Noise+data/Dec/Dec/csv_results_53_303910_mp-04-his-hears.csv', header=0, sep=';')

### Combining and aggregating the meteo data

In [None]:
# # combine meteo dataset 
# meteocombined = pd.concat([Q1_2022, Q2_2022, Q3_2022, Q4_2022], axis=0)
# meteocombined.head()

In [None]:
# check for missing values in each column
# print(meteocombined.isnull().sum())

In [None]:
# # aggregate meteo data by day
# avg_meteo_combined = meteocombined.groupby(['Year','Month', 'Day']).mean()
# avg_meteo_combined = avg_meteo_combined.reset_index()
# avg_meteo_combined.head()


In [None]:
# month_max_value = avg_meteo_combined['Month'].max()
# print(f"This combined meteo dataset contains the weather data for all {month_max_value} months.")

### Combining and aggregating the noise data

- January

In [None]:
# # combine noise data for January together
# noise_jan_combined = pd.concat([naamse35_jan, naamse57_jan, naamse62_jan, calvarie_jan, park_jan, naamse81_jan, kiosk_jan, vrijt_jan, his_jan], axis=0)
# noise_jan_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_jan_combined['result_timestamp'] = pd.to_datetime(noise_jan_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_jan_combined['result_date'] = noise_jan_combined['result_timestamp'].dt.date
# noise_jan_combined['result_month'] = noise_jan_combined['result_timestamp'].dt.month
# noise_jan_combined['result_day'] = noise_jan_combined['result_timestamp'].dt.day
# noise_jan_combined['result_hour'] = noise_jan_combined['result_timestamp'].dt.hour
# noise_jan_combined['result_minute'] = noise_jan_combined['result_timestamp'].dt.minute

# noise_jan_combined.head()

In [None]:
# # aggregate the data by day
# avg_jan_combined = noise_jan_combined.groupby(['result_date','description']).mean()
# avg_jan_combined = avg_jan_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_jan_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_jan_combined.head(150)

In [None]:
# check for missing values in each column
# print(avg_jan_combined.isnull().sum())

- February

In [None]:
# # combine noise data for February together
# noise_feb_combined = pd.concat([naamse35_feb, naamse57_feb, naamse62_feb, calvarie_feb, park_feb, naamse81_feb, kiosk_feb, vrijt_feb, his_feb], axis=0)
# noise_feb_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_feb_combined['result_timestamp'] = pd.to_datetime(noise_feb_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_feb_combined['result_date'] = noise_feb_combined['result_timestamp'].dt.date
# noise_feb_combined['result_month'] = noise_feb_combined['result_timestamp'].dt.month
# noise_feb_combined['result_day'] = noise_feb_combined['result_timestamp'].dt.day
# noise_feb_combined['result_hour'] = noise_feb_combined['result_timestamp'].dt.hour
# noise_feb_combined['result_minute'] = noise_feb_combined['result_timestamp'].dt.minute

# noise_feb_combined.head()

In [None]:
# # aggregate the data by day
# avg_feb_combined = noise_feb_combined.groupby(['result_date','description']).mean()
# avg_feb_combined = avg_feb_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_feb_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_feb_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_feb_combined.isnull().sum())

- March

In [None]:
# # combine noise data for March together
# noise_mar_combined = pd.concat([naamse35_mar, naamse57_mar, naamse62_mar, calvarie_mar, park_mar, naamse81_mar, kiosk_mar, vrijt_mar, his_mar], axis=0)
# noise_mar_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_mar_combined['result_timestamp'] = pd.to_datetime(noise_mar_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_mar_combined['result_date'] = noise_mar_combined['result_timestamp'].dt.date
# noise_mar_combined['result_month'] = noise_mar_combined['result_timestamp'].dt.month
# noise_mar_combined['result_day'] = noise_mar_combined['result_timestamp'].dt.day
# noise_mar_combined['result_hour'] = noise_mar_combined['result_timestamp'].dt.hour
# noise_mar_combined['result_minute'] = noise_mar_combined['result_timestamp'].dt.minute

# noise_mar_combined.head()

In [None]:
# # aggregate the data by day
# avg_mar_combined = noise_mar_combined.groupby(['result_date','description']).mean()
# avg_mar_combined = avg_mar_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_mar_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_mar_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_mar_combined.isnull().sum())

- April 

In [None]:
# # combine noise data for April together
# noise_apr_combined = pd.concat([naamse35_apr, naamse57_apr, naamse62_apr, calvarie_apr, park_apr, naamse81_apr, kiosk_apr, vrijt_apr, his_apr], axis=0)
# noise_apr_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_apr_combined['result_timestamp'] = pd.to_datetime(noise_apr_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_apr_combined['result_date'] = noise_apr_combined['result_timestamp'].dt.date
# noise_apr_combined['result_month'] = noise_apr_combined['result_timestamp'].dt.month
# noise_apr_combined['result_day'] = noise_apr_combined['result_timestamp'].dt.day
# noise_apr_combined['result_hour'] = noise_apr_combined['result_timestamp'].dt.hour
# noise_apr_combined['result_minute'] = noise_apr_combined['result_timestamp'].dt.minute

# noise_apr_combined.head()

In [None]:
# # aggregate the data by day
# avg_apr_combined = noise_apr_combined.groupby(['result_date','description']).mean()
# avg_apr_combined = avg_apr_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_apr_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_apr_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_apr_combined.isnull().sum())

- May 

In [None]:
# # combine noise data for May together
# noise_may_combined = pd.concat([naamse35_may, naamse57_may, naamse62_may, calvarie_may, park_may, naamse81_may, kiosk_may, vrijt_may, his_may], axis=0)
# noise_may_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_may_combined['result_timestamp'] = pd.to_datetime(noise_may_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_may_combined['result_date'] = noise_may_combined['result_timestamp'].dt.date
# noise_may_combined['result_month'] = noise_may_combined['result_timestamp'].dt.month
# noise_may_combined['result_day'] = noise_may_combined['result_timestamp'].dt.day
# noise_may_combined['result_hour'] = noise_may_combined['result_timestamp'].dt.hour
# noise_may_combined['result_minute'] = noise_may_combined['result_timestamp'].dt.minute

# noise_may_combined.head()

In [None]:
# # aggregate the data by day
# avg_may_combined = noise_may_combined.groupby(['result_date','description']).mean()
# avg_may_combined = avg_may_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_may_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_may_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_may_combined.isnull().sum())

- June

In [None]:
# # combine noise data for June together
# noise_jun_combined = pd.concat([naamse35_jun, naamse57_jun, naamse62_jun, calvarie_jun, park_jun, naamse81_jun, kiosk_jun, vrijt_jun, his_jun], axis=0)
# noise_jun_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_jun_combined['result_timestamp'] = pd.to_datetime(noise_jun_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_jun_combined['result_date'] = noise_jun_combined['result_timestamp'].dt.date
# noise_jun_combined['result_month'] = noise_jun_combined['result_timestamp'].dt.month
# noise_jun_combined['result_day'] = noise_jun_combined['result_timestamp'].dt.day
# noise_jun_combined['result_hour'] = noise_jun_combined['result_timestamp'].dt.hour
# noise_jun_combined['result_minute'] = noise_jun_combined['result_timestamp'].dt.minute

# noise_jun_combined.head()

In [None]:
# aggregate the data by day
# avg_jun_combined = noise_jun_combined.groupby(['result_date','description']).mean()
# avg_jun_combined = avg_jun_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_jun_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_jun_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_jun_combined.isnull().sum())

- July

In [None]:
# # combine noise data for July together
# noise_jul_combined = pd.concat([naamse35_jul, naamse57_jul, naamse62_jul, calvarie_jul, park_jul, naamse81_jul, kiosk_jul, vrijt_jul, his_jul], axis=0)
# noise_jul_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_jul_combined['result_timestamp'] = pd.to_datetime(noise_jul_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_jul_combined['result_date'] = noise_jul_combined['result_timestamp'].dt.date
# noise_jul_combined['result_month'] = noise_jul_combined['result_timestamp'].dt.month
# noise_jul_combined['result_day'] = noise_jul_combined['result_timestamp'].dt.day
# noise_jul_combined['result_hour'] = noise_jul_combined['result_timestamp'].dt.hour
# noise_jul_combined['result_minute'] = noise_jul_combined['result_timestamp'].dt.minute

# noise_jul_combined.head()

In [None]:
# # aggregate the data by day
# avg_jul_combined = noise_jul_combined.groupby(['result_date','description']).mean()
# avg_jul_combined = avg_jul_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_jul_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_jul_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_jul_combined.isnull().sum())

- August

In [None]:
# # combine noise data for August together
# noise_aug_combined = pd.concat([naamse35_aug, naamse57_aug, naamse62_aug, calvarie_aug, park_aug, naamse81_aug, kiosk_aug, vrijt_aug, his_aug], axis=0)
# noise_aug_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_aug_combined['result_timestamp'] = pd.to_datetime(noise_aug_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_aug_combined['result_date'] = noise_aug_combined['result_timestamp'].dt.date
# noise_aug_combined['result_month'] = noise_aug_combined['result_timestamp'].dt.month
# noise_aug_combined['result_day'] = noise_aug_combined['result_timestamp'].dt.day
# noise_aug_combined['result_hour'] = noise_aug_combined['result_timestamp'].dt.hour
# noise_aug_combined['result_minute'] = noise_aug_combined['result_timestamp'].dt.minute

# noise_aug_combined.head()

In [None]:
# # aggregate the data by day
# avg_aug_combined = noise_aug_combined.groupby(['result_date','description']).mean()
# avg_aug_combined = avg_aug_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_aug_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_aug_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_aug_combined.isnull().sum())

- September

In [None]:
# # combine noise data for September together
# noise_sep_combined = pd.concat([naamse35_sep, naamse57_sep, naamse62_sep, calvarie_sep, park_sep, naamse81_sep, kiosk_sep, vrijt_sep, his_sep], axis=0)
# noise_sep_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_sep_combined['result_timestamp'] = pd.to_datetime(noise_sep_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_sep_combined['result_date'] = noise_sep_combined['result_timestamp'].dt.date
# noise_sep_combined['result_month'] = noise_sep_combined['result_timestamp'].dt.month
# noise_sep_combined['result_day'] = noise_sep_combined['result_timestamp'].dt.day
# noise_sep_combined['result_hour'] = noise_sep_combined['result_timestamp'].dt.hour
# noise_sep_combined['result_minute'] = noise_sep_combined['result_timestamp'].dt.minute

# noise_sep_combined.head()

In [None]:
# # aggregate the data by day
# avg_sep_combined = noise_sep_combined.groupby(['result_date','description']).mean()
# avg_sep_combined = avg_sep_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_sep_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_sep_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_sep_combined.isnull().sum())

- October

In [None]:
# # combine noise data for Octber together
# noise_oct_combined = pd.concat([naamse35_oct, naamse57_oct, naamse62_oct, calvarie_oct, park_oct, naamse81_oct, kiosk_oct, vrijt_oct, his_oct], axis=0)
# noise_oct_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_oct_combined['result_timestamp'] = pd.to_datetime(noise_oct_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_oct_combined['result_date'] = noise_oct_combined['result_timestamp'].dt.date
# noise_oct_combined['result_month'] = noise_oct_combined['result_timestamp'].dt.month
# noise_oct_combined['result_day'] = noise_oct_combined['result_timestamp'].dt.day
# noise_oct_combined['result_hour'] = noise_oct_combined['result_timestamp'].dt.hour
# noise_oct_combined['result_minute'] = noise_oct_combined['result_timestamp'].dt.minute

# noise_oct_combined.head()

In [None]:
# # aggregate the data by day
# avg_oct_combined = noise_oct_combined.groupby(['result_date','description']).mean()
# avg_oct_combined = avg_oct_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_oct_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_oct_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_oct_combined.isnull().sum())

- November

In [None]:
# # combine noise data for November together
# noise_nov_combined = pd.concat([naamse35_nov, naamse57_nov, naamse62_nov, calvarie_nov, park_nov, naamse81_nov, kiosk_nov, vrijt_nov, his_nov], axis=0)
# noise_nov_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_nov_combined['result_timestamp'] = pd.to_datetime(noise_nov_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_nov_combined['result_date'] = noise_nov_combined['result_timestamp'].dt.date
# noise_nov_combined['result_month'] = noise_nov_combined['result_timestamp'].dt.month
# noise_nov_combined['result_day'] = noise_nov_combined['result_timestamp'].dt.day
# noise_nov_combined['result_hour'] = noise_nov_combined['result_timestamp'].dt.hour
# noise_nov_combined['result_minute'] = noise_nov_combined['result_timestamp'].dt.minute

# noise_nov_combined.head()

In [None]:
# # aggregate the data by day
# avg_nov_combined = noise_nov_combined.groupby(['result_date','description']).mean()
# avg_nov_combined = avg_nov_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_nov_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_nov_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_nov_combined.isnull().sum())

- December

In [None]:
# # combine noise data for December together
# noise_dec_combined = pd.concat([naamse35_dec, naamse57_dec, naamse62_dec, calvarie_dec, park_dec, naamse81_dec, kiosk_dec, vrijt_dec, his_dec], axis=0)
# noise_dec_combined.head()

In [None]:
# # extract the date, month, hour, minute of "result_timestamp"
# noise_dec_combined['result_timestamp'] = pd.to_datetime(noise_dec_combined['result_timestamp'], format='%d/%m/%Y %H:%M:%S.%f')


# noise_dec_combined['result_date'] = noise_dec_combined['result_timestamp'].dt.date
# noise_dec_combined['result_month'] = noise_dec_combined['result_timestamp'].dt.month
# noise_dec_combined['result_day'] = noise_dec_combined['result_timestamp'].dt.day
# noise_dec_combined['result_hour'] = noise_dec_combined['result_timestamp'].dt.hour
# noise_dec_combined['result_minute'] = noise_dec_combined['result_timestamp'].dt.minute

# noise_dec_combined.head()

In [None]:
# # aggregate the data by day
# avg_dec_combined = noise_dec_combined.groupby(['result_date','description']).mean()
# avg_dec_combined = avg_dec_combined.reset_index()
# columns_to_drop = ['result_hour', 'result_minute']
# avg_dec_combined.drop(columns_to_drop, axis=1, inplace=True)
# avg_dec_combined.head(150)

In [None]:
# # check for missing values in each column
# print(avg_dec_combined.isnull().sum())

Combining monthly noise level datasets into a yearly dataset

In [None]:
# List of the monthly datasets
datasets = [avg_jan_combined, avg_feb_combined, avg_mar_combined, avg_apr_combined, avg_may_combined, avg_jun_combined, avg_jul_combined, avg_aug_combined, avg_sep_combined, avg_oct_combined, avg_nov_combined, avg_dec_combined]

# Concatenate the datasets vertically
avg_year_combined = pd.concat(datasets, ignore_index=True)

# Sort the combined dataset by 'result_date' in ascending order
avg_year_combined.sort_values(by='result_date', inplace=True)

# Reset the index of the combined dataset
avg_year_combined.reset_index(drop=True, inplace=True)

# Display the combined and sorted yearly dataset
avg_year_combined.head(2000)
