In [2]:
import pandas as pd

# Location of the raw, semicolon-separated export.
file_path = '../../00_Uncleaned_Data/Target/100049.csv'

# Load the file and order the rows by 'Datum/Zeit', renumbering the index.
# The column is not parsed as datetime here (no parse_dates), so the sort
# compares the raw timestamp text.
data = (
    pd.read_csv(file_path, sep=';')
    .sort_values(by='Datum/Zeit')
    .reset_index(drop=True)
)

# Display the first few rows of the dataframe
data.head()

Unnamed: 0,Datum/Zeit,timestamp_text,PM10 (Stundenmittelwerte),PM2.5 (Stundenmittelwerte),O3 (Stundenmittelwerte),NO2 (Stundenmittelwerte),geo_point_2d
0,2021-11-30T23:00:00+00:00,2021-12-01 00:00:00,7.456,7.023,38.615,24.832,"47.5659354, 7.58192"
1,2021-12-01T00:00:00+00:00,2021-12-01 01:00:00,4.832,2.241,67.784,3.288,"47.5659354, 7.58192"
2,2021-12-01T01:00:00+00:00,2021-12-01 02:00:00,4.682,2.092,70.599,2.459,"47.5659354, 7.58192"
3,2021-12-01T02:00:00+00:00,2021-12-01 03:00:00,5.035,2.398,70.212,2.954,"47.5659354, 7.58192"
4,2021-12-01T03:00:00+00:00,2021-12-01 04:00:00,5.102,2.345,75.184,2.432,"47.5659354, 7.58192"


In [2]:
# Count the missing entries in each column and print the per-column totals.
missing_values = data.isna().sum()
print(missing_values)

Datum/Zeit                      0
timestamp_text                  0
PM10 (Stundenmittelwerte)     263
PM2.5 (Stundenmittelwerte)     23
O3 (Stundenmittelwerte)        25
NO2 (Stundenmittelwerte)       30
geo_point_2d                    0
dtype: int64


In [3]:
# Parse the 'Datum/Zeit' column into pandas datetimes and store it back
timestamps = pd.to_datetime(data['Datum/Zeit'])
data['Datum/Zeit'] = timestamps

# Temporary grouping keys used by the imputation below: hour of day and
# day of week as returned by the .dt accessor.
# NOTE(review): both are taken in the timezone of the parsed timestamps
# (the sample rows show +00:00), not local time - confirm this is intended.
data['Hour'] = timestamps.dt.hour
data['Weekday'] = timestamps.dt.weekday

# Impute gaps with the average of the matching (hour, weekday) slot
def fill_missing_values(df, column):
    """Return `column` with its missing entries replaced by group means.

    Rows are grouped by the 'Hour' and 'Weekday' columns of `df`; each
    missing value is replaced by the mean of `column` within its group.
    Existing values are kept as they are, and `df` itself is not modified.

    Parameters:
        df: DataFrame containing 'Hour', 'Weekday' and `column`.
        column: Name of the column to fill.

    Returns:
        A Series aligned with `df` holding the filled values. An entry
        stays missing if its whole group has no value to average.
    """
    slot_means = df.groupby(['Hour', 'Weekday'])[column].transform('mean')
    return df[column].fillna(slot_means)

# Impute the gaps in the PM10 and PM2.5 columns, one column at a time
for pm_column in ('PM10 (Stundenmittelwerte)', 'PM2.5 (Stundenmittelwerte)'):
    data[pm_column] = fill_missing_values(data, pm_column)

# The grouping keys were only needed for the imputation; remove them again
data = data.drop(columns=['Hour', 'Weekday'])

# Show the first rows of the dataframe to verify the result
data.head()

Unnamed: 0,Datum/Zeit,timestamp_text,PM10 (Stundenmittelwerte),PM2.5 (Stundenmittelwerte),O3 (Stundenmittelwerte),NO2 (Stundenmittelwerte),geo_point_2d
0,2021-11-30 23:00:00+00:00,2021-12-01 00:00:00,7.456,7.023,38.615,24.832,"47.5659354, 7.58192"
1,2021-12-01 00:00:00+00:00,2021-12-01 01:00:00,4.832,2.241,67.784,3.288,"47.5659354, 7.58192"
2,2021-12-01 01:00:00+00:00,2021-12-01 02:00:00,4.682,2.092,70.599,2.459,"47.5659354, 7.58192"
3,2021-12-01 02:00:00+00:00,2021-12-01 03:00:00,5.035,2.398,70.212,2.954,"47.5659354, 7.58192"
4,2021-12-01 03:00:00+00:00,2021-12-01 04:00:00,5.102,2.345,75.184,2.432,"47.5659354, 7.58192"


In [4]:
# Recount the missing entries per column after the imputation step.
missing_values_after_fill = data.isna().sum()
print(missing_values_after_fill)

Datum/Zeit                     0
timestamp_text                 0
PM10 (Stundenmittelwerte)      0
PM2.5 (Stundenmittelwerte)     0
O3 (Stundenmittelwerte)       25
NO2 (Stundenmittelwerte)      30
geo_point_2d                   0
dtype: int64


In [5]:
# Create a complete hourly range from the minimum to the maximum timestamp in the data.
# The lowercase 'h' alias is used because the uppercase 'H' is deprecated and
# makes pandas emit a FutureWarning.
complete_date_range = pd.date_range(
    start=data['Datum/Zeit'].min(),
    end=data['Datum/Zeit'].max(),
    freq='h',
)

# Find the missing timestamps by comparing the complete date range with the 'Datum/Zeit' column
missing_timestamps = complete_date_range.difference(data['Datum/Zeit'])

print("Missing timestamps:")
print(missing_timestamps)

# Count the number of missing hours
missing_hours_count = len(missing_timestamps)
print(f"Number of missing hours: {missing_hours_count}")

Missing timestamps:
DatetimeIndex(['2022-02-17 15:00:00+00:00', '2022-02-17 16:00:00+00:00',
               '2022-02-17 17:00:00+00:00', '2022-03-27 01:00:00+00:00',
               '2022-03-27 02:00:00+00:00', '2022-08-23 15:00:00+00:00',
               '2022-08-23 16:00:00+00:00', '2022-08-23 17:00:00+00:00',
               '2022-10-30 01:00:00+00:00', '2023-03-26 01:00:00+00:00',
               '2023-03-26 02:00:00+00:00', '2023-04-26 09:00:00+00:00',
               '2023-10-29 01:00:00+00:00', '2024-01-19 15:00:00+00:00',
               '2024-03-31 01:00:00+00:00', '2024-03-31 02:00:00+00:00',
               '2024-05-07 12:00:00+00:00', '2024-07-07 21:00:00+00:00',
               '2024-07-07 22:00:00+00:00', '2024-07-07 23:00:00+00:00',
               '2024-07-08 00:00:00+00:00', '2024-07-08 01:00:00+00:00',
               '2024-07-08 02:00:00+00:00', '2024-07-08 03:00:00+00:00',
               '2024-07-08 04:00:00+00:00', '2024-07-08 05:00:00+00:00',
               '2024-07-08 06:0

  complete_date_range = pd.date_range(start=data['Datum/Zeit'].min(), end=data['Datum/Zeit'].max(), freq='H')


In [6]:
# Put the frame on the complete hourly range, then turn the index back into
# a regular column named 'Datum/Zeit'. Timestamps that were absent become
# new rows whose other columns are all NaN.
data = (
    data.set_index('Datum/Zeit')
    .reindex(complete_date_range)
    .reset_index()
    .rename(columns={'index': 'Datum/Zeit'})
)

# Interpolate the gaps in the two PM columns (default interpolate() settings).
# NOTE(review): only these two columns are filled; the remaining columns of
# the inserted rows stay NaN.
for pm_column in ('PM10 (Stundenmittelwerte)', 'PM2.5 (Stundenmittelwerte)'):
    data[pm_column] = data[pm_column].interpolate()

# Show the first rows of the dataframe to verify the result
data.head()

Unnamed: 0,Datum/Zeit,timestamp_text,PM10 (Stundenmittelwerte),PM2.5 (Stundenmittelwerte),O3 (Stundenmittelwerte),NO2 (Stundenmittelwerte),geo_point_2d
0,2021-11-30 23:00:00+00:00,2021-12-01 00:00:00,7.456,7.023,38.615,24.832,"47.5659354, 7.58192"
1,2021-12-01 00:00:00+00:00,2021-12-01 01:00:00,4.832,2.241,67.784,3.288,"47.5659354, 7.58192"
2,2021-12-01 01:00:00+00:00,2021-12-01 02:00:00,4.682,2.092,70.599,2.459,"47.5659354, 7.58192"
3,2021-12-01 02:00:00+00:00,2021-12-01 03:00:00,5.035,2.398,70.212,2.954,"47.5659354, 7.58192"
4,2021-12-01 03:00:00+00:00,2021-12-01 04:00:00,5.102,2.345,75.184,2.432,"47.5659354, 7.58192"


In [7]:
# Rebuild the complete hourly range from the minimum to the maximum timestamp in the data.
# The lowercase 'h' alias is used because the uppercase 'H' is deprecated and
# makes pandas emit a FutureWarning.
complete_date_range = pd.date_range(
    start=data['Datum/Zeit'].min(),
    end=data['Datum/Zeit'].max(),
    freq='h',
)

# Find the missing timestamps by comparing the complete date range with the 'Datum/Zeit' column
missing_timestamps = complete_date_range.difference(data['Datum/Zeit'])

print("Missing timestamps:")
print(missing_timestamps)

# Count the number of missing hours
missing_hours_count = len(missing_timestamps)
print(f"Number of missing hours: {missing_hours_count}")

Missing timestamps:
DatetimeIndex([], dtype='datetime64[ns, UTC]', freq='h')
Number of missing hours: 0


  complete_date_range = pd.date_range(start=data['Datum/Zeit'].min(), end=data['Datum/Zeit'].max(), freq='H')


In [None]:
# Write the cleaned frame to CSV, leaving the row index out of the file.
output_path = '../02_Cleaned/Target/10Targcleaned.csv'
data.to_csv(output_path, index=False)