In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import pickle


In [2]:
#Grunddaten
Wetterdaten = pd.read_csv("../02_Cleaned/Features/01Wetterdaten_cleaned.csv")
Verkehrsdaten = pd.read_csv("../02_Cleaned/Features/02Verkehrsdaten.csv")
Parkhäuser = pd.read_csv("../02_Cleaned/Features/03Parkhäuser.csv")
Flugbewegungen = pd.read_csv("../02_Cleaned/Features/04Airport_traffic_hourly_cleaned.csv")
Gasverbrauch = pd.read_csv("../02_Cleaned/Features/05Gasverbrauch_cleaned.csv")
Stromverbrauch = pd.read_csv("../02_Cleaned/Features/06Stromverbrauch_hourly_cleaned.csv")
#Zusätzliche Daten (Luftdaten von anderen Stationen in Basel)
Basel_Binningen = pd.read_csv("../02_Cleaned/Target/Luft_Basel-Binningen_cleaned.csv")
Chrischona = pd.read_csv("../02_Cleaned/Target/Luft_Chrischona_cleaned.csv")
Feldbergstrasse = pd.read_csv("../02_Cleaned/Target/Luft_Feldbergstrasse_cleaned.csv")


# Ensure the index is a DatetimeIndex (it should already be with UTC timezone)
Basel_Binningen.index = pd.to_datetime(Basel_Binningen.index, utc=True)
Chrischona.index = pd.to_datetime(Chrischona.index, utc=True)
Feldbergstrasse.index = pd.to_datetime(Feldbergstrasse.index, utc=True)

# Create a one-hour lag for the specified dataframes
Basel_Binningen_lagged = Basel_Binningen.shift(periods=1, freq='h')
Chrischona_lagged = Chrischona.shift(periods=1, freq='h')
Feldbergstrasse_lagged = Feldbergstrasse.shift(periods=1, freq='h')

Basel_Binningen.reset_index(inplace=True)
Chrischona.reset_index(inplace=True)
Feldbergstrasse.reset_index(inplace=True)

# Display the first few rows of the lagged dataframes
print(Basel_Binningen_lagged.head())
print(Chrischona_lagged.head())
print(Feldbergstrasse_lagged.head())


In [3]:

Parkhäuser.rename(columns={"RoundedTime": "Datum"}, inplace=True)
Parkhäuser.head()
#Flugbewegungen.drop(columns=["Unnamed: 0"], inplace=True)
Flugbewegungen.head()
#Gasverbrauch.head()
#Schallmessungen.drop(columns=["Unnamed: 0"], inplace=True)
#Schallmessungen.head()
#Stromverbrauch.drop(columns=["Unnamed: 0"], inplace=True)
#Stromverbrauch.head()
#Wetterdaten.head()
#Verkehrsdaten.rename(columns={"DateTimeFrom": "Datum"}, inplace=True)
#Verkehrsdaten.head()

Parkhäuser.head()
Verkehrsdaten = Verkehrsdaten.rename(columns={"DateTimeFrom": "Datum"})
print(Verkehrsdaten['Datum'].isna().sum())




0


In [4]:
#standardise the data


dfs = [Parkhäuser, Flugbewegungen, Gasverbrauch, Stromverbrauch, Wetterdaten, Verkehrsdaten, Basel_Binningen, Chrischona, Feldbergstrasse]

# Dictionary to store the mean and standard deviation for each dataframe
scalers = {}

# Loop through each dataframe
for i, df in enumerate(dfs):
    numeric_cols = df.select_dtypes(include=[np.number])
    if not numeric_cols.empty:
        if i == 1:  # Exclude 'Hour' column for Flugbewegungen
            numeric_cols = numeric_cols.drop(columns=['Hour'])
        print(f"Processing df{i} with columns: {numeric_cols.columns}")
        scaler = StandardScaler()
        # Fit the scaler on the dataframe and transform the data
        scaled_data = scaler.fit_transform(numeric_cols)
        
        # Save the mean and standard deviation
        scalers[f'df{i}'] = {'mean': scaler.mean_, 'scale': scaler.scale_}
        
        # Replace the original data with the standardized data
        df[numeric_cols.columns] = scaled_data
        dfs[i] = df
    else:
        print(f"df{i} has no numeric columns or is empty.")

# Assign the transformed dataframes back to their original names
Parkhäuser, Flugbewegungen, Gasverbrauch, Stromverbrauch, Wetterdaten, Verkehrsdaten, Basel_Binningen, Chrischona, Feldbergstrasse = dfs

# Save the scalers dictionary for later use
with open('../04_WorkingDatasets/NormalData/scalers.pkl', 'wb') as f:
    pickle.dump(scalers, f)


Processing df0 with columns: Index(['Rebgassechange', 'Clarahuuschange', 'Citychange', 'Storchenchange',
       'Post Baselchange', 'Aeschenchange', 'Bahnhof Südchange',
       'Bad. Bahnhofchange', 'Europechange', 'Claramattechange',
       'Elisabethenchange', 'Steinenchange', 'Kunstmuseumchange',
       'Messechange', 'Anfoschange', 'Centralbahnparkingchange'],
      dtype='object')
Processing df1 with columns: Index(['Unnamed: 0', 'Traffic'], dtype='object')
Processing df2 with columns: Index(['Gasverbrauch'], dtype='object')
Processing df3 with columns: Index(['Stromverbrauch'], dtype='object')
Processing df4 with columns: Index(['Unnamed: 0', 'Basel Temperature [2 m elevation corrected]',
       'Basel Precipitation Total', 'Basel Wind Speed [10 m]',
       'Basel Wind Direction [10 m]'],
      dtype='object')
Processing df5 with columns: Index(['350n_sumPW', '350n_sumLief', '350n_sumLW', '350v_sumPW',
       '350v_sumLief', '350v_sumLW', '352n_sumPW', '352n_sumLief',
       '352

In [5]:
print(type(Verkehrsdaten))
Wetterdaten.drop(columns=["Unnamed: 0"], inplace=True)
Flugbewegungen.drop(columns=["Unnamed: 0"], inplace=True)
Gasverbrauch.sort_values(by=["Datum"], inplace=True)

<class 'pandas.core.frame.DataFrame'>


In [6]:

Flugbewegungen["Datum"].dtype

dtype('O')

In [7]:
# Merge all dataframes on the 'Datum' column
final_df = Parkhäuser.merge(Wetterdaten, on='Datum', how='inner') \
                      .merge(Verkehrsdaten, on='Datum', how='inner') \
                      .merge(Gasverbrauch, on='Datum', how='inner') \
                      .merge(Flugbewegungen, on='Datum', how='inner') \
                      .merge(Stromverbrauch, on='Datum', how='inner') \
                      .merge(Chrischona, on='Datum', how='inner') \
                      .merge(Feldbergstrasse, on='Datum', how='inner') \
                      .merge(Basel_Binningen, on='Datum', how='inner') 


final_df.head(20)

Unnamed: 0,Datum,Rebgassechange,Clarahuuschange,Citychange,Storchenchange,Post Baselchange,Aeschenchange,Bahnhof Südchange,Bad. Bahnhofchange,Europechange,...,O3 [ug/m3],NO2 [ug/m3],PM10 [ug/m3],PM2.5 [ug/m3],CPC [1/cm3],PREC [mm],RAD [W/m2],SO2 [ug/m3],NOX [ug/m3 eq. NO2],TEMP [C]
0,2022-01-01 00:00:00+00:00,-0.489808,-1.162325,-0.163786,-1.110112,-5.4e-05,-0.258093,-0.245491,-0.158272,-0.15127,...,-1.559811,0.366536,2.480706,3.674428,5.133522,-0.152131,-0.633392,0.358628,0.643267,-0.990377
1,2022-01-01 01:00:00+00:00,-0.489808,-0.193898,-0.043944,-0.525891,-0.191296,-0.000173,-0.00019,-0.000462,-0.000203,...,-1.639639,0.503186,5.359952,6.923201,6.721626,-0.152131,-0.633392,1.448165,1.005931,-1.015771
2,2022-01-01 02:00:00+00:00,-0.279837,-0.000213,-0.011259,-0.058515,-5.4e-05,-0.000173,-0.12284,-0.000462,-0.000203,...,-1.639639,0.357426,8.239199,10.615638,6.023355,-0.152131,-0.633392,1.512255,1.055954,-1.053861
3,2022-01-01 03:00:00+00:00,-0.209846,-0.000213,-0.011259,-0.350625,-5.4e-05,-0.000173,-0.00019,-0.000462,-0.000203,...,-1.626867,0.330097,5.405474,7.209436,4.554046,-0.152131,-0.633392,0.871351,0.98092,-1.053861
4,2022-01-01 04:00:00+00:00,-0.139856,-0.000213,0.021425,0.116751,-5.4e-05,-0.000173,0.245112,-0.000462,0.150864,...,-1.68115,0.721826,3.834976,5.119917,2.874267,-0.152131,-0.633392,0.807261,0.799588,-1.053861
5,2022-01-01 05:00:00+00:00,0.140106,-0.000213,0.925692,-0.233781,-5.4e-05,0.128787,-0.00019,0.052141,-0.000203,...,-1.591743,0.503186,1.695457,2.500863,1.220288,-0.152131,-0.633392,0.358628,0.343132,-1.015771
6,2022-01-01 06:00:00+00:00,-0.069865,0.193473,-0.022154,-0.116937,-5.4e-05,0.128787,-0.12284,-0.000462,-0.15127,...,-1.729047,0.503186,1.51337,2.343434,1.349734,-0.152131,-0.633392,0.422719,0.999678,-1.21892
7,2022-01-01 07:00:00+00:00,0.000125,-0.193898,0.054109,-0.058515,-5.4e-05,-0.000173,0.122461,0.052141,-0.15127,...,-1.703502,0.557846,0.841925,1.570598,0.817517,-0.152131,-0.633392,0.230448,0.680784,-1.206223
8,2022-01-01 08:00:00+00:00,0.000125,0.193473,0.141268,-9.3e-05,-5.4e-05,-0.000173,0.613065,-0.053065,-0.15127,...,-1.690729,0.557846,0.659838,1.398857,0.512238,-0.152131,-0.633392,0.166357,0.618256,-1.180829
9,2022-01-01 09:00:00+00:00,-0.069865,1.161899,0.173952,-0.175359,0.573671,-0.000173,0.613065,0.104744,-0.15127,...,-1.614094,0.375646,0.318426,0.883633,0.142987,-0.152131,-0.585557,-0.025914,0.418166,-1.206223


In [8]:
print(final_df.tail(25))

                           Datum  Rebgassechange  Clarahuuschange  Citychange  \
24549  2024-10-19 21:00:00+00:00       -0.979741        -0.000213   -0.632262   
24550  2024-10-19 22:00:00+00:00       -1.399684        -0.000213   -0.250945   
24551  2024-10-19 23:00:00+00:00       -1.119722        -0.968639   -0.109312   
24552  2024-10-20 00:00:00+00:00       -0.139856        -0.000213   -0.076628   
24553  2024-10-20 01:00:00+00:00        0.000125        -0.000213   -0.022154   
24554  2024-10-20 02:00:00+00:00        0.000125        -0.000213    0.010530   
24555  2024-10-20 03:00:00+00:00        0.000125        -0.000213   -0.000365   
24556  2024-10-20 04:00:00+00:00        0.070116         0.968214    0.860323   
24557  2024-10-20 05:00:00+00:00        0.000125        -0.000213   -0.022154   
24558  2024-10-20 06:00:00+00:00        0.140106        -0.000213    0.032320   
24559  2024-10-20 07:00:00+00:00        0.560049        -0.000213    0.250215   
24560  2024-10-20 08:00:00+0

In [9]:
test_dataFrame = [Parkhäuser,
                Wetterdaten,
                Flugbewegungen,
                Gasverbrauch,
                Stromverbrauch,
                Verkehrsdaten]

for i in test_dataFrame:
    print(i['Datum'].max())

2024-10-21 14:00:00+00:00
2024-11-22 23:00:00+00:00
2024-11-22 23:00:00+00:00
2024-11-24 04:00:00+00:00
2024-11-20 22:00:00+00:00
2024-10-20 21:00:00+00:00


In [10]:
final_df.to_csv("../04_WorkingDatasets/NormalData/Working_DataFrame.csv", index=False)

In [11]:

Stromverbrauch["Datum"].dtype
Wetterdaten["Datum"].dtype
Gasverbrauch["Datum"].dtype

Flugbewegungen["Datum"] = Flugbewegungen["Datum"].astype(str)
Flugbewegungen["Datum"].dtype

dtype('O')

In [12]:
Gasverbrauch.sort_values(by=["Datum"], inplace=True)
Gasverbrauch.tail()
Stromverbrauch.sort_values(by=["Datum"], inplace=True)
Stromverbrauch.tail()
Wetterdaten.sort_values(by=["Datum"], inplace=True)
Wetterdaten.tail()
Flugbewegungen.sort_values(by=["Datum"], inplace=True) 
Flugbewegungen.tail()

Unnamed: 0,Datum,Hour,Traffic
25363,2024-11-22 19:00:00+00:00,19,0.084474
25364,2024-11-22 20:00:00+00:00,20,0.39113
25365,2024-11-22 21:00:00+00:00,21,0.544459
25366,2024-11-22 22:00:00+00:00,22,0.084474
25367,2024-11-22 23:00:00+00:00,23,-0.528839


**Check if there are missing timestamps in the dataset**


In [13]:

df = final_df.copy()
# Ensure 'Datum' is a datetime object
df['Datum'] = pd.to_datetime(df['Datum'])

# Create a complete range of hourly timestamps from the first to the last point
full_range = pd.date_range(start=df['Datum'].min(), end=df['Datum'].max(), freq='h')

# Identify missing timestamps
missing_timestamps = full_range.difference(df['Datum'])

# Display the results
if missing_timestamps.empty:
    print("No missing hourly timestamps. All hours are accounted for.")
else:
    print("Missing hourly timestamps:")
    print(missing_timestamps)

No missing hourly timestamps. All hours are accounted for.


In [14]:
# Check for 0 values, NaN values, and '/N' values in all columns
zero_values = (df == 0).sum()
nan_values = df.isna().sum()
slash_n_values = (df == '/N').sum()

# Print the results
print("Zero values in each column:")
print(zero_values)
print("\nNaN values in each column:")
print(nan_values)
print("\n'/N' values in each column:")
print(slash_n_values)

Zero values in each column:
Datum                  0
Rebgassechange         0
Clarahuuschange        0
Citychange             0
Storchenchange         0
                      ..
PREC [mm]              0
RAD [W/m2]             0
SO2 [ug/m3]            0
NOX [ug/m3 eq. NO2]    0
TEMP [C]               0
Length: 165, dtype: int64

NaN values in each column:
Datum                  0
Rebgassechange         0
Clarahuuschange        0
Citychange             0
Storchenchange         0
                      ..
PREC [mm]              0
RAD [W/m2]             0
SO2 [ug/m3]            0
NOX [ug/m3 eq. NO2]    0
TEMP [C]               0
Length: 165, dtype: int64

'/N' values in each column:
Datum                  0
Rebgassechange         0
Clarahuuschange        0
Citychange             0
Storchenchange         0
                      ..
PREC [mm]              0
RAD [W/m2]             0
SO2 [ug/m3]            0
NOX [ug/m3 eq. NO2]    0
TEMP [C]               0
Length: 165, dtype: int64


# PART 2 Feature Combination

In [15]:
features = final_df
target = pd.read_csv("../02_Cleaned/Target/11Target Additive_Combined_Resid_Trend_Target.csv")

In [16]:
target.to_csv("../04_WorkingDatasets/NormalData/Target_Additive.csv", index=False)

In [17]:
# Define all features based on the columns of the features DataFrame
all_features = features.columns.tolist()

# Create a list to store the new features and their correlations
new_feature_correlations = []

# Create a set to keep track of added features
added_features = set()

# Dictionary to store new features
new_features_dict = {}

# Iterate over each pair of features to create new features
for i in range(len(all_features)):
    for j in range(i, len(all_features)):
        feature1 = all_features[i]
        feature2 = all_features[j]
        
        # Skip the 'Datum' column
        if feature1 == 'Datum' or feature2 == 'Datum':
            continue
        
        # Create new features by multiplying and calculating growth
        new_feature_name = f'{feature1}_x_{feature2}'
        new_feature_growth_name = f'{feature1}_growth'
        
        # Multiply features
        new_features_dict[new_feature_name] = features[feature1] * features[feature2]
        
        # Calculate growth (difference to last period)
        new_features_dict[new_feature_growth_name] = features[feature1].diff()

        # Calculate growth (difference to last period) of the multiplied features
        new_feature_mult_growth_name = f'{new_feature_name}_growth'
        new_features_dict[new_feature_mult_growth_name] = new_features_dict[new_feature_name].diff()
        
        # Calculate correlations with the target variable
        correlation_mult = new_features_dict[new_feature_name].corr(target['PM10_Combined_Trend_Residual'])
        correlation_growth = new_features_dict[new_feature_growth_name].corr(target['PM10_Combined_Trend_Residual'])
         
        # Store the correlations if they are high and not already added
        if abs(correlation_mult) > 0.1 and new_feature_name not in added_features:
            new_feature_correlations.append((new_feature_name, correlation_mult))
            added_features.add(new_feature_name)
        if abs(correlation_growth) > 0.1 and new_feature_growth_name not in added_features:
            new_feature_correlations.append((new_feature_growth_name, correlation_growth))
            added_features.add(new_feature_growth_name)

# Concatenate new features to the original DataFrame
features = pd.concat([features, pd.DataFrame(new_features_dict)], axis=1)

# Print the new features with high correlations
for feature, corr in new_feature_correlations:
    print(f'Feature: {feature}, Correlation: {corr}')


Feature: Basel Temperature [2 m elevation corrected]_x_Basel Temperature [2 m elevation corrected], Correlation: 0.2958803126640731
Feature: Basel Temperature [2 m elevation corrected]_x_Basel Wind Speed [10 m], Correlation: 0.16492811352292927
Feature: Basel Temperature [2 m elevation corrected]_x_660n_sumPW, Correlation: 0.11947271525695306
Feature: Basel Temperature [2 m elevation corrected]_x_Gasverbrauch, Correlation: -0.33163000210476223
Feature: Basel Temperature [2 m elevation corrected]_x_Traffic, Correlation: 0.1068350172287397
Feature: Basel Temperature [2 m elevation corrected]_x_o3_stundenmittelwerte_ug_m3, Correlation: 0.2709104988723351
Feature: Basel Temperature [2 m elevation corrected]_x_pm10_stundenmittelwerte_ug_m3, Correlation: -0.2476758616164491
Feature: Basel Temperature [2 m elevation corrected]_x_pm2_5_stundenmittelwerte_ug_m3, Correlation: -0.3062123098646689
Feature: Basel Temperature [2 m elevation corrected]_x_O3 [ug/m3], Correlation: 0.26460780284818264
F

**Part 3** Selecting the 50 most important features for Model stability

In [18]:
#reimport the data to solve dependencies
working_dataframe = pd.read_csv('../04_WorkingDatasets/NormalData/Working_DataFrame.csv')
target_dataframe = pd.read_csv('../04_WorkingDatasets/NormalData/Target_Additive.csv')

In [19]:
from sklearn.feature_selection import SelectKBest, f_regression

# Select only numeric columns from working_dataframe
working_dataframe_numeric = working_dataframe.select_dtypes(include=[float, int])

# Add 'Datum' column back if it exists in the original dataframe
if 'Datum' in working_dataframe.columns:
    working_dataframe_numeric['Datum'] = working_dataframe['Datum']

# Define the feature columns and target column
feature_columns = working_dataframe_numeric.columns.tolist()
if 'Datum' in feature_columns:
    feature_columns.remove('Datum')
target_column = 'PM10_Combined_Trend_Residual'

# Align the indices of working_dataframe_numeric and target_dataframe
aligned_data = working_dataframe_numeric.join(target_dataframe[target_column], how='inner')

# Prepare the data
X = aligned_data[feature_columns]
y = aligned_data[target_column]

# Initialize SelectKBest with the scoring function and the desired number of features
selector = SelectKBest(score_func=f_regression, k=50)

# Fit the selector
selector.fit(X, y)

# Get the selected features
selected_features = [feature for feature, support in zip(feature_columns, selector.get_support()) if support]

# Create a new dataframe with the selected features
if 'Datum' in working_dataframe.columns:
    reduced_working_dataframe = working_dataframe_numeric[selected_features + ['Datum']]
else:
    reduced_working_dataframe = working_dataframe_numeric[selected_features]

print("Selected features:", selected_features)
print("Reduced dataframe shape:", reduced_working_dataframe.shape)
print("Selected features as list:", selected_features)
for feature in selected_features:
    print(feature)


Selected features: ['Basel Temperature [2 m elevation corrected]', 'Basel Precipitation Total', 'Basel Wind Speed [10 m]', 'Basel Wind Direction [10 m]', '350v_sumLW', '352v_sumPW', '352v_sumLief', '402v_sumPW', '402n_sumLief', '402n_sumLW', '403v_sumPW', '403v_sumLW', '405v_sumLief', '405v_sumLW', '406n_sumLW', '406v_sumLief', '406v_sumLW', '408n_sumPW', '408n_sumLief', '408n_sumLW', '415v_sumLW', '417n_sumPW', '417n_sumLief', '417n_sumLW', '419v_sumLW', '419n_sumPW', '419n_sumLief', '420n_sumLW', '653n_sumLW', '659v_sumLW', '659n_sumLW', '660v_sumPW', '660n_sumPW', '84111104n_sumLief', '84111104v_sumLief', 'Gasverbrauch', 'Traffic', 'Stromverbrauch', 'pm10_stundenmittelwerte_ug_m3', 'pm2_5_stundenmittelwerte_ug_m3', 'no2_stundenmittelwerte_ug_m3', 'O3 [ug/m3]', 'NO2 [ug/m3]', 'PM10 [ug/m3]', 'PM2.5 [ug/m3]', 'CPC [1/cm3]', 'PREC [mm]', 'SO2 [ug/m3]', 'NOX [ug/m3 eq. NO2]', 'TEMP [C]']
Reduced dataframe shape: (24574, 51)
Selected features as list: ['Basel Temperature [2 m elevation c

In [20]:
# Reorder columns to make 'Datum' the first column
columns = ['Datum'] + [col for col in reduced_working_dataframe.columns if col != 'Datum']
reduced_working_dataframe = reduced_working_dataframe[columns]

# Export the reduced working dataframe with the most relevant features to a CSV file
reduced_working_dataframe.to_csv('../04_WorkingDatasets/Top50Data/50MostImpFeatures_DF.csv', index=False)




# Align the target dataframe with the reduced working dataframe
aligned_target_dataframe = target.loc[reduced_working_dataframe.index]

# Export the aligned target dataframe to a CSV file
aligned_target_dataframe.to_csv('../04_WorkingDatasets/Top50Data/TargetCutto50MostImpFeatures_DF.csv', index=False)


# Getting also the top ten features

In [21]:
from sklearn.feature_selection import SelectKBest, f_regression

# Select only numeric columns from working_dataframe
working_dataframe_numeric = working_dataframe.select_dtypes(include=[float, int])

# Add 'Datum' column back if it exists in the original dataframe
if 'Datum' in working_dataframe.columns:
    working_dataframe_numeric['Datum'] = working_dataframe['Datum']

# Define the feature columns and target column
feature_columns = working_dataframe_numeric.columns.tolist()
if 'Datum' in feature_columns:
    feature_columns.remove('Datum')
target_column = 'PM10_Combined_Trend_Residual'

# Align the indices of working_dataframe_numeric and target_dataframe
aligned_data = working_dataframe_numeric.join(target_dataframe[target_column], how='inner')

# Prepare the data
X = aligned_data[feature_columns]
y = aligned_data[target_column]

# Initialize SelectKBest with the scoring function and the desired number of features
selector = SelectKBest(score_func=f_regression, k=50)

# Fit the selector
selector.fit(X, y)

# Get the scores and selected features
scores = selector.scores_
selected_features = [feature for feature, support in zip(feature_columns, selector.get_support()) if support]
selected_scores = [score for score, support in zip(scores, selector.get_support()) if support]

# Sort the selected features by their scores in descending order
sorted_features_scores = sorted(zip(selected_features, selected_scores), key=lambda x: x[1], reverse=True)
sorted_features = [feature for feature, score in sorted_features_scores]

# Create a new dataframe with the selected features
if 'Datum' in working_dataframe.columns:
    reduced_working_dataframe = working_dataframe_numeric[sorted_features + ['Datum']]
else:
    reduced_working_dataframe = working_dataframe_numeric[sorted_features]

print("Selected features (ordered by score):", sorted_features)
print("Reduced dataframe shape:", reduced_working_dataframe.shape)
print("Selected features as list (ordered by score):", sorted_features)
for feature in sorted_features:
    print(feature)

    # Add the score to each feature
    features_with_scores = {feature: score for feature, score in sorted_features_scores}

    print("Selected features with scores (ordered by score):")
    for feature, score in features_with_scores.items():
        print(f"{feature}: {score}")
# Reorder columns to make 'Datum' the first column
columns = ['Datum'] + [col for col in reduced_working_dataframe.columns if col != 'Datum']
reduced_working_dataframe = reduced_working_dataframe[columns]

# Export the reduced working dataframe with the most relevant features to a CSV file
#reduced_working_dataframe.to_csv('50MostImpFeatures_DF.csv', index=False)              -> war oben schonmal ????????????????????
# Align the target dataframe with the reduced working dataframe
aligned_target_dataframe = target_dataframe.loc[reduced_working_dataframe.index]

# Export the aligned target dataframe to a CSV file
#aligned_target_dataframe.to_csv('TargetCutto50MostImpFeatures_DF.csv', index=False)    -> war oben schonmal ????????????????????
top_10 = working_dataframe.copy()
top_10_features = sorted_features[:10]
print("Top 10 features:", top_10_features)
top_10_good = top_10.iloc[:, top_10.columns.isin(top_10_features)].copy()
top_10_good.head()

Selected features (ordered by score): ['pm10_stundenmittelwerte_ug_m3', 'pm2_5_stundenmittelwerte_ug_m3', 'PM10 [ug/m3]', 'PM2.5 [ug/m3]', 'NO2 [ug/m3]', 'NOX [ug/m3 eq. NO2]', 'no2_stundenmittelwerte_ug_m3', 'CPC [1/cm3]', 'Basel Wind Direction [10 m]', 'Basel Wind Speed [10 m]', 'Gasverbrauch', 'Basel Precipitation Total', 'SO2 [ug/m3]', 'O3 [ug/m3]', 'TEMP [C]', 'Basel Temperature [2 m elevation corrected]', '408n_sumPW', 'PREC [mm]', '84111104n_sumLief', '408n_sumLief', '84111104v_sumLief', 'Stromverbrauch', '417n_sumPW', '408n_sumLW', '402v_sumPW', '403v_sumLW', '405v_sumLW', '402n_sumLW', '403v_sumPW', '406v_sumLW', '660n_sumPW', '660v_sumPW', '350v_sumLW', '417n_sumLW', '352v_sumPW', '415v_sumLW', '417n_sumLief', '419n_sumLief', '352v_sumLief', '420n_sumLW', '402n_sumLief', 'Traffic', '659n_sumLW', '406v_sumLief', '653n_sumLW', '659v_sumLW', '419n_sumPW', '406n_sumLW', '405v_sumLief', '419v_sumLW']
Reduced dataframe shape: (24574, 51)
Selected features as list (ordered by score)

Unnamed: 0,Basel Wind Speed [10 m],Basel Wind Direction [10 m],pm10_stundenmittelwerte_ug_m3,pm2_5_stundenmittelwerte_ug_m3,no2_stundenmittelwerte_ug_m3,NO2 [ug/m3],PM10 [ug/m3],PM2.5 [ug/m3],CPC [1/cm3],NOX [ug/m3 eq. NO2]
0,-0.694066,0.115891,6.918893,6.451691,-0.270897,0.366536,2.480706,3.674428,5.133522,0.643267
1,-0.918024,0.047361,5.648933,5.400365,-0.332234,0.503186,5.359952,6.923201,6.721626,1.005931
2,-0.857675,-0.143112,4.821976,4.679175,-0.372557,0.357426,8.239199,10.615638,6.023355,1.055954
3,-0.831118,-0.006355,3.171357,3.266025,-0.37397,0.330097,5.405474,7.209436,4.554046,0.98092
4,-0.68775,-0.148316,2.687692,2.827438,-0.383682,0.721826,3.834976,5.119917,2.874267,0.799588


In [22]:
MostImp = reduced_working_dataframe

In [23]:

# Example DataFrame (replace with your actual DataFrame)
df = features
dfa = pd.DataFrame()
# Define feature pairs and their names
feature_pairs = [
    ("Basel Temperature [2 m elevation corrected]", "Basel Temperature [2 m elevation corrected]"),
    ("Basel Temperature [2 m elevation corrected]", "Basel Wind Speed [10 m]"),
    ("Basel Temperature [2 m elevation corrected]", "660n_sumPW"),
    ("Basel Temperature [2 m elevation corrected]", "Gasverbrauch"),
    ("Basel Temperature [2 m elevation corrected]", "Traffic"),
    ("Basel Precipitation Total", "Hour"),
    ("Basel Wind Speed [10 m]", "Basel Wind Speed [10 m]"),
    ("Basel Wind Speed [10 m]", "Basel Wind Direction [10 m]"),
    ("Basel Wind Speed [10 m]", "Gasverbrauch"),
    ("Basel Wind Speed [10 m]", "Hour"),
    ("Basel Wind Direction [10 m]", "Basel Wind Direction [10 m]"),
    ("Basel Wind Direction [10 m]", "Hour"),
    ("406v_sumLW", "408n_sumLief"),
    ("660n_sumPW", "Gasverbrauch"),
    ("Gasverbrauch", "Gasverbrauch"),
    ("Gasverbrauch", "Hour"),
]

# Loop through the pairs, calculate the product, and add to the DataFrame
for feature1, feature2 in feature_pairs:
    new_feature_name = f"{feature1}_x_{feature2}"
    dfa[new_feature_name] = df[feature1] * df[feature2]

# Save or view the updated DataFrame
# df.to_csv('updated_data.csv', index=False)
print(dfa.head())

print(f'Size of dfa: {dfa.shape}')


   Basel Temperature [2 m elevation corrected]_x_Basel Temperature [2 m elevation corrected]  \
0                                           0.858340                                           
1                                           1.085695                                           
2                                           1.351255                                           
3                                           1.288480                                           
4                                           1.354144                                           

   Basel Temperature [2 m elevation corrected]_x_Basel Wind Speed [10 m]  \
0                                           0.643029                       
1                                           0.956551                       
2                                           0.996992                       
3                                           0.943412                       
4                                          

In [24]:

# Concatenate dfa to MostImp
MostImp_combined = pd.concat([MostImp, dfa], axis=1)

# Standardize the features
scaler = StandardScaler()
MostImp_combined_scaled = pd.DataFrame(scaler.fit_transform(MostImp_combined.iloc[:, 1:]), columns=MostImp_combined.columns[1:])
MostImp_combined_scaled.insert(0, 'Datum', MostImp_combined['Datum'])

# Export the combined DataFrame to a CSV file
MostImp_combined_scaled.to_csv('../04_WorkingDatasets/Top50Combined/50MostImp+CombinedFeatures.csv', index=False)

# Creating Lagged Values from the Hours we know

In [25]:
import pandas as pd
features = pd.read_csv('../04_WorkingDatasets/Top50Combined/50MostImp+CombinedFeatures.csv')
AllFeatures = pd.read_csv('../04_WorkingDatasets/NormalData/Working_DataFrame.csv')
target = pd.read_csv('../04_WorkingDatasets/NormalData/Target_Additive.csv')

In [26]:
# Function to replace outliers using IQR with moving average
def replace_outliers_with_moving_avg(df, column, window=3):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Calculate the moving average
    df['moving_avg'] = df[column].rolling(window=window, center=True).mean()
    
    # Replace outliers with the moving average
    df[column] = df.apply(lambda row: row['moving_avg'] if row[column] < lower_bound or row[column] > upper_bound else row[column], axis=1)
    
    # Drop the temporary moving average column
    df.drop(columns=['moving_avg'], inplace=True)
    
    return df

# Replace outliers in the target dataset
target = replace_outliers_with_moving_avg(target, 'PM10_Combined_Trend_Residual')

# Ensure the features and target datasets are aligned
features = features[features['Datum'].isin(target['Datum'])]
target = target[target['Datum'].isin(features['Datum'])]
AllFeatures = AllFeatures[AllFeatures['Datum'].isin(target['Datum'])]

print(target.head())


                       Datum  PM10_Combined_Trend_Residual
0  2022-01-01 00:00:00+00:00                           NaN
1  2022-01-01 01:00:00+00:00                     53.126838
2  2022-01-01 02:00:00+00:00                     36.328107
3  2022-01-01 03:00:00+00:00                     24.801767
4  2022-01-01 04:00:00+00:00                      9.683660


In [27]:
print(features.columns)
print(AllFeatures.columns)

Index(['Datum', 'pm10_stundenmittelwerte_ug_m3',
       'pm2_5_stundenmittelwerte_ug_m3', 'PM10 [ug/m3]', 'PM2.5 [ug/m3]',
       'NO2 [ug/m3]', 'NOX [ug/m3 eq. NO2]', 'no2_stundenmittelwerte_ug_m3',
       'CPC [1/cm3]', 'Basel Wind Direction [10 m]', 'Basel Wind Speed [10 m]',
       'Gasverbrauch', 'Basel Precipitation Total', 'SO2 [ug/m3]',
       'O3 [ug/m3]', 'TEMP [C]', 'Basel Temperature [2 m elevation corrected]',
       '408n_sumPW', 'PREC [mm]', '84111104n_sumLief', '408n_sumLief',
       '84111104v_sumLief', 'Stromverbrauch', '417n_sumPW', '408n_sumLW',
       '402v_sumPW', '403v_sumLW', '405v_sumLW', '402n_sumLW', '403v_sumPW',
       '406v_sumLW', '660n_sumPW', '660v_sumPW', '350v_sumLW', '417n_sumLW',
       '352v_sumPW', '415v_sumLW', '417n_sumLief', '419n_sumLief',
       '352v_sumLief', '420n_sumLW', '402n_sumLief', 'Traffic', '659n_sumLW',
       '406v_sumLief', '653n_sumLW', '659v_sumLW', '419n_sumPW', '406n_sumLW',
       '405v_sumLief', '419v_sumLW',
       'Basel

In [28]:
AllFeatures.head()

AllFeaturescopy = AllFeatures.copy()

In [29]:
from sklearn.preprocessing import StandardScaler
from traitlets import All

# Create lagged target variables
features['PM10_1h_lag'] = target['PM10_Combined_Trend_Residual'].shift(1)
features['PM10_2h_lag'] = target['PM10_Combined_Trend_Residual'].shift(2)
features['PM10_24h_lag'] = target['PM10_Combined_Trend_Residual'].shift(24)
AllFeatures['PM10_1h_lag'] = target['PM10_Combined_Trend_Residual'].shift(1)
AllFeatures['PM10_2h_lag'] = target['PM10_Combined_Trend_Residual'].shift(2)
AllFeatures['PM10_24h_lag'] = target['PM10_Combined_Trend_Residual'].shift(24)

# Drop rows with NaN values
features.dropna(inplace=True)
AllFeatures.dropna(inplace=True)
target = target[target['Datum'].isin(features['Datum'])]
target = target[target['Datum'].isin(AllFeatures['Datum'])]

features.drop(columns=['Datum'], inplace=True)
AllFeatures.drop(columns=['Datum'], inplace=True)

# Standardize the features dataset
scaler = StandardScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
AllFeatures_scaled = pd.DataFrame(scaler.fit_transform(AllFeatures), columns=AllFeatures.columns)

# Ensure there are no NaN values in the target dataset
target = target.dropna()

features_scaled.insert(0, 'Datum', target['Datum'])
AllFeatures_scaled.insert(0, 'Datum', AllFeaturescopy['Datum'])



print(features_scaled.columns)
print(AllFeatures_scaled.columns)

Index(['Datum', 'pm10_stundenmittelwerte_ug_m3',
       'pm2_5_stundenmittelwerte_ug_m3', 'PM10 [ug/m3]', 'PM2.5 [ug/m3]',
       'NO2 [ug/m3]', 'NOX [ug/m3 eq. NO2]', 'no2_stundenmittelwerte_ug_m3',
       'CPC [1/cm3]', 'Basel Wind Direction [10 m]', 'Basel Wind Speed [10 m]',
       'Gasverbrauch', 'Basel Precipitation Total', 'SO2 [ug/m3]',
       'O3 [ug/m3]', 'TEMP [C]', 'Basel Temperature [2 m elevation corrected]',
       '408n_sumPW', 'PREC [mm]', '84111104n_sumLief', '408n_sumLief',
       '84111104v_sumLief', 'Stromverbrauch', '417n_sumPW', '408n_sumLW',
       '402v_sumPW', '403v_sumLW', '405v_sumLW', '402n_sumLW', '403v_sumPW',
       '406v_sumLW', '660n_sumPW', '660v_sumPW', '350v_sumLW', '417n_sumLW',
       '352v_sumPW', '415v_sumLW', '417n_sumLief', '419n_sumLief',
       '352v_sumLief', '420n_sumLW', '402n_sumLief', 'Traffic', '659n_sumLW',
       '406v_sumLief', '653n_sumLW', '659v_sumLW', '419n_sumPW', '406n_sumLW',
       '405v_sumLief', '419v_sumLW',
       'Basel

In [30]:
AllFeatures_scaled.head()

Unnamed: 0,Datum,Rebgassechange,Clarahuuschange,Citychange,Storchenchange,Post Baselchange,Aeschenchange,Bahnhof Südchange,Bad. Bahnhofchange,Europechange,...,PM2.5 [ug/m3],CPC [1/cm3],PREC [mm],RAD [W/m2],SO2 [ug/m3],NOX [ug/m3 eq. NO2],TEMP [C],PM10_1h_lag,PM10_2h_lag,PM10_24h_lag
0,2022-01-01 00:00:00+00:00,-0.069723,-3.9e-05,-0.010908,-0.175172,8e-06,5e-06,-0.24517,-0.052815,-0.000129,...,1.069597,0.510222,-0.152269,-0.639512,0.295487,1.099745,-1.039634,0.096166,0.010001,5.637085
1,2022-01-01 01:00:00+00:00,0.000222,-3.9e-05,-1.8e-05,1.7e-05,8e-06,5e-06,4e-05,-0.000251,-0.000129,...,0.82605,0.216428,-0.152269,-0.639512,0.295487,0.83695,-1.052245,0.017932,0.096179,3.190429
2,2022-01-01 02:00:00+00:00,0.000222,-3.9e-05,-1.8e-05,-0.05838,8e-06,5e-06,0.490459,0.104878,-0.000129,...,0.754418,0.090215,-0.152269,-0.639512,0.168683,0.774379,-1.1153,-0.185601,0.017946,1.511672
3,2022-01-01 03:00:00+00:00,0.000222,-3.9e-05,0.010872,0.058413,8e-06,5e-06,0.24525,0.104878,0.150828,...,0.725766,0.447317,-0.152269,-0.639512,0.422291,0.993376,-1.216189,-0.296032,-0.185588,-0.690209
4,2022-01-01 04:00:00+00:00,0.140113,-3.9e-05,0.903873,-0.05838,0.382243,5e-06,4e-05,-0.157943,0.150828,...,0.697113,0.31655,-0.152269,-0.639512,0.295487,1.093488,-1.291856,-0.490289,-0.296019,-1.257621


In [31]:
target_mean = target['PM10_Combined_Trend_Residual'].mean()
print(f"The mean of the target variable is: {target_mean}")

The mean of the target variable is: 14.4243556847222


In [32]:
features_scaled.to_csv('../04_WorkingDatasets/Top50CombLagged/50CombLagged.csv', index=False)
AllFeatures_scaled.to_csv('../04_WorkingDatasets/NormalDataLaggedTarget/WDFTargetLagged.csv', index=False)
target.to_csv('../04_WorkingDatasets/Top50CombLagged/TargetOutliersTreated.csv', index=False)
target.to_csv('../04_WorkingDatasets/NormalDataLaggedTarget/TargetOutliersTreated.csv', index=False)

# Add the combined Features to the Normal DataFrames

In [33]:
NormalData_combined = pd.concat([final_df, dfa], axis=1)

scaler = StandardScaler()
MostImp_combined_scaled = pd.DataFrame(scaler.fit_transform(NormalData_combined.iloc[:, 1:]), columns=NormalData_combined.columns[1:])
MostImp_combined_scaled.insert(0, 'Datum', NormalData_combined['Datum'])

# Export the combined DataFrame to a CSV file
MostImp_combined_scaled.to_csv('../04_WorkingDatasets/01 NormalComb/NormalCombined.csv', index=False)


target_dataframe = target_dataframe[target_dataframe["Datum"] <= "2024-10-20 21:00:00+00:00"]
target_dataframe.to_csv("../04_WorkingDatasets/01 NormalComb/Target_Additive.csv", index=False)

#sonstige lösung 
# Konvertiere die Datumsspalte in einen datetime-Index (falls nicht bereits)
#target_data['Datum'] = pd.to_datetime(target_data['Datum'])
#features_data['Datum'] = pd.to_datetime(features_data['Datum'])

# Definiere das Cut-Off-Datum
#cutoff_date = pd.Timestamp('2024-10-20 21:00:00+00:00')

# Filtere das Dataset auf Einträge bis einschließlich des Cut-Off-Datums
#target_data_cutted = target_data[target_data['Datum'] <= cutoff_date]
#features_data_cutted = features_data[features_data['Datum'] <= cutoff_date]


In [34]:
NormalData_combined_Lagged = pd.concat([AllFeatures_scaled, dfa], axis=1)

scaler = StandardScaler()
MostImp_combined_scaled = pd.DataFrame(scaler.fit_transform(NormalData_combined_Lagged.iloc[:, 1:]), columns=NormalData_combined_Lagged.columns[1:])
MostImp_combined_scaled.insert(0, 'Datum', NormalData_combined_Lagged['Datum'])

# Export the combined DataFrame to a CSV file
MostImp_combined_scaled.to_csv('../04_WorkingDatasets/02 NormalCombLagged/WDFCombLagged.csv', index=False)
target.to_csv('../04_WorkingDatasets/02 NormalCombLagged/TargetOutliersTreated.csv', index=False)
