In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
import plotly.graph_objects as go

data_model = pd.read_csv("../Data for modelling/data_model.csv")
data_model.head()
#data_model["LC_TEMP_QCL3"]


Unnamed: 0,month,day,hour,description,lamax,laeq,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,...,weekday,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,1,1,0,MP 03: Naamsestraat 62 Taste,60.322528,57.126833,3e-06,0.00036,-18.197324,0.389565,...,6,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,1,1,0,MP 05: Calvariekapel KU Leuven,53.230972,49.987639,3e-06,0.00036,-18.197324,0.389565,...,6,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,1,1,0,MP 06: Parkstraat 2 La Filosovia,53.666056,50.752,3e-06,0.00036,-18.197324,0.389565,...,6,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,1,1,0,MP 07: Naamsestraat 81,50.056861,47.440222,3e-06,0.00036,-18.197324,0.389565,...,6,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,1,1,1,MP 03: Naamsestraat 62 Taste,53.033583,50.853806,7e-06,0.0,-16.227891,0.222602,...,6,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


<h4>Print the data</h4>

In [36]:
#see the impact on the coordinate coefficient if vrijhof is removed as this is a kind of outlier
#having a lower noise than the rest
print(len(data_model))
# Drop rows where "description" is "vrijthof"
#data_model = data_model.drop(data_model[data_model["description"] == "MP08bis - Vrijthof"].index)
print(data_model.head())
print(len(data_model))

55469
   month  day  hour                       description      lamax       laeq   
0      1    1     0      MP 03: Naamsestraat 62 Taste  60.322528  57.126833  \
1      1    1     0    MP 05: Calvariekapel KU Leuven  53.230972  49.987639   
2      1    1     0  MP 06: Parkstraat 2 La Filosovia  53.666056  50.752000   
3      1    1     0            MP 07: Naamsestraat 81  50.056861  47.440222   
4      1    1     1      MP 03: Naamsestraat 62 Taste  53.033583  50.853806   

   LC_RAININ  LC_DAILYRAIN  LC_WINDDIR  LC_WINDSPEED  ...  weekday   
0   0.000003       0.00036  -18.197324      0.389565  ...        6  \
1   0.000003       0.00036  -18.197324      0.389565  ...        6   
2   0.000003       0.00036  -18.197324      0.389565  ...        6   
3   0.000003       0.00036  -18.197324      0.389565  ...        6   
4   0.000007       0.00000  -16.227891      0.222602  ...        6   

   coordinate  sin_month  cos_month   sin_day  cos_day  sin_hour  cos_hour   
0    0.414698       

<h2>Drop variables that are not needed anymore</h2>

In [37]:
# List of column names to drop
columns_to_drop = ['month', 'day','hour','description','lamax','LC_RAININ','weekday']

# Drop the specified columns
data_model = data_model.drop(columns=columns_to_drop)
data_model.head()

Unnamed: 0,laeq,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_TEMP_QCL3,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


<h2>Rename some variables</h2>

In [38]:
# Dictionary mapping old column names to new column names
column_mapping = {'LC_DAILYRAIN': 'rain',
                  'LC_WINDDIR': 'winddir',
                  'LC_WINDSPEED': 'windspeed',
                  'LC_TEMP_QCL3': 'temperature',}

# Rename the columns
data_model = data_model.rename(columns=column_mapping)
data_model.head()

Unnamed: 0,laeq,rain,winddir,windspeed,temperature,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


<h1>First training with all the data</h1>

In [39]:
#print(data_model.head())

# Initialize StandardScaler
scaler = StandardScaler()

# Fit StandardScaler to the DataFrame
#scaler.fit(data_model)

# Transform the DataFrame to standardized values
data_model_std= scaler.fit_transform(data_model)

# Convert standardized values back to DataFrame with column names
data_model_std = pd.DataFrame(data_model_std, columns=data_model.columns)

print(data_model_std)

           laeq      rain   winddir  windspeed  temperature  coordinate   
0      1.044063 -0.279749 -0.657413   0.363040     0.035393    0.142791  \
1     -0.103638 -0.279749 -0.657413   0.363040     0.035393   -0.731170   
2      0.019241 -0.279749 -0.657413   0.363040     0.035393   -0.965436   
3     -0.513162 -0.279749 -0.657413   0.363040     0.035393   -1.131256   
4      0.035608 -0.385076 -0.561456  -0.168306    -0.024273    0.142791   
...         ...       ...       ...        ...          ...         ...   
55464  1.344533  0.251926 -1.192926   3.787333     0.404940   -0.276136   
55465  0.970938  0.251926 -1.192926   3.787333     0.404940   -0.731170   
55466  0.761105  0.251926 -1.192926   3.787333     0.404940   -0.965436   
55467  0.398874  0.251926 -1.192926   3.787333     0.404940   -1.131256   
55468  1.222563  0.251926 -1.192926   3.787333     0.404940    1.940973   

       sin_month  cos_month   sin_day   cos_day  sin_hour  cos_hour   
0        0.79606   1.281637 

<h2>Split in X and Y for the complete dataset</h2>

In [40]:
# Extract the feature columns (X_train)
X_data = data_model_std.drop("laeq", axis=1)

# Extract the target variable column (Y_train)
Y_data = data_model_std["laeq"]

print(X_data)
print("training data")
print(Y_data.head())

           rain   winddir  windspeed  temperature  coordinate  sin_month   
0     -0.279749 -0.657413   0.363040     0.035393    0.142791    0.79606  \
1     -0.279749 -0.657413   0.363040     0.035393   -0.731170    0.79606   
2     -0.279749 -0.657413   0.363040     0.035393   -0.965436    0.79606   
3     -0.279749 -0.657413   0.363040     0.035393   -1.131256    0.79606   
4     -0.385076 -0.561456  -0.168306    -0.024273    0.142791    0.79606   
...         ...       ...        ...          ...         ...        ...   
55464  0.251926 -1.192926   3.787333     0.404940   -0.276136    0.07345   
55465  0.251926 -1.192926   3.787333     0.404940   -0.731170    0.07345   
55466  0.251926 -1.192926   3.787333     0.404940   -0.965436    0.07345   
55467  0.251926 -1.192926   3.787333     0.404940   -1.131256    0.07345   
55468  0.251926 -1.192926   3.787333     0.404940    1.940973    0.07345   

       cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday   
0       1.2816

<h2>Training with all the data</h2>

In [42]:
# Initialize ARDRegression
ard_reg = ARDRegression()

# Fit the model to the training data
ard_reg.fit(X_data, Y_data)

# Make predictions on new data (X_test)
#Y_pred = ard_reg.predict(X_test)

# Map coefficients to column names
coefficients = pd.Series(ard_reg.coef_, index=X_data.columns)

# Print the coefficients and corresponding column names
for feature, coefficient in coefficients.items():
    print(f"Feature: {feature}, Coefficient: {round(coefficient,2)}")

Feature: rain, Coefficient: 0.06
Feature: winddir, Coefficient: 0.05
Feature: windspeed, Coefficient: 0.05
Feature: temperature, Coefficient: -0.01
Feature: coordinate, Coefficient: -0.15
Feature: sin_month, Coefficient: 0.02
Feature: cos_month, Coefficient: 0.02
Feature: sin_day, Coefficient: -0.02
Feature: cos_day, Coefficient: 0.0
Feature: sin_hour, Coefficient: -0.37
Feature: cos_hour, Coefficient: -0.41
Feature: sin_weekday, Coefficient: -0.04
Feature: cos_weekday, Coefficient: -0.19


<h1>Work correctly with training and test data</h1>

<h3>Split the data and normalize the training data</h3>

In [45]:
#see how good it is for predictions

# Extract the feature columns (X_train)
X_data_true = data_model.drop("laeq", axis=1)

# Extract the target variable column (Y_train)
Y_data_true = data_model["laeq"]

print(X_data_true)
print("training data")
print(Y_data_true.head())


           rain    winddir  windspeed  temperature  coordinate     sin_month   
0      0.000360 -18.197324   0.389565    13.100358    0.414698  5.000000e-01  \
1      0.000360 -18.197324   0.389565    13.100358    0.130227  5.000000e-01   
2      0.000360 -18.197324   0.389565    13.100358    0.053974  5.000000e-01   
3      0.000360 -18.197324   0.389565    13.100358    0.000000  5.000000e-01   
4      0.000000 -16.227891   0.222602    12.669197    0.414698  5.000000e-01   
...         ...        ...        ...          ...         ...           ...   
55464  0.002174 -29.188272   1.465571    15.770757    0.278339 -2.449294e-16   
55465  0.002174 -29.188272   1.465571    15.770757    0.130227 -2.449294e-16   
55466  0.002174 -29.188272   1.465571    15.770757    0.053974 -2.449294e-16   
55467  0.002174 -29.188272   1.465571    15.770757    0.000000 -2.449294e-16   
55468  0.002174 -29.188272   1.465571    15.770757    1.000000 -2.449294e-16   

       cos_month       sin_day  cos_day

<h2>Split the data randomly</h2>

In [46]:
#to get consistent results if we want to redo it
np.random.seed(7)

#split the data with the raw values
#normalize afterwards only the training data
X_train,X_test, Y_train,Y_test = train_test_split(X_data_true,Y_data_true,test_size=0.2,random_state=1)

print(len(X_train))
print(len(X_test))

print(X_train.head())
print(Y_train.head())

44375
11094
           rain    winddir  windspeed  temperature  coordinate     sin_month   
20249  0.000000  -0.768519   0.003858     9.715740    0.414698  5.000000e-01  \
23563  0.004262   6.027778   0.151728    15.521177    0.000000  1.224647e-16   
53952  0.005821  -3.760802   0.392793     8.843176    0.130227 -2.449294e-16   
12931  0.000019  42.628086   0.511898    18.116083    1.000000  8.660254e-01   
15233  0.000019  -4.378086   0.061420     6.687049    0.000000  5.000000e-01   

       cos_month       sin_day   cos_day  sin_hour      cos_hour   
20249  -0.866025 -2.449294e-16  1.000000  0.707107  7.071068e-01  \
23563  -1.000000 -7.431448e-01 -0.669131 -0.866025  5.000000e-01   
53952   1.000000 -8.978045e-01 -0.440394 -1.000000 -1.836970e-16   
12931  -0.500000 -4.067366e-01 -0.913545 -0.258819 -9.659258e-01   
15233  -0.866025  2.012985e-01  0.979530  1.000000  6.123234e-17   

        sin_weekday  cos_weekday  
20249  9.749279e-01    -0.222521  
23563 -2.449294e-16     1.00

<h2>Normalize the training data</h2>

In [47]:
#Normalize the training data

# Initialize StandardScaler
scaler_true = StandardScaler()

#merge training together
data_train = pd.concat([X_train, Y_train], axis=1)
print(data_train.head())

# Transform the DataFrame to standardized values
data_train_std= scaler_true.fit_transform(data_train)

print(data_train_std)


           rain    winddir  windspeed  temperature  coordinate     sin_month   
20249  0.000000  -0.768519   0.003858     9.715740    0.414698  5.000000e-01  \
23563  0.004262   6.027778   0.151728    15.521177    0.000000  1.224647e-16   
53952  0.005821  -3.760802   0.392793     8.843176    0.130227 -2.449294e-16   
12931  0.000019  42.628086   0.511898    18.116083    1.000000  8.660254e-01   
15233  0.000019  -4.378086   0.061420     6.687049    0.000000  5.000000e-01   

       cos_month       sin_day   cos_day  sin_hour      cos_hour   
20249  -0.866025 -2.449294e-16  1.000000  0.707107  7.071068e-01  \
23563  -1.000000 -7.431448e-01 -0.669131 -0.866025  5.000000e-01   
53952   1.000000 -8.978045e-01 -0.440394 -1.000000 -1.836970e-16   
12931  -0.500000 -4.067366e-01 -0.913545 -0.258819 -9.659258e-01   
15233  -0.866025  2.012985e-01  0.979530  1.000000  6.123234e-17   

        sin_weekday  cos_weekday       laeq  
20249  9.749279e-01    -0.222521  44.763000  
23563 -2.449294e-1

<h3>Put Xtrain and Ytrain again as dataframes</h3>

In [48]:
#recreate Xtrain and Ytrain that are standardized

# Get the column names
column_names = data_train.columns.tolist()

# Split the dataframe into X_train and Y_train
X_train = pd.DataFrame(data_train_std[:, :-1], columns=column_names[:-1])
Y_train = pd.DataFrame(data_train_std[:, -1], columns=[column_names[-1]])

# Print the resulting dataframes
print("X_train:")
print(X_train.head())
print("\nY_train:")
print(Y_train.head())

X_train:
       rain   winddir  windspeed  temperature  coordinate  sin_month   
0 -0.385137  0.188253  -0.862988    -0.435384    0.142369   0.802326  \
1  0.862013  0.519735  -0.391530     0.368459   -1.130261   0.079769   
2  1.318067  0.042307   0.377063    -0.556203   -0.730621   0.079769   
3 -0.379719  2.304877   0.756808     0.727760    1.938551   1.331274   
4 -0.379719  0.012200  -0.679463    -0.854749   -1.130261   0.802326   

   cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday  cos_weekday  
0  -1.129930 -0.002488  1.415789  1.000717  1.002132     1.378765    -0.313955  
1  -1.316606 -1.051742 -0.948590 -1.223858  0.709215    -0.000504     1.414308  
2   1.470131 -1.270107 -0.624578 -1.413312  0.002053     0.613329    -1.273069  
3  -0.619922 -0.576764 -1.294812 -0.365204 -1.364080    -0.000504     1.414308  
4  -1.129930  0.281727  1.386792  1.414899  0.002053    -0.000504     1.414308  

Y_train:
       laeq
0 -0.944112
1  0.378896
2  1.055779
3 -1.410079
4 

<h2> Normalize the test data </h2>

In [49]:
# Transform the test data using the trained scaler
data_test = pd.concat([X_test, Y_test], axis=1)
column_names_test = data_test.columns.tolist()

data_test_std = scaler_true.transform(data_test)

# Separate the test data into X_test and Y_test
X_test_std = data_test_std[:, :-1]
Y_test_std = data_test_std[:, -1]

# Convert X_test_std to a dataframe with original column names
X_test_std = pd.DataFrame(X_test_std, columns=column_names_test[:-1])

# Convert Y_test_std to a dataframe with original column name
Y_test_std = pd.DataFrame(Y_test_std, columns=[column_names[-1]])

print("X test")
print(X_test_std.head())
print("Y test")
print(Y_test_std.head())

X test
       rain   winddir  windspeed  temperature  coordinate  sin_month   
0 -0.374300  0.153930  -0.582632     0.054429    0.142369   0.802326  \
1 -0.225744  0.256145   0.454114     1.621930    0.548151  -1.171736   
2  0.318812  0.135640  -0.774227    -0.595478    0.142369   0.079769   
3 -0.374300  1.237721  -0.471582    -0.748449   -0.730621   1.331274   
4  0.945547 -0.869796  -0.304441     0.547842    0.925967   0.079769   

   cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday  cos_weekday  
0  -1.129930  1.020854  0.975200 -1.223858  0.709215     0.613329    -1.273069  
1  -0.619922  0.140352 -1.410010  0.000793 -1.412272     1.105584     0.882040  
2   1.470131 -1.270107 -0.624578  0.707846  1.226894     0.613329    -1.273069  
3  -0.619922  1.046766 -0.948590  0.000793  1.416378     1.105584     0.882040  
4  -1.316606 -1.225238  0.707522 -1.413312  0.002053    -1.106592     0.882040  
Y test
       laeq
0  0.790556
1 -0.081502
2 -0.462025
3 -1.852014
4  0.81

<h2> Fit the model </h2>

In [91]:
# Initialize ARDRegression
ard_reg = ARDRegression()

# Fit the model to the training data
ard_reg.fit(X_train, Y_train)

# Make predictions on new data (X_test)
Y_pred_std = ard_reg.predict(X_test_std)

# Map coefficients to column names
coefficients = pd.Series(ard_reg.coef_, index=X_train.columns)

coefficients.to_csv("ARD_coefficients_train_full_data_with_split")

# Print the coefficients and corresponding column names
for feature, coefficient in coefficients.items():
    print(f"Feature: {feature}, Coefficient: {round(coefficient,2)}")

Feature: rain, Coefficient: 0.06
Feature: winddir, Coefficient: 0.05
Feature: windspeed, Coefficient: 0.04
Feature: temperature, Coefficient: 0.0
Feature: coordinate, Coefficient: -0.15
Feature: sin_month, Coefficient: 0.02
Feature: cos_month, Coefficient: 0.03
Feature: sin_day, Coefficient: -0.02
Feature: cos_day, Coefficient: 0.01
Feature: sin_hour, Coefficient: -0.37
Feature: cos_hour, Coefficient: -0.41
Feature: sin_weekday, Coefficient: -0.04
Feature: cos_weekday, Coefficient: -0.19



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [59]:
print(len(Y_pred_std))
print(len(Y_test_std))

11094
11094


<h3> MSE and R2 score for complete data split</h3>

In [67]:
mse = mean_squared_error(Y_test_std["laeq"], Y_pred_std)
print(mse)

r2 = r2_score(Y_test_std["laeq"], Y_pred_std)
print(r2)

0.6025071741319603
0.3971750614993419


In [66]:
import matplotlib.pyplot as plt

# Assuming you have Y_test and Y_pred as the true and predicted target values

# Create a scatter plot
#plt.scatter(range(len(Y_test)), Y_test, color='b', label='True Values')
#plt.scatter(range(len(Y_pred)), Y_pred, color='r', label='Predicted Values')

# Create the scatter plot trace
scatter_trace = go.Scatter(
    x=Y_test_std["laeq"],
    y=Y_pred_std,
    mode='markers',
    marker=dict(
        color='blue',
        size=5,
    ),
    name='True Values'
)

# Create the layout
layout = go.Layout(
    xaxis=dict(title='True/target values'),
    yaxis=dict(title='Prediction Values'),
    title='True Values vs Predicted Values'
)

# Create the line trace
line_trace = go.Scatter(
    x=[-1.5, 1.5],
    y=[-1.5, 1.5],
    mode='lines',
    line=dict(color='red', width=2),
    name='Perfect prediction'
)

# Create the figure
figure = go.Figure(data=[scatter_trace,line_trace], layout=layout)

# Show the figure
figure.show()

In [12]:
print(Y_test[1000])
print(Y_pred[1000])

0.8440217250791616
-0.8456640636388458


<h3> Histogram of Y test values </h3>

In [64]:

# Assuming you have Y_test as the true target values

# Create the histogram trace
histogram_trace = go.Histogram(x=Y_test_std["laeq"])

# Create the layout
layout = go.Layout(title='Histogram of Y_test')

# Create the figure
figure = go.Figure(data=[histogram_trace], layout=layout)

# Show the figure
figure.show()

<h3> Try packed bubble chart </h3>

In [69]:
# Sample data for demonstration
feature_names = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 5']
#coefficients = [0.4, -0.6, 0.8, -0.2, 0.3]  # Replace with your coefficients

# Create a trace for the packed bubbles chart
fig = go.Figure(data=go.Scatter(
    x=[0] * len(coefficients),
    y=[0] * len(coefficients),
    mode='markers',
    text=feature_names,
    marker=dict(
        size=np.abs(coefficients),
        sizemode='diameter',
        sizeref=max(np.abs(coefficients)) / 10,
        color=coefficients,
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Coefficient Value')
    )
))

# Set layout properties
fig.update_layout(
    title='Feature Importance Packed Bubbles Chart',
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    hovermode='closest',
)

# Show the plot
fig.show()


<h3> Horizontal bar chart </h3>

In [92]:
#print(coefficients)
#print(type(coefficients))
#print(coefficients[0])

fig = go.Figure()
fig.add_trace(go.Bar(
    x=coefficients.values, #["coefficient"],  # Use the coefficient as the length of the bar
    y=coefficients.index, #["feature"],   # Use the coefficient name as the y-axis
    orientation='h', # Specify horizontal orientation
    marker=dict(
        color=np.where(coefficients.values < 0, 'red', 'green')
    )
))

fig.update_layout(
    title='Feature importance with ARD regression',
    xaxis_title='Coefficient',
    yaxis_title='Feature name',
    width=400,  # Set the width of the plot to 800 pixels
    bargap=0.1,  # Set the gap between bars to 0.1 (adjust as needed)
    margin=dict(l=0, r=20, t=40, b=0)  # Set all margins to 0
)

fig.show()

In [29]:
import plotly.graph_objects as go
import random

# Data
features = [
    'rain', 'winddir', 'windspeed', 'temperature', 'coordinate',
    'sin_month', 'cos_month', 'sin_day', 'cos_day', 'sin_hour',
    'cos_hour', 'sin_weekday', 'cos_weekday'
]
coefficients = [
    0.06, 0.05, 0.04, 0.0, -0.15, 0.02, 0.03, -0.02, 0.01, -0.37, -0.41, -0.04, -0.19
]

# Generate random positions
random.seed(42)  # Set a seed for reproducibility
#positions = [(random.random(), random.random()) for _ in range(len(features))]

#print(positions)


# Calculate bubble positions
positions = np.arange(len(features))

# Create bubble chart
fig = go.Figure(data=go.Scatter(
    x=positions,
    y=np.zeros(len(features)),
    mode='markers',
    marker=dict(
        size=[abs(coef) for coef in coefficients],
        sizemode='diameter',
        sizeref=max([abs(coef) for coef in coefficients]) / 200,  # Adjust the scale of the bubble sizes
        sizemin=10,
        color=coefficients,
        colorscale=[[0, 'darkred'], [0.5, '#7209b7'], [1, 'navy']],  # Define custom colorscale
        showscale=True,
        colorbar=dict(
            title='Coefficient',
        )
    ),
    text=features
))

# Set layout
fig.update_layout(
    title='Importance of the features',
    xaxis=dict(
        title='Features',
        tickvals=positions,
        ticktext=features,
        tickfont=dict(size=14)  # Adjust tick font size and weight  , weight='bold'
    ),
    yaxis=dict(
        showticklabels=False,  # Remove y-axis tick labels
        showgrid=False,  # Remove y-axis gridlines
        zeroline=False  # Remove y-axis zero line
    )
)

# Show the chart
fig.show()

<h1> Without Vrijthof </h1>

<h2> Same analysis </h2>

In [30]:
#see the impact on the coordinate coefficient if vrijhof is removed as this is a kind of outlier
#having a lower noise than the rest
print(len(data_model))
# Drop rows where "description" is "vrijthof"
data_model2 = data_model.drop(data_model[data_model["description"] == "MP08bis - Vrijthof"].index)
print(data_model2.head())
print(len(data_model2))

55469
   month  day  hour                       description      lamax       laeq   
0      1    1     0      MP 03: Naamsestraat 62 Taste  60.322528  57.126833  \
1      1    1     0    MP 05: Calvariekapel KU Leuven  53.230972  49.987639   
2      1    1     0  MP 06: Parkstraat 2 La Filosovia  53.666056  50.752000   
3      1    1     0            MP 07: Naamsestraat 81  50.056861  47.440222   
4      1    1     1      MP 03: Naamsestraat 62 Taste  53.033583  50.853806   

   LC_RAININ  LC_DAILYRAIN  LC_WINDDIR  LC_WINDSPEED  ...  weekday   
0   0.000003       0.00036  -18.197324      0.389565  ...        6  \
1   0.000003       0.00036  -18.197324      0.389565  ...        6   
2   0.000003       0.00036  -18.197324      0.389565  ...        6   
3   0.000003       0.00036  -18.197324      0.389565  ...        6   
4   0.000007       0.00000  -16.227891      0.222602  ...        6   

   coordinate  sin_month  cos_month   sin_day  cos_day  sin_hour  cos_hour   
0    0.414698       

<h2>Remove variables that are not needed </h2>

In [31]:
# List of column names to drop
columns_to_drop = ['month', 'day','hour','description','lamax','LC_RAININ','weekday']

# Drop the specified columns
data_model2 = data_model2.drop(columns=columns_to_drop)
data_model2.head()

Unnamed: 0,laeq,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_TEMP_QCL3,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


<h2> Rename some variables </h2>

In [32]:
# Dictionary mapping old column names to new column names
column_mapping = {'LC_DAILYRAIN': 'rain',
                  'LC_WINDDIR': 'winddir',
                  'LC_WINDSPEED': 'windspeed',
                  'LC_TEMP_QCL3': 'temperature',}

# Rename the columns
data_model2 = data_model2.rename(columns=column_mapping)
data_model2.head()

Unnamed: 0,laeq,rain,winddir,windspeed,temperature,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


<h2> Split in training and test set </h2>

In [33]:
#see how good it is for predictions

# Extract the feature columns (X_train)
X_data_true2 = data_model2.drop("laeq", axis=1)

# Extract the target variable column (Y_train)
Y_data_true2 = data_model2["laeq"]

print(X_data_true2)
print("training data")
print(Y_data_true2.head())

           rain    winddir  windspeed  temperature  coordinate     sin_month   
0      0.000360 -18.197324   0.389565    13.100358    0.414698  5.000000e-01  \
1      0.000360 -18.197324   0.389565    13.100358    0.130227  5.000000e-01   
2      0.000360 -18.197324   0.389565    13.100358    0.053974  5.000000e-01   
3      0.000360 -18.197324   0.389565    13.100358    0.000000  5.000000e-01   
4      0.000000 -16.227891   0.222602    12.669197    0.414698  5.000000e-01   
...         ...        ...        ...          ...         ...           ...   
55463  0.002174 -29.188272   1.465571    15.770757    0.414698 -2.449294e-16   
55464  0.002174 -29.188272   1.465571    15.770757    0.278339 -2.449294e-16   
55465  0.002174 -29.188272   1.465571    15.770757    0.130227 -2.449294e-16   
55466  0.002174 -29.188272   1.465571    15.770757    0.053974 -2.449294e-16   
55467  0.002174 -29.188272   1.465571    15.770757    0.000000 -2.449294e-16   

       cos_month       sin_day  cos_day

<h2> Split the data randomly </h2>

In [34]:
#to get consistent results if we want to redo it
np.random.seed(7)

#split the data with the raw values
#normalize afterwards only the training data
X_train,X_test, Y_train,Y_test = train_test_split(X_data_true2,Y_data_true2,test_size=0.2,random_state=1)

print(len(X_train))
print(len(X_test))

print(X_train.head())
print(Y_train.head())

39068
9767
           rain    winddir  windspeed  temperature  coordinate     sin_month   
636    0.004340 -35.503401   0.741310     3.845812    0.414698  5.000000e-01  \
16896  0.000000 -14.001543   0.070710    13.905629    0.414698  5.000000e-01   
15135  0.000056  -0.844136   0.261512    11.469308    0.000000  8.660254e-01   
20311  0.000000 -33.415123   0.387299    18.707305    0.546926  5.000000e-01   
20599  0.000000   0.362654   0.017361     7.666822    0.414698  1.224647e-16   

       cos_month       sin_day   cos_day      sin_hour  cos_hour  sin_weekday   
636     0.866025  9.884683e-01  0.151428 -7.071068e-01 -0.707107    -0.974928  \
16896  -0.866025  7.907757e-01 -0.612106  8.660254e-01  0.500000     0.433884   
15135  -0.500000 -2.449294e-16  1.000000 -8.660254e-01 -0.500000    -0.781831   
20311  -0.866025 -2.449294e-16  1.000000  1.224647e-16 -1.000000     0.974928   
20599  -1.000000  4.067366e-01  0.913545  9.659258e-01  0.258819    -0.433884   

       cos_weekday  


<h2> Normalize the data </h2>

In [35]:
#Normalize the training data

# Initialize StandardScaler
scaler_true = StandardScaler()

#merge training together
data_train = pd.concat([X_train, Y_train], axis=1)
print(data_train.head())

# Transform the DataFrame to standardized values
data_train_std= scaler_true.fit_transform(data_train)

print(data_train_std)

           rain    winddir  windspeed  temperature  coordinate     sin_month   
636    0.004340 -35.503401   0.741310     3.845812    0.414698  5.000000e-01  \
16896  0.000000 -14.001543   0.070710    13.905629    0.414698  5.000000e-01   
15135  0.000056  -0.844136   0.261512    11.469308    0.000000  8.660254e-01   
20311  0.000000 -33.415123   0.387299    18.707305    0.546926  5.000000e-01   
20599  0.000000   0.362654   0.017361     7.666822    0.414698  1.224647e-16   

       cos_month       sin_day   cos_day      sin_hour  cos_hour  sin_weekday   
636     0.866025  9.884683e-01  0.151428 -7.071068e-01 -0.707107    -0.974928  \
16896  -0.866025  7.907757e-01 -0.612106  8.660254e-01  0.500000     0.433884   
15135  -0.500000 -2.449294e-16  1.000000 -8.660254e-01 -0.500000    -0.781831   
20311  -0.866025 -2.449294e-16  1.000000  1.224647e-16 -1.000000     0.974928   
20599  -1.000000  4.067366e-01  0.913545  9.659258e-01  0.258819    -0.433884   

       cos_weekday       laeq  


<h2> Xtrain and Yrain again as dataframes </h2>

In [36]:
#recreate Xtrain and Ytrain that are standardized

# Get the column names
column_names = data_train.columns.tolist()

# Split the dataframe into X_train and Y_train
X_train = pd.DataFrame(data_train_std[:, :-1], columns=column_names[:-1])
Y_train = pd.DataFrame(data_train_std[:, -1], columns=[column_names[-1]])

# Print the resulting dataframes
print("X_train:")
print(X_train.head())
print("\nY_train:")
print(Y_train.head())

X_train:
       rain   winddir  windspeed  temperature  coordinate  sin_month   
0  0.893205 -1.483548   1.455789    -1.220280    0.542313   0.754613  \
1 -0.382406 -0.446622  -0.653239     0.168521    0.542313   0.754613   
2 -0.366077  0.187894  -0.053167    -0.167824   -1.166596   1.280187   
3 -0.382406 -1.382841   0.342431     0.831413    1.087204   0.754613   
4 -0.382406  0.246092  -0.821020    -0.692773    0.542313   0.036665   

   cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday  cos_weekday  
0   1.272153  1.394403  0.211768 -0.996405 -0.999734    -1.376432    -0.311352  
1  -1.147639  1.115441 -0.870439  1.227950  0.707682     0.614674    -1.271435  
2  -0.636277 -0.000413  1.414506 -1.221111 -0.706787    -1.103525     0.885851  
3  -1.147639 -0.000413  1.414506  0.003420 -1.414022     1.379345    -0.311352  
4  -1.334811  0.573528  1.291968  1.369206  0.366539    -0.611762    -1.271435  

Y_train:
       laeq
0  0.488709
1 -0.816138
2  0.223065
3  0.582802
4 

<h2> Normalize the test data </h2>

In [37]:
# Transform the test data using the trained scaler
data_test = pd.concat([X_test, Y_test], axis=1)
column_names_test = data_test.columns.tolist()

data_test_std = scaler_true.transform(data_test)

# Separate the test data into X_test and Y_test
X_test_std = data_test_std[:, :-1]
Y_test_std = data_test_std[:, -1]

# Convert X_test_std to a dataframe with original column names
X_test_std = pd.DataFrame(X_test_std, columns=column_names_test[:-1])

# Convert Y_test_std to a dataframe with original column name
Y_test_std = pd.DataFrame(Y_test_std, columns=[column_names[-1]])

print("X test")
print(X_test_std.head())
print("Y test")
print(Y_test_std.head())

X test
       rain   winddir  windspeed  temperature  coordinate  sin_month   
0  0.036689  0.090254  -0.802674    -1.715249    1.087204   0.036665  \
1 -0.382406  0.273776  -0.837376    -1.729776    1.594541   1.280187   
2 -0.373409  1.995458   0.412487    -0.704154    1.087204   1.472561   
3 -0.382406  0.801051  -0.289769     1.429151    0.542313  -1.206857   
4 -0.382406 -0.182947   0.009781    -0.664335    0.542313   1.280187   

   cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday  cos_weekday  
0   1.459324  1.365629 -0.358127  0.710403 -1.224519    -1.376432    -0.311352  
1  -0.636277  0.829004  1.143813  0.710403  1.225414     0.001456     1.418656  
2   0.062257 -0.919556 -1.078299 -0.996405  1.000628    -1.103525     0.885851  
3  -0.636277 -0.556883  1.299640 -1.221111 -0.706787     1.106437     0.885851  
4  -0.636277  0.829004  1.143813  0.710403 -1.224519     0.001456     1.418656  
Y test
       laeq
0  0.626733
1 -0.406423
2  0.059854
3  0.738278
4 -0.14

<h2> Fit the model </h2>

In [38]:
# Initialize ARDRegression
ard_reg = ARDRegression()

# Fit the model to the training data
ard_reg.fit(X_train, Y_train)

# Make predictions on new data (X_test)
Y_pred_std = ard_reg.predict(X_test_std)

# Map coefficients to column names
coefficients = pd.Series(ard_reg.coef_, index=X_train.columns)

coefficients.to_csv("ARD_coefficients_train_data_without_Vrijthof_with_split")

# Print the coefficients and corresponding column names
for feature, coefficient in coefficients.items():
    print(f"Feature: {feature}, Coefficient: {round(coefficient,2)}")

Feature: rain, Coefficient: 0.05
Feature: winddir, Coefficient: 0.04
Feature: windspeed, Coefficient: 0.05
Feature: temperature, Coefficient: -0.03
Feature: coordinate, Coefficient: 0.14
Feature: sin_month, Coefficient: 0.0
Feature: cos_month, Coefficient: 0.02
Feature: sin_day, Coefficient: -0.03
Feature: cos_day, Coefficient: 0.0
Feature: sin_hour, Coefficient: -0.43
Feature: cos_hour, Coefficient: -0.46
Feature: sin_weekday, Coefficient: -0.04
Feature: cos_weekday, Coefficient: -0.21



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



<h2>MSE and R2</h2>

In [39]:
mse = mean_squared_error(Y_test_std["laeq"], Y_pred_std)
print(mse)

r2 = r2_score(Y_test_std["laeq"], Y_pred_std)
print(r2)

0.5220515191065972
0.4861880228480083


<h2> Horizontal bar chart </h2>

In [40]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=coefficients.values, #["coefficient"],  # Use the coefficient as the length of the bar
    y=coefficients.index, #["feature"],   # Use the coefficient name as the y-axis
    orientation='h', # Specify horizontal orientation
    marker=dict(
        color=np.where(coefficients.values < 0, 'red', 'green')
    )
))

fig.update_layout(
    title='Feature importance with ARD regression',
    xaxis_title='Coefficient',
    yaxis_title='Feature name',
    width=400,  # Set the width of the plot to 800 pixels
    bargap=0.1,  # Set the gap between bars to 0.1 (adjust as needed)
    margin=dict(l=0, r=20, t=40, b=0)  # Set all margins to 0
)

fig.show()

<h1> Third analysis without the weekday variable </h1>

In [41]:
#see the impact on the coordinate coefficient if vrijhof is removed as this is a kind of outlier
#having a lower noise than the rest
print(len(data_model))
# Drop rows where "description" is "vrijthof"
data_model3 = data_model.drop(data_model[data_model["description"] == "MP08bis - Vrijthof"].index)
print(data_model3.head())
print(len(data_model3))

55469
   month  day  hour                       description      lamax       laeq   
0      1    1     0      MP 03: Naamsestraat 62 Taste  60.322528  57.126833  \
1      1    1     0    MP 05: Calvariekapel KU Leuven  53.230972  49.987639   
2      1    1     0  MP 06: Parkstraat 2 La Filosovia  53.666056  50.752000   
3      1    1     0            MP 07: Naamsestraat 81  50.056861  47.440222   
4      1    1     1      MP 03: Naamsestraat 62 Taste  53.033583  50.853806   

   LC_RAININ  LC_DAILYRAIN  LC_WINDDIR  LC_WINDSPEED  ...  weekday   
0   0.000003       0.00036  -18.197324      0.389565  ...        6  \
1   0.000003       0.00036  -18.197324      0.389565  ...        6   
2   0.000003       0.00036  -18.197324      0.389565  ...        6   
3   0.000003       0.00036  -18.197324      0.389565  ...        6   
4   0.000007       0.00000  -16.227891      0.222602  ...        6   

   coordinate  sin_month  cos_month   sin_day  cos_day  sin_hour  cos_hour   
0    0.414698       

<h2> Remove some variables that are not needed anymore</h2>

In [43]:
# List of column names to drop
columns_to_drop = ['month', 'day','hour','description','lamax','LC_RAININ','weekday','sin_weekday','cos_weekday']

# Drop the specified columns
data_model3 = data_model3.drop(columns=columns_to_drop)
data_model3.head()

Unnamed: 0,laeq,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_TEMP_QCL3,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926


<h2> Rename some variables </h2>

In [44]:
# Dictionary mapping old column names to new column names
column_mapping = {'LC_DAILYRAIN': 'rain',
                  'LC_WINDDIR': 'winddir',
                  'LC_WINDSPEED': 'windspeed',
                  'LC_TEMP_QCL3': 'temperature',}

# Rename the columns
data_model3 = data_model3.rename(columns=column_mapping)
data_model3.head()

Unnamed: 0,laeq,rain,winddir,windspeed,temperature,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926


<h2> Split X and Y data </h2>

In [45]:
#see how good it is for predictions

# Extract the feature columns (X_train)
X_data_true3 = data_model3.drop("laeq", axis=1)

# Extract the target variable column (Y_train)
Y_data_true3 = data_model3["laeq"]

print(X_data_true3)
print("training data")
print(Y_data_true3.head())

           rain    winddir  windspeed  temperature  coordinate     sin_month   
0      0.000360 -18.197324   0.389565    13.100358    0.414698  5.000000e-01  \
1      0.000360 -18.197324   0.389565    13.100358    0.130227  5.000000e-01   
2      0.000360 -18.197324   0.389565    13.100358    0.053974  5.000000e-01   
3      0.000360 -18.197324   0.389565    13.100358    0.000000  5.000000e-01   
4      0.000000 -16.227891   0.222602    12.669197    0.414698  5.000000e-01   
...         ...        ...        ...          ...         ...           ...   
55463  0.002174 -29.188272   1.465571    15.770757    0.414698 -2.449294e-16   
55464  0.002174 -29.188272   1.465571    15.770757    0.278339 -2.449294e-16   
55465  0.002174 -29.188272   1.465571    15.770757    0.130227 -2.449294e-16   
55466  0.002174 -29.188272   1.465571    15.770757    0.053974 -2.449294e-16   
55467  0.002174 -29.188272   1.465571    15.770757    0.000000 -2.449294e-16   

       cos_month       sin_day  cos_day

<h2> Split randomly in training and test data </h2>

In [47]:
#to get consistent results if we want to redo it
np.random.seed(7)

#split the data with the raw values
#normalize afterwards only the training data
X_train,X_test, Y_train,Y_test = train_test_split(X_data_true3,Y_data_true3,test_size=0.2,random_state=1)

print(len(X_train))
print(len(X_test))

print(X_train.head())
print(Y_train.head())

39068
9767
           rain    winddir  windspeed  temperature  coordinate     sin_month   
636    0.004340 -35.503401   0.741310     3.845812    0.414698  5.000000e-01  \
16896  0.000000 -14.001543   0.070710    13.905629    0.414698  5.000000e-01   
15135  0.000056  -0.844136   0.261512    11.469308    0.000000  8.660254e-01   
20311  0.000000 -33.415123   0.387299    18.707305    0.546926  5.000000e-01   
20599  0.000000   0.362654   0.017361     7.666822    0.414698  1.224647e-16   

       cos_month       sin_day   cos_day      sin_hour  cos_hour  
636     0.866025  9.884683e-01  0.151428 -7.071068e-01 -0.707107  
16896  -0.866025  7.907757e-01 -0.612106  8.660254e-01  0.500000  
15135  -0.500000 -2.449294e-16  1.000000 -8.660254e-01 -0.500000  
20311  -0.866025 -2.449294e-16  1.000000  1.224647e-16 -1.000000  
20599  -1.000000  4.067366e-01  0.913545  9.659258e-01  0.258819  
636      54.410454
16896    46.444472
15135    52.788722
20311    54.984885
20599    48.149917
Name: laeq,

<h2> Normalize the data </h2>

In [48]:
#Normalize the training data

# Initialize StandardScaler
scaler_true = StandardScaler()

#merge training together
data_train = pd.concat([X_train, Y_train], axis=1)
print(data_train.head())

# Transform the DataFrame to standardized values
data_train_std= scaler_true.fit_transform(data_train)

print(data_train_std)

           rain    winddir  windspeed  temperature  coordinate     sin_month   
636    0.004340 -35.503401   0.741310     3.845812    0.414698  5.000000e-01  \
16896  0.000000 -14.001543   0.070710    13.905629    0.414698  5.000000e-01   
15135  0.000056  -0.844136   0.261512    11.469308    0.000000  8.660254e-01   
20311  0.000000 -33.415123   0.387299    18.707305    0.546926  5.000000e-01   
20599  0.000000   0.362654   0.017361     7.666822    0.414698  1.224647e-16   

       cos_month       sin_day   cos_day      sin_hour  cos_hour       laeq  
636     0.866025  9.884683e-01  0.151428 -7.071068e-01 -0.707107  54.410454  
16896  -0.866025  7.907757e-01 -0.612106  8.660254e-01  0.500000  46.444472  
15135  -0.500000 -2.449294e-16  1.000000 -8.660254e-01 -0.500000  52.788722  
20311  -0.866025 -2.449294e-16  1.000000  1.224647e-16 -1.000000  54.984885  
20599  -1.000000  4.067366e-01  0.913545  9.659258e-01  0.258819  48.149917  
[[ 8.93205277e-01 -1.48354819e+00  1.45578927e+00 .

<h2> Xtrain and Ytrain as dataframes </h2>

In [49]:
#recreate Xtrain and Ytrain that are standardized

# Get the column names
column_names = data_train.columns.tolist()

# Split the dataframe into X_train and Y_train
X_train = pd.DataFrame(data_train_std[:, :-1], columns=column_names[:-1])
Y_train = pd.DataFrame(data_train_std[:, -1], columns=[column_names[-1]])

# Print the resulting dataframes
print("X_train:")
print(X_train.head())
print("\nY_train:")
print(Y_train.head())

X_train:
       rain   winddir  windspeed  temperature  coordinate  sin_month   
0  0.893205 -1.483548   1.455789    -1.220280    0.542313   0.754613  \
1 -0.382406 -0.446622  -0.653239     0.168521    0.542313   0.754613   
2 -0.366077  0.187894  -0.053167    -0.167824   -1.166596   1.280187   
3 -0.382406 -1.382841   0.342431     0.831413    1.087204   0.754613   
4 -0.382406  0.246092  -0.821020    -0.692773    0.542313   0.036665   

   cos_month   sin_day   cos_day  sin_hour  cos_hour  
0   1.272153  1.394403  0.211768 -0.996405 -0.999734  
1  -1.147639  1.115441 -0.870439  1.227950  0.707682  
2  -0.636277 -0.000413  1.414506 -1.221111 -0.706787  
3  -1.147639 -0.000413  1.414506  0.003420 -1.414022  
4  -1.334811  0.573528  1.291968  1.369206  0.366539  

Y_train:
       laeq
0  0.488709
1 -0.816138
2  0.223065
3  0.582802
4 -0.536782


<h2> Normalize the test data </h2>

In [50]:
# Transform the test data using the trained scaler
data_test = pd.concat([X_test, Y_test], axis=1)
column_names_test = data_test.columns.tolist()

data_test_std = scaler_true.transform(data_test)

# Separate the test data into X_test and Y_test
X_test_std = data_test_std[:, :-1]
Y_test_std = data_test_std[:, -1]

# Convert X_test_std to a dataframe with original column names
X_test_std = pd.DataFrame(X_test_std, columns=column_names_test[:-1])

# Convert Y_test_std to a dataframe with original column name
Y_test_std = pd.DataFrame(Y_test_std, columns=[column_names[-1]])

print("X test")
print(X_test_std.head())
print("Y test")
print(Y_test_std.head())

X test
       rain   winddir  windspeed  temperature  coordinate  sin_month   
0  0.036689  0.090254  -0.802674    -1.715249    1.087204   0.036665  \
1 -0.382406  0.273776  -0.837376    -1.729776    1.594541   1.280187   
2 -0.373409  1.995458   0.412487    -0.704154    1.087204   1.472561   
3 -0.382406  0.801051  -0.289769     1.429151    0.542313  -1.206857   
4 -0.382406 -0.182947   0.009781    -0.664335    0.542313   1.280187   

   cos_month   sin_day   cos_day  sin_hour  cos_hour  
0   1.459324  1.365629 -0.358127  0.710403 -1.224519  
1  -0.636277  0.829004  1.143813  0.710403  1.225414  
2   0.062257 -0.919556 -1.078299 -0.996405  1.000628  
3  -0.636277 -0.556883  1.299640 -1.221111 -0.706787  
4  -0.636277  0.829004  1.143813  0.710403 -1.224519  
Y test
       laeq
0  0.626733
1 -0.406423
2  0.059854
3  0.738278
4 -0.140289


<h2> Fit the model </h2>

In [51]:
# Initialize ARDRegression
ard_reg = ARDRegression()

# Fit the model to the training data
ard_reg.fit(X_train, Y_train)

# Make predictions on new data (X_test)
Y_pred_std = ard_reg.predict(X_test_std)

# Map coefficients to column names
coefficients = pd.Series(ard_reg.coef_, index=X_train.columns)

coefficients.to_csv("ARD_coefficients_train_data_without_Vrijthof_with_split_without_weekday")

# Print the coefficients and corresponding column names
for feature, coefficient in coefficients.items():
    print(f"Feature: {feature}, Coefficient: {round(coefficient,2)}")

Feature: rain, Coefficient: 0.05
Feature: winddir, Coefficient: 0.02
Feature: windspeed, Coefficient: 0.04
Feature: temperature, Coefficient: 0.0
Feature: coordinate, Coefficient: 0.14
Feature: sin_month, Coefficient: 0.02
Feature: cos_month, Coefficient: 0.05
Feature: sin_day, Coefficient: -0.03
Feature: cos_day, Coefficient: 0.0
Feature: sin_hour, Coefficient: -0.42
Feature: cos_hour, Coefficient: -0.46



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



<h2> MSE and R2 </h2>

In [52]:
mse = mean_squared_error(Y_test_std["laeq"], Y_pred_std)
print(mse)

r2 = r2_score(Y_test_std["laeq"], Y_pred_std)
print(r2)

0.5631173069110588
0.44577037659496466


<h2> Horizontal bar chart </h2>

In [53]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=coefficients.values, #["coefficient"],  # Use the coefficient as the length of the bar
    y=coefficients.index, #["feature"],   # Use the coefficient name as the y-axis
    orientation='h', # Specify horizontal orientation
    marker=dict(
        color=np.where(coefficients.values < 0, 'red', 'green')
    )
))

fig.update_layout(
    title='Feature importance with ARD regression',
    xaxis_title='Coefficient',
    yaxis_title='Feature name',
    width=400,  # Set the width of the plot to 800 pixels
    bargap=0.1,  # Set the gap between bars to 0.1 (adjust as needed)
    margin=dict(l=0, r=20, t=40, b=0)  # Set all margins to 0
)

fig.show()