In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
import plotly.graph_objects as go

data_model = pd.read_csv("../Data/data_model.csv")
data_model.head()
#data_model["LC_TEMP_QCL3"]


Unnamed: 0,month,day,hour,description,lamax,laeq,LC_RAININ,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,...,weekday,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,1,1,0,MP 03: Naamsestraat 62 Taste,60.322528,57.126833,3e-06,0.00036,-18.197324,0.389565,...,6,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,1,1,0,MP 05: Calvariekapel KU Leuven,53.230972,49.987639,3e-06,0.00036,-18.197324,0.389565,...,6,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,1,1,0,MP 06: Parkstraat 2 La Filosovia,53.666056,50.752,3e-06,0.00036,-18.197324,0.389565,...,6,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,1,1,0,MP 07: Naamsestraat 81,50.056861,47.440222,3e-06,0.00036,-18.197324,0.389565,...,6,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,1,1,1,MP 03: Naamsestraat 62 Taste,53.033583,50.853806,7e-06,0.0,-16.227891,0.222602,...,6,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


In [2]:
#see the impact on the coordinate coefficient if vrijhof is removed as this is a kind of outlier
#having a lower noise than the rest
print(len(data_model))
# Drop rows where "description" is "vrijthof"
data_model = data_model.drop(data_model[data_model["description"] == "MP08bis - Vrijthof"].index)
print(data_model.head())
print(len(data_model))

55469
   month  day  hour                       description      lamax       laeq   
0      1    1     0      MP 03: Naamsestraat 62 Taste  60.322528  57.126833  \
1      1    1     0    MP 05: Calvariekapel KU Leuven  53.230972  49.987639   
2      1    1     0  MP 06: Parkstraat 2 La Filosovia  53.666056  50.752000   
3      1    1     0            MP 07: Naamsestraat 81  50.056861  47.440222   
4      1    1     1      MP 03: Naamsestraat 62 Taste  53.033583  50.853806   

   LC_RAININ  LC_DAILYRAIN  LC_WINDDIR  LC_WINDSPEED  ...  weekday   
0   0.000003       0.00036  -18.197324      0.389565  ...        6  \
1   0.000003       0.00036  -18.197324      0.389565  ...        6   
2   0.000003       0.00036  -18.197324      0.389565  ...        6   
3   0.000003       0.00036  -18.197324      0.389565  ...        6   
4   0.000007       0.00000  -16.227891      0.222602  ...        6   

   coordinate  sin_month  cos_month   sin_day  cos_day  sin_hour  cos_hour   
0    0.414698       

In [3]:
# List of column names to drop
columns_to_drop = ['month', 'day','hour','description','lamax','LC_RAININ','weekday']

# Drop the specified columns
data_model = data_model.drop(columns=columns_to_drop)
data_model.head()

Unnamed: 0,laeq,LC_DAILYRAIN,LC_WINDDIR,LC_WINDSPEED,LC_TEMP_QCL3,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


In [4]:
# Dictionary mapping old column names to new column names
column_mapping = {'LC_DAILYRAIN': 'rain',
                  'LC_WINDDIR': 'winddir',
                  'LC_WINDSPEED': 'windspeed',
                  'LC_TEMP_QCL3': 'temperature',}

# Rename the columns
data_model = data_model.rename(columns=column_mapping)
data_model.head()

Unnamed: 0,laeq,rain,winddir,windspeed,temperature,coordinate,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_weekday,cos_weekday
0,57.126833,0.00036,-18.197324,0.389565,13.100358,0.414698,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
1,49.987639,0.00036,-18.197324,0.389565,13.100358,0.130227,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
2,50.752,0.00036,-18.197324,0.389565,13.100358,0.053974,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
3,47.440222,0.00036,-18.197324,0.389565,13.100358,0.0,0.5,0.866025,0.201299,0.97953,0.0,1.0,-0.781831,0.62349
4,50.853806,0.0,-16.227891,0.222602,12.669197,0.414698,0.5,0.866025,0.201299,0.97953,0.258819,0.965926,-0.781831,0.62349


In [5]:
#print(data_model.head())

# Initialize StandardScaler
scaler = StandardScaler()

# Fit StandardScaler to the DataFrame
#scaler.fit(data_model)

# Transform the DataFrame to standardized values
data_model_std= scaler.fit_transform(data_model)

# Convert standardized values back to DataFrame with column names
data_model_std = pd.DataFrame(data_model_std, columns=data_model.columns)

print(data_model_std)

           laeq      rain   winddir  windspeed  temperature  coordinate   
0      0.936750 -0.276451 -0.648873   0.344955     0.061980    0.545825  \
1     -0.230746 -0.276451 -0.648873   0.344955     0.061980   -0.627788   
2     -0.105748 -0.276451 -0.648873   0.344955     0.061980   -0.942375   
3     -0.647334 -0.276451 -0.648873   0.344955     0.061980   -1.165049   
4     -0.089099 -0.382959 -0.553979  -0.177611     0.002462    0.545825   
...         ...       ...       ...        ...          ...         ...   
48830  0.860573  0.261181 -1.178453   3.712663     0.430610    0.545825   
48831  1.242402  0.261181 -1.178453   3.712663     0.430610   -0.016738   
48832  0.862363  0.261181 -1.178453   3.712663     0.430610   -0.627788   
48833  0.648911  0.261181 -1.178453   3.712663     0.430610   -0.942375   
48834  0.280432  0.261181 -1.178453   3.712663     0.430610   -1.165049   

       sin_month  cos_month   sin_day   cos_day  sin_hour  cos_hour   
0       0.750087   1.267658 

In [6]:
# Extract the feature columns (X_train)
X_data = data_model_std.drop("laeq", axis=1)

# Extract the target variable column (Y_train)
Y_data = data_model_std["laeq"]

print(X_data)
print("training data")
print(Y_data.head())

           rain   winddir  windspeed  temperature  coordinate  sin_month   
0     -0.276451 -0.648873   0.344955     0.061980    0.545825   0.750087  \
1     -0.276451 -0.648873   0.344955     0.061980   -0.627788   0.750087   
2     -0.276451 -0.648873   0.344955     0.061980   -0.942375   0.750087   
3     -0.276451 -0.648873   0.344955     0.061980   -1.165049   0.750087   
4     -0.382959 -0.553979  -0.177611     0.002462    0.545825   0.750087   
...         ...       ...        ...          ...         ...        ...   
48830  0.261181 -1.178453   3.712663     0.430610    0.545825   0.032559   
48831  0.261181 -1.178453   3.712663     0.430610   -0.016738   0.032559   
48832  0.261181 -1.178453   3.712663     0.430610   -0.627788   0.032559   
48833  0.261181 -1.178453   3.712663     0.430610   -0.942375   0.032559   
48834  0.261181 -1.178453   3.712663     0.430610   -1.165049   0.032559   

       cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday   
0       1.2676

In [7]:
# Initialize ARDRegression
ard_reg = ARDRegression()

# Fit the model to the training data
ard_reg.fit(X_data, Y_data)

# Make predictions on new data (X_test)
#Y_pred = ard_reg.predict(X_test)

# Map coefficients to column names
coefficients = pd.Series(ard_reg.coef_, index=X_data.columns)

# Print the coefficients and corresponding column names
for feature, coefficient in coefficients.items():
    print(f"Feature: {feature}, Coefficient: {coefficient}")

Feature: rain, Coefficient: 0.05434745881461586
Feature: winddir, Coefficient: 0.04389144947993497
Feature: windspeed, Coefficient: 0.05312804088784786
Feature: temperature, Coefficient: -0.03557369531318458
Feature: coordinate, Coefficient: 0.14255875157610232
Feature: sin_month, Coefficient: 0.0
Feature: cos_month, Coefficient: 0.019297069028890945
Feature: sin_day, Coefficient: -0.02949371113389723
Feature: cos_day, Coefficient: 0.0
Feature: sin_hour, Coefficient: -0.4269838337530125
Feature: cos_hour, Coefficient: -0.4627176014644791
Feature: sin_weekday, Coefficient: -0.037165706472224336
Feature: cos_weekday, Coefficient: -0.20637981288717916


In [8]:
#see how good it is for predictions

#to get consistent results if we want to redo it
np.random.seed(7)

X_train,X_test, Y_train,Y_test = train_test_split(X_data,Y_data,test_size=0.2,random_state=1)

print(len(X_train))
print(len(X_test))

print(X_train.head())

39068
9767
           rain   winddir  windspeed  temperature  coordinate  sin_month   
636    0.902765 -1.482737   1.445852    -1.215543    0.545825   0.750087  \
15898 -0.382959 -0.446706  -0.653007     0.173142    0.545825   0.750087   
14389 -0.366501  0.187262  -0.055828    -0.163174   -1.165049   1.275355   
18825 -0.382959 -1.382117   0.337863     0.835979    1.091343   0.750087   
19072 -0.382959  0.245409  -0.819979    -0.688080    0.545825   0.032559   

       cos_month   sin_day   cos_day  sin_hour  cos_hour  sin_weekday   
636     1.267658  1.390059  0.212801 -0.999028 -1.000151    -1.378748  \
15898  -1.152472  1.111151 -0.869623  1.225735  0.706943     0.613126   
14389  -0.641039 -0.004489  1.415781 -1.223774 -0.707260    -1.105735   
18825  -1.152472 -0.004489  1.415781  0.000980 -1.414361     1.378091   
19072  -1.339670  0.569342  1.293218  1.367017  0.365865    -0.613783   

       cos_weekday  
636      -0.314427  
15898    -1.274132  
14389     0.882305  
18825    

In [9]:
# Initialize ARDRegression
ard_reg = ARDRegression()

# Fit the model to the training data
ard_reg.fit(X_train, Y_train)

# Make predictions on new data (X_test)
Y_pred = ard_reg.predict(X_test)

# Map coefficients to column names
coefficients = pd.Series(ard_reg.coef_, index=X_train.columns)

# Print the coefficients and corresponding column names
for feature, coefficient in coefficients.items():
    print(f"Feature: {feature}, Coefficient: {coefficient}")

Feature: rain, Coefficient: 0.05356379471590889
Feature: winddir, Coefficient: 0.04324455136269812
Feature: windspeed, Coefficient: 0.05129806570454859
Feature: temperature, Coefficient: -0.03405270824448812
Feature: coordinate, Coefficient: 0.14107925537516638
Feature: sin_month, Coefficient: 0.0
Feature: cos_month, Coefficient: 0.02452323530048164
Feature: sin_day, Coefficient: -0.02677884841979148
Feature: cos_day, Coefficient: 0.0
Feature: sin_hour, Coefficient: -0.42474137670926143
Feature: cos_hour, Coefficient: -0.46182738624062974
Feature: sin_weekday, Coefficient: -0.0384751902223342
Feature: cos_weekday, Coefficient: -0.20765604423125342


In [10]:
mse = mean_squared_error(Y_test, Y_pred)
print(mse)

r2 = r2_score(Y_test, Y_pred)
print(r2)

0.5203380062527941
0.48618802265837335


In [17]:
import matplotlib.pyplot as plt

# Assuming you have Y_test and Y_pred as the true and predicted target values

# Create a scatter plot
#plt.scatter(range(len(Y_test)), Y_test, color='b', label='True Values')
#plt.scatter(range(len(Y_pred)), Y_pred, color='r', label='Predicted Values')

# Create the scatter plot trace
scatter_trace = go.Scatter(
    x=Y_test,
    y=Y_pred,
    mode='markers',
    marker=dict(
        color='blue',
        size=5,
    ),
    name='True Values'
)

# Create the layout
layout = go.Layout(
    xaxis=dict(title='True/target values'),
    yaxis=dict(title='Prediction Values'),
    title='True Values vs Predicted Values'
)

# Create the line trace
line_trace = go.Scatter(
    x=[-1.5, 1.5],
    y=[-1.5, 1.5],
    mode='lines',
    line=dict(color='red', width=2),
    name='Perfect prediction'
)

# Create the figure
figure = go.Figure(data=[scatter_trace,line_trace], layout=layout)

# Show the figure
figure.show()

In [12]:
print(Y_test[1000])
print(Y_pred[1000])

0.8440217250791616
-0.8456640636388458


In [13]:


# Assuming you have Y_test as the true target values

# Create the histogram trace
histogram_trace = go.Histogram(x=Y_test)

# Create the layout
layout = go.Layout(title='Histogram of Y_test')

# Create the figure
figure = go.Figure(data=[histogram_trace], layout=layout)

# Show the figure
figure.show()