In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the data
df = pd.read_csv(".\\..\\data\\raw_weather_data_new_delhi\\DailyDelhiClimateTrain.csv")

# Convert date column to datetime format
df["date"] = pd.to_datetime(df["date"], format='%Y-%m-%d')
df['year'] = df['date'].dt.year
df["month"] = df["date"].dt.month
print(df.head())

# Shift the target variables to use next day's values
df['meantemp_next'] = df['meantemp'].shift(-1)
df['humidity_next'] = df['humidity'].shift(-1)
df['wind_speed_next'] = df['wind_speed'].shift(-1)
df['meanpressure_next'] = df['meanpressure'].shift(-1)

# Fill the last row's NaN values with the previous day's values
df.ffill()

print(df)




        date   meantemp   humidity  wind_speed  meanpressure  year  month
0 2013-01-01  10.000000  84.500000    0.000000   1015.666667  2013      1
1 2013-01-02   7.400000  92.000000    2.980000   1017.800000  2013      1
2 2013-01-03   7.166667  87.000000    4.633333   1018.666667  2013      1
3 2013-01-04   8.666667  71.333333    1.233333   1017.166667  2013      1
4 2013-01-05   6.000000  86.833333    3.700000   1016.500000  2013      1
           date   meantemp    humidity  wind_speed  meanpressure  year  month  \
0    2013-01-01  10.000000   84.500000    0.000000   1015.666667  2013      1   
1    2013-01-02   7.400000   92.000000    2.980000   1017.800000  2013      1   
2    2013-01-03   7.166667   87.000000    4.633333   1018.666667  2013      1   
3    2013-01-04   8.666667   71.333333    1.233333   1017.166667  2013      1   
4    2013-01-05   6.000000   86.833333    3.700000   1016.500000  2013      1   
...         ...        ...         ...         ...           ...   ...

In [2]:
last_row = df.iloc[-1]
print(last_row)

date                 2017-01-01 00:00:00
meantemp                            10.0
humidity                           100.0
wind_speed                           0.0
meanpressure                      1016.0
year                                2017
month                                  1
meantemp_next                        NaN
humidity_next                        NaN
wind_speed_next                      NaN
meanpressure_next                    NaN
Name: 1461, dtype: object


In [3]:
second_last_row = df.iloc[-2]
print(second_last_row)

date                 2016-12-31 00:00:00
meantemp                       15.052632
humidity                            87.0
wind_speed                         7.325
meanpressure                      1016.1
year                                2016
month                                 12
meantemp_next                       10.0
humidity_next                      100.0
wind_speed_next                      0.0
meanpressure_next                 1016.0
Name: 1460, dtype: object


In [4]:
for col in ['meantemp_next', 'humidity_next', 'wind_speed_next', 'meanpressure_next']:
    # Get the second-to-last value
    second_last_value = df[col].iloc[-2]
    
    # Replace the last value with the second-to-last value
    df.at[df.index[-1], col] = second_last_value

last_row = df.iloc[-1]
print(last_row)

date                 2017-01-01 00:00:00
meantemp                            10.0
humidity                           100.0
wind_speed                           0.0
meanpressure                      1016.0
year                                2017
month                                  1
meantemp_next                       10.0
humidity_next                      100.0
wind_speed_next                      0.0
meanpressure_next                 1016.0
Name: 1461, dtype: object


In [5]:
# Preparing the feature matrix X and target matrix y
X = df[['meantemp', 'humidity', 'wind_speed', 'meanpressure', 'year', 'month']]
y = df[['meantemp_next', 'humidity_next', 'wind_speed_next', 'meanpressure_next']]  # Next day's values

# Initialize the Ridge regression model for each target variable
ridge = Ridge()

# Define a range of alpha values to tune the regularization strength
params = {'alpha': [0.1, 1.0, 10.0, 100.0]}

# TimeSeriesSplit is used here since we're dealing with time-series data
tscv = TimeSeriesSplit(n_splits=5)

# Dictionary to store predictions and metrics for each target variable
predictions = {}
metrics = {}

for col in y.columns:
    # GridSearchCV with TimeSeriesSplit for each target variable
    ridge_cv = GridSearchCV(ridge, params, cv=tscv, scoring='neg_mean_squared_error')

    # Fit the model
    ridge_cv.fit(X, y[col])

    # Best alpha value
    print(f"Best alpha for {col}: {ridge_cv.best_params_['alpha']}")

    # Predictions
    y_pred = ridge_cv.predict(X)

    # Store predictions in the dictionary
    predictions[col] = y_pred

    # Calculate RMSE (Root Mean Squared Error) for evaluation
    rmse = np.sqrt(mean_squared_error(y[col], y_pred))
    metrics[col] = rmse
    print(f"RMSE for {col}: {rmse}")

# Convert predictions dictionary to DataFrame
df_predictions = pd.DataFrame(predictions, index=df.index)

# Print the DataFrame with actual and predicted values for comparison
column_mapping = {
    'meantemp_next': 'meantemp',
    'humidity_next': 'humidity',
    'wind_speed_next': 'wind_speed',
    'meanpressure_next': 'meanpressure'
}

# Rename columns
df_predictions.rename(columns=column_mapping, inplace=True)
print(df_predictions)

Best alpha for meantemp_next: 0.1
RMSE for meantemp_next: 1.6487913437751103
Best alpha for humidity_next: 100.0
RMSE for humidity_next: 7.963394902536819
Best alpha for wind_speed_next: 0.1
RMSE for wind_speed_next: 3.9551060636492696
Best alpha for meanpressure_next: 100.0
RMSE for meanpressure_next: 179.86405343965092
       meantemp   humidity  wind_speed  meanpressure
0     10.666696  81.581983    3.368105   1027.415645
1      8.106037  88.498134    3.908912   1027.786751
2      7.798526  84.351540    4.534096   1028.908011
3      9.198569  70.525055    3.896795   1032.075091
4      6.655770  84.273053    4.095119   1030.602225
...         ...        ...         ...           ...
1457  17.212688  68.427927    3.573033   1018.026806
1458  15.392892  85.795589    3.756151   1014.620731
1459  14.271679  87.488429    3.678038   1015.450153
1460  15.177433  85.173845    4.192549   1014.705407
1461  10.847606  94.662072    2.793233   1039.180658

[1462 rows x 4 columns]


In [6]:
# Optionally, you can merge this with the original DataFrame to see actual vs predicted
df_comparison = df[['meantemp', 'humidity', 'wind_speed', 'meanpressure']].join(df_predictions, lsuffix='_actual', rsuffix='_predicted')

# Display the comparison DataFrame
print(df_comparison)
df_comparison["date"] = df["date"]

      meantemp_actual  humidity_actual  wind_speed_actual  \
0           10.000000        84.500000           0.000000   
1            7.400000        92.000000           2.980000   
2            7.166667        87.000000           4.633333   
3            8.666667        71.333333           1.233333   
4            6.000000        86.833333           3.700000   
...               ...              ...                ...   
1457        17.217391        68.043478           3.547826   
1458        15.238095        87.857143           6.000000   
1459        14.095238        89.666667           6.266667   
1460        15.052632        87.000000           7.325000   
1461        10.000000       100.000000           0.000000   

      meanpressure_actual  meantemp_predicted  humidity_predicted  \
0             1015.666667           10.666696           81.581983   
1             1017.800000            8.106037           88.498134   
2             1018.666667            7.798526           84.3

In [7]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd


# Creating a 2x2 subplot figure
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Mean Temperature", "Humidity", "Wind Speed", "Mean Pressure"),
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}]]
)

# Plotting Mean Temperature
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['meantemp_actual'], mode='lines', name='Actual Temp', line=dict(color='blue')),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['meantemp_predicted'], mode='lines', name='Predicted Temp', line=dict(color='red')),
    row=1, col=1
)

# Plotting Humidity
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['humidity_actual'], mode='lines', name='Actual Humidity', line=dict(color='blue')),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['humidity_predicted'], mode='lines', name='Predicted Humidity', line=dict(color='red')),
    row=1, col=2
)

# Plotting Wind Speed
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['wind_speed_actual'], mode='lines', name='Actual Wind Speed', line=dict(color='blue')),
    row=2, col=1
)
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['wind_speed_predicted'], mode='lines', name='Predicted Wind Speed', line=dict(color='red')),
    row=2, col=1
)

# Plotting Mean Pressure
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['meanpressure_actual'], mode='lines', name='Actual Pressure', line=dict(color='blue')),
    row=2, col=2
)
fig.add_trace(
    go.Scatter(x=df_comparison['date'], y=df_comparison['meanpressure_predicted'], mode='lines', name='Predicted Pressure', line=dict(color='red')),
    row=2, col=2
)

# Update layout
fig.update_layout(
    title="Actual vs Predicted Values for Weather Parameters",
    xaxis_title="Date",
    yaxis_title="Value",
    legend=dict(x=1.05, y=1),
    template="plotly_white",
    height=800
)

# Show the plot
fig.show()


In [8]:
import plotly.graph_objs as go
import pandas as pd

# Function to plot individual weather parameters
def plot_weather_parameter(actual, predicted, title, yaxis_title):
    fig = go.Figure()

    # Plot actual values
    fig.add_trace(
        go.Scatter(x=df_comparison['date'], y=actual, mode='lines', name='Actual', line=dict(color='blue'))
    )

    # Plot predicted values
    fig.add_trace(
        go.Scatter(x=df_comparison['date'], y=predicted, mode='lines', name='Predicted', line=dict(color='red'))
    )

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Date",
        yaxis_title=yaxis_title,
        template="plotly_white"
    )

    # Show the plot
    fig.show()

# Plot Mean Temperature
plot_weather_parameter(
    actual=df_comparison['meantemp_actual'],
    predicted=df_comparison['meantemp_predicted'],
    title="Actual vs Predicted Mean Temperature",
    yaxis_title="Mean Temperature"
)

# Plot Humidity
plot_weather_parameter(
    actual=df_comparison['humidity_actual'],
    predicted=df_comparison['humidity_predicted'],
    title="Actual vs Predicted Humidity",
    yaxis_title="Humidity"
)

# Plot Wind Speed
plot_weather_parameter(
    actual=df_comparison['wind_speed_actual'],
    predicted=df_comparison['wind_speed_predicted'],
    title="Actual vs Predicted Wind Speed",
    yaxis_title="Wind Speed"
)

# Plot Mean Pressure
plot_weather_parameter(
    actual=df_comparison['meanpressure_actual'],
    predicted=df_comparison['meanpressure_predicted'],
    title="Actual vs Predicted Mean Pressure",
    yaxis_title="Mean Pressure"
)


In [9]:
import plotly.express as px
import pandas as pd


# Plot Mean Temperature
fig_meantemp_actual = px.line(df_comparison, x='date', y='meantemp_actual', 
                              title="Actual Mean Temperature Over Time", 
                              labels={'meantemp_actual': 'Mean Temperature'})
fig_meantemp_actual.show()

fig_meantemp_predicted = px.line(df_comparison, x='date', y='meantemp_predicted', 
                                 title="Predicted Mean Temperature Over Time", 
                                 labels={'meantemp_predicted': 'Mean Temperature'})
fig_meantemp_predicted.show()




In [10]:
# Plot Humidity
fig_humidity_actual = px.line(df_comparison, x='date', y='humidity_actual', 
                              title="Actual Humidity Over Time", 
                              labels={'humidity_actual': 'Humidity'})
fig_humidity_actual.show()
fig_humidity_predicted = px.line(df_comparison, x='date', y='humidity_predicted', 
                                 title="Predicted Humidity Over Time", 
                                 labels={'humidity_predicted': 'Humidity'})
fig_humidity_predicted.show()




In [11]:
# Plot Wind Speed
fig_wind_speed_actual = px.line(df_comparison, x='date', y='wind_speed_actual', 
                                title="Actual Wind Speed Over Time", 
                                labels={'wind_speed_actual': 'Wind Speed'})
fig_wind_speed_actual.show()

fig_wind_speed_predicted = px.line(df_comparison, x='date', y='wind_speed_predicted', 
                                   title="Predicted Wind Speed Over Time", 
                                   labels={'wind_speed_predicted': 'Wind Speed'})
fig_wind_speed_predicted.show()



In [12]:
# Plot Mean Pressure
fig_meanpressure_actual = px.line(df_comparison, x='date', y='meanpressure_actual', 
                                  title="Actual Mean Pressure Over Time", 
                                  labels={'meanpressure_actual': 'Mean Pressure'})
fig_meanpressure_actual.show()

fig_meanpressure_predicted = px.line(df_comparison, x='date', y='meanpressure_predicted', 
                                     title="Predicted Mean Pressure Over Time", 
                                     labels={'meanpressure_predicted': 'Mean Pressure'})
fig_meanpressure_predicted.show()

In [13]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score


def calculate_metrics(y_true, y_pred, label):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    explained_variance = explained_variance_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
    print(f"Metrics for {label}:")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R-squared (R²): {r2:.2f}")
    print(f"Explained Variance Score: {explained_variance:.2f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%\n")

# Calculate metrics for Mean Temperature
calculate_metrics(df_comparison['meantemp_actual'], df_comparison['meantemp_predicted'], "Mean Temperature")

# Calculate metrics for Humidity
calculate_metrics(df_comparison['humidity_actual'], df_comparison['humidity_predicted'], "Humidity")

# Calculate metrics for Wind Speed
# calculate_metrics(df_comparison['wind_speed_actual'], df_comparison['wind_speed_predicted'], "Wind Speed")

# Calculate metrics for Mean Pressure
calculate_metrics(df_comparison['meanpressure_actual'], df_comparison['meanpressure_predicted'], "Mean Pressure")


Metrics for Mean Temperature:
Mean Squared Error (MSE): 0.07
Root Mean Squared Error (RMSE): 0.27
Mean Absolute Error (MAE): 0.21
R-squared (R²): 1.00
Explained Variance Score: 1.00
Mean Absolute Percentage Error (MAPE): 1.02%

Metrics for Humidity:
Mean Squared Error (MSE): 4.65
Root Mean Squared Error (RMSE): 2.16
Mean Absolute Error (MAE): 1.74
R-squared (R²): 0.98
Explained Variance Score: 0.98
Mean Absolute Percentage Error (MAPE): 3.57%

Metrics for Mean Pressure:
Mean Squared Error (MSE): 32367.41
Root Mean Squared Error (RMSE): 179.91
Mean Absolute Error (MAE): 13.66
R-squared (R²): 0.00
Explained Variance Score: 0.00
Mean Absolute Percentage Error (MAPE): 29.32%



In [14]:
df_zero_wind = df[df['wind_speed'] == 0]
print(df_zero_wind.info())

<class 'pandas.core.frame.DataFrame'>
Index: 26 entries, 0 to 1461
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               26 non-null     datetime64[ns]
 1   meantemp           26 non-null     float64       
 2   humidity           26 non-null     float64       
 3   wind_speed         26 non-null     float64       
 4   meanpressure       26 non-null     float64       
 5   year               26 non-null     int32         
 6   month              26 non-null     int32         
 7   meantemp_next      26 non-null     float64       
 8   humidity_next      26 non-null     float64       
 9   wind_speed_next    26 non-null     float64       
 10  meanpressure_next  26 non-null     float64       
dtypes: datetime64[ns](1), float64(8), int32(2)
memory usage: 2.2 KB
None
