## Machine Learning Multivariate Analysis for Weather Forecasting

In [2]:
%matplotlib inline

# import required libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [5]:
# Location of the Hanoi weather CSV inside the Kaggle input directory
csv_path = '/kaggle/input/hanoi-weather-data/hanoi 2015-01-01 to 2025-07-03.csv'
df = pd.read_csv(csv_path)

# Parse the timestamp column; with errors='coerce' any value that cannot be
# parsed becomes NaT instead of raising.
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

In [6]:
# suppress warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

In [7]:
# Accumulator for per-model evaluation records (one dict per model)
test_ar = list()

In [8]:
# Accuracy metrics
def accuracy_metrics(forecast, actual):
    """Print and return MAPE, MAE and RMSE for a forecast.

    Both inputs are converted to float NumPy arrays first, so lists and
    pandas objects are compared position by position (pandas objects with
    different indexes would otherwise be index-aligned and yield NaN).

    Note: a later cell in this notebook defines another function with the
    same name that reports MAPE as a percentage; this version returns MAPE
    as a fraction.

    Parameters
    ----------
    forecast : array-like
        Predicted values.
    actual : array-like
        Observed values, same shape as ``forecast``. Zeros in ``actual``
        make the MAPE infinite or NaN because of the division.

    Returns
    -------
    tuple
        ``(mape, mae, rmse)`` where ``mape`` is a fraction (not multiplied
        by 100).
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)

    errors = forecast - actual
    mape = np.mean(np.abs(errors)/np.abs(actual))  # MAPE
    mae = np.mean(np.abs(errors))    # MAE
    rmse = np.mean(errors**2)**.5  # RMSE
    print('Mean Absolute Percent Error ', mape, 'Mean Absolute Error', mae, 'Root Mean Square Error (rmse) ' , rmse)
    return mape, mae, rmse

In [10]:
# Show the column names and the overall size of the raw frame
features = df.columns
print("Features:", features)
print("Dataset Shape:", df.shape)

# Only float64/int64 columns are scaled; every other column is left out
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Min-max scale each numeric column into [0.1, 1.1]. The scaler is fitted on
# all rows of the frame.
# NOTE(review): the non-zero lower bound presumably keeps the later
# percentage-error division away from zero -- confirm.
scaler = MinMaxScaler(feature_range=(0.1, 1.1))

# Wrap the scaled array back into a DataFrame with the original labels
df_scaled = pd.DataFrame(
    scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=df.index,
)
df_scaled

Features: Index(['name', 'datetime', 'tempmax', 'tempmin', 'temp', 'feelslikemax',
       'feelslikemin', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'preciptype', 'snow', 'snowdepth', 'windgust',
       'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
       'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'sunrise',
       'sunset', 'moonphase', 'conditions', 'description', 'icon', 'stations'],
      dtype='object')
Dataset Shape: (3719, 33)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipprob,...,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,moonphase
0,0.574777,0.319124,0.475439,0.459551,0.342820,0.414904,0.475817,0.574601,0.100000,0.1,...,0.238004,0.323889,0.800651,0.104,0.457664,0.710201,0.712903,0.8,,0.467347
1,0.515430,0.303187,0.426316,0.414607,0.332376,0.381250,0.475817,0.599274,0.100000,0.1,...,0.257113,0.381389,0.785466,0.381,0.501460,0.679907,0.680645,0.8,,0.497959
2,0.485757,0.386853,0.450877,0.392135,0.387206,0.398077,0.573856,0.741509,0.100000,0.1,...,0.238004,0.370000,0.726898,0.509,0.362774,0.457032,0.454839,0.5,,0.538776
3,0.524332,0.538247,0.531579,0.421348,0.486423,0.453365,0.694771,0.837300,0.101811,1.1,...,0.318684,0.372500,0.585900,0.980,0.391971,0.298764,0.300717,0.4,,0.569388
4,0.545104,0.601992,0.570175,0.437079,0.528198,0.479808,0.769935,0.930189,0.100000,0.1,...,0.238004,0.338333,0.536009,1.000,0.231387,0.389645,0.383154,0.5,,0.610204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3714,0.663798,0.729482,0.689474,0.576404,0.611749,0.566346,0.884314,0.944702,0.100625,1.1,...,0.295329,0.416389,0.479610,1.067,0.413869,0.652705,0.651971,0.8,0.100000,0.212245
3715,0.723145,0.769323,0.728070,0.637079,0.637859,0.602404,0.890850,0.885196,0.100015,1.1,...,0.397240,0.445556,0.447072,1.002,0.413869,0.692272,0.691398,0.9,0.152632,0.253061
3716,0.693472,0.701594,0.703509,0.607865,0.593473,0.578365,0.861438,0.867779,0.104830,1.1,...,0.435456,0.418056,0.486117,1.065,0.340876,0.252705,0.250538,0.3,0.152632,0.283673
3717,0.456083,0.462550,0.461404,0.369663,0.436815,0.405288,0.501961,0.573149,0.106340,1.1,...,0.550106,0.261111,0.703037,1.100,0.742336,0.169243,0.168100,0.2,0.152632,0.355102


In [13]:
# Build a supervised dataset from lagged observations: each sample holds the
# previous `lags_size` values of every selected feature, and the target is the
# value of every selected feature at the following time step.

# Selecting relevant weather features (excluding categorical and redundant data)
selected_features = ["temp", "humidity", "sealevelpressure", "windspeed"]
df_selected = df_scaled[selected_features]

# Number of target variables predicted per sample
n_outputs = len(selected_features)

# Number of past time steps per feature used as model input. Defined here so
# the cell runs on a fresh kernel; 6 reproduces the recorded output shape
# (6 lags x 4 features = 24 input columns).
lags_size = 6

# Ensure `n_outputs` is valid
if n_outputs <= 0:
    print("Error: No valid features selected for n_outputs. Please check feature selection.")
else:
    # Define sample size.
    # NOTE(review): the extra `- 1` means the last row is never used as a
    # target; kept as-is so the recorded results stay reproducible.
    num_samples = df_selected.shape[0] - lags_size - 1

    # Ensure there are enough samples
    if num_samples > 0:
        values = df_selected.to_numpy(dtype=float)

        # window_rows[i, k] = i + k, the row index of lag k for sample i
        window_rows = np.arange(num_samples)[:, None] + np.arange(lags_size)[None, :]

        # values[window_rows] is (sample, lag, feature); swapping the last two
        # axes and flattening puts feature j / lag k in column j * lags_size + k
        a = values[window_rows].transpose(0, 2, 1).reshape(num_samples, lags_size * n_outputs)

        # Target: the row immediately after each sample's lag window
        y_new = values[lags_size:lags_size + num_samples].copy()

        # Convert to DataFrame for better usability
        X_new = pd.DataFrame(a)

        # Print shape of input dataset
        print("Input Feature Set Shape:", X_new.shape)

        # Chronological split: hold out the last `test_size` samples for
        # testing and train on everything before them
        test_size = 10
        training_size = X_new.shape[0] - test_size

        x_train = X_new.iloc[:training_size]
        x_test = X_new.iloc[training_size: training_size + test_size]
        y_train = y_new[:training_size]
        y_test = y_new[training_size:training_size + test_size]

        # Print training and testing set shapes
        print("Training Set Shape (X):", x_train.shape)
        print("Test Set Shape (X):", x_test.shape)
        print("Training Set Shape (Y):", y_train.shape)
        print("Test Set Shape (Y):", y_test.shape)
    else:
        print("Error: Not enough samples to proceed with the transformation.")


Input Feature Set Shape: (3712, 24)
Training Set Shape (X): (3702, 24)
Test Set Shape (X): (10, 24)
Training Set Shape (Y): (3702, 4)
Test Set Shape (Y): (10, 4)


### Random Forest

In [21]:
# Import necessary library for RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Hyperparameters for the random forest, collected in one place
rf_params = dict(
    random_state=0,
    n_estimators=500,
    max_depth=None,
    # Cap at 20 but never ask for more features than the input provides
    max_features=min(20, x_train.shape[1]),
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=False,
)
model_rf = RandomForestRegressor(**rf_params)

# Train on the training split, then predict the held-out samples
# (fit returns the estimator itself, so the calls can be chained)
y_pred_rf = model_rf.fit(x_train, y_train).predict(x_test)

# Define accuracy metrics function
def accuracy_metrics(y_pred, y_true):
    """Return MAPE (in percent), MAE and RMSE for a set of predictions.

    Note: this replaces the function of the same name defined in an earlier
    cell. Unlike that one, it reports MAPE multiplied by 100 and does not
    print anything.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_true : array-like
        Observed values, same shape as ``y_pred``. Zeros in ``y_true`` make
        the MAPE infinite or NaN because of the division.

    Returns
    -------
    tuple
        ``(mape, mae, rmse)``.
    """
    # Coerce once so that the MAPE arithmetic accepts the same array-likes
    # (lists, pandas objects) as the scikit-learn metric functions below.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)

    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # Mean Absolute Percentage Error
    mae = mean_absolute_error(y_true, y_pred)  # Mean Absolute Error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # Root Mean Squared Error
    return mape, mae, rmse

# Score the random forest predictions against the held-out targets
mape, mae, rmse = accuracy_metrics(y_pred_rf, y_test)

# Start the results list afresh with the random forest record
test_ar = [dict(label='RandomForestRegressor', mape=mape, mae=mae, rmse=rmse)]

# Tabulate the results collected so far
results_df = pd.DataFrame(test_ar)
results_df


Unnamed: 0,label,mape,mae,rmse
0,RandomForestRegressor,14.505748,0.074134,0.095029


### Linear Regression

In [22]:
# Linear Regression baseline (LinearRegression is imported in the top import cell)

# Define the Linear Regression model; fit_intercept=False fits without a
# constant term
model_lr = LinearRegression(fit_intercept=False)

# Fit the model
model_lr.fit(x_train, y_train)

# Predict on test data
y_pred_lr = model_lr.predict(x_test)

# Calculate metrics
mape, mae, rmse = accuracy_metrics(y_pred_lr, y_test)

# Store results. Any record left by a previous run of this cell is dropped
# first, so re-running the cell does not add a duplicate row.
test_ar = [record for record in test_ar if record['label'] != 'LinearRegression']
test_ar.append({'label': 'LinearRegression', 'mape': mape, 'mae': mae, 'rmse': rmse})

# Display updated results
results_df = pd.DataFrame(test_ar)
results_df

Unnamed: 0,label,mape,mae,rmse
0,RandomForestRegressor,14.505748,0.074134,0.095029
1,LinearRegression,12.814262,0.062376,0.085899


In [28]:
# Import necessary libraries
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Reset results list: the table produced by this cell only contains the
# multi-output models defined below
test_ar = []

# Each base estimator is wrapped in MultiOutputRegressor, which fits one
# regressor per target column
model_gb = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=0))
model_svr = MultiOutputRegressor(SVR(kernel='rbf', C=100, epsilon=0.1))
model_knn = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=5, weights='distance'))

# (label, model) pairs, evaluated in this order
multi_output_models = [
    ('GradientBoosting (MultiOutput)', model_gb),
    ('SVR (MultiOutput)', model_svr),
    ('KNN (MultiOutput)', model_knn),
]

# Fit, predict and score every model with the same procedure
multi_output_preds = {}
for model_label, model in multi_output_models:
    model.fit(x_train, y_train)
    multi_output_preds[model_label] = model.predict(x_test)
    mape, mae, rmse = accuracy_metrics(multi_output_preds[model_label], y_test)
    test_ar.append({'label': model_label, 'mape': mape, 'mae': mae, 'rmse': rmse})

# Keep the per-model prediction names available for later cells
y_pred_gb = multi_output_preds['GradientBoosting (MultiOutput)']
y_pred_svr = multi_output_preds['SVR (MultiOutput)']
y_pred_knn = multi_output_preds['KNN (MultiOutput)']

results_df = pd.DataFrame(test_ar)
results_df

Unnamed: 0,label,mape,mae,rmse
0,GradientBoosting (MultiOutput),14.91938,0.074718,0.102588
1,SVR (MultiOutput),14.078086,0.073025,0.09684
2,KNN (MultiOutput),16.617498,0.08398,0.10955
