In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import os

In [18]:
# Pre-processing: report where the notebook is running, configure plotting,
# then load the hourly weather table.
print("[INFO] Current working directory:", os.getcwd())

# Plot appearance: seaborn white-grid theme and a wide default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

# The path is relative to the working directory printed above; the "date"
# column is parsed into datetimes while reading.
file_path = Path("../../data/processed/full_hourly_weather.csv")
df = pd.read_csv(file_path, parse_dates=["date"])
print("[INFO] Data loaded:", df.shape)

[INFO] Current working directory: c:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\notebooks\pre_process
[INFO] Data loaded: (747768, 50)


In [19]:
# Discard the first rows of the table and keep an independent copy of the rest.
# NOTE(review): the reason for skipping exactly 14 rows is not recorded here — confirm.
LEADING_ROWS_TO_DROP = 14
df = df.iloc[LEADING_ROWS_TO_DROP:].copy()


In [20]:
# Drop the columns that contain no data.
drop_cols = ["albedo", "snow_depth_water_equivalent"]
df = df.drop(columns=drop_cols)


In [42]:
# maybe_drop = ["snow_depth"] # missing_count: 88273
# df.drop(columns=maybe_drop, inplace=True)

In [43]:
# maybe_drop = ["boundary_layer_height", "total_column_integrated_water_vapour"] # missing_count: 4375
# df.drop(columns=maybe_drop, inplace=True)

In [21]:
# Missing-value report: per-column null count and its share of all rows (in %),
# restricted to columns that actually have gaps, largest gap first.
missing = df.isna().sum()
missing_percent = missing.div(len(df)).mul(100).round(2)
missing_df = (
    pd.DataFrame({"missing_count": missing, "missing_%": missing_percent})
    .loc[lambda table: table["missing_count"] > 0]
    .sort_values(by="missing_count", ascending=False)
)
print("\n[INFO] Missing values:")
print(missing_df)


[INFO] Missing values:
                                      missing_count  missing_%
snow_depth                                    88259      11.80
boundary_layer_height                          4368       0.58
total_column_integrated_water_vapour           4368       0.58


In [22]:
# Fill the small gaps using linear interpolation or the nearest available value.
# NOTE(review): the missing-value report above lists snow_depth at 11.80 % missing,
# which is not a small gap — confirm that interpolating it is acceptable.
# method="time" interpolates against the index values, so "date" becomes the index first.
df.set_index("date", inplace=True)
df.interpolate(method="time", limit_direction="both", inplace=True)
# Fallback for anything interpolation left empty: carry the previous valid value
# forward, then the next valid value backward.
df.ffill(inplace=True)
df.bfill(inplace=True)
# The printed number is the total count of null cells over all columns
# (the Vietnamese label reads "number of rows still missing").
print("[INFO] Số dòng còn thiếu: ", df.isnull().sum().sum())


[INFO] Số dòng còn thiếu:  0


In [None]:
# Remove the weather_code column from the working frame.
drop_cols = ["weather_code"]
df = df.drop(columns=drop_cols)

In [24]:
# Move the index back into the columns (it was set to "date" earlier), make sure
# "date" is a datetime column, and derive the calendar month number from it.
df = df.reset_index()
df = df.assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    month=lambda frame: frame["date"].dt.month,
)

def month_to_season(month):
    """Return the season code for a calendar month number.

    Codes: 1 = Spring (Mar-May), 2 = Summer (Jun-Aug), 3 = Autumn (Sep-Nov),
    4 = Winter (Dec-Feb). Any value that is not a month in 1..12 yields None.
    """
    season_table = (
        ((3, 4, 5), 1),     # Spring
        ((6, 7, 8), 2),     # Summer
        ((9, 10, 11), 3),   # Autumn
        ((12, 1, 2), 4),    # Winter
    )
    for months, code in season_table:
        if month in months:
            return code
    return None

# Season code per row, using the same mapping as month_to_season
# (Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3, Dec-Feb -> 4) but computed with
# vectorized integer arithmetic instead of one Python call per row.
season_code = ((df["month"] - 3) % 12) // 3 + 1

# The helper month column is only needed to derive the season.
df = df.drop(columns="month")

# Encode the 4-step season cycle as a point on the unit circle so that the
# last season (4) is adjacent to the first (1).
season_angle = 2 * np.pi * season_code / 4

# Put the two season features at positions 1 and 2, i.e. right after the
# leading date column.
df.insert(1, "season_sin", np.sin(season_angle))
df.insert(2, "season_cos", np.cos(season_angle))

In [25]:
def _encode_direction(frame, column):
    """Replace a direction column given in degrees by its sine and cosine, in place.

    Two columns named ``<column>_sin`` and ``<column>_cos`` are inserted where
    the original column sits, and the original column is then removed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column holding angles in degrees.
    """
    angle_rad = np.deg2rad(frame[column])
    # Look the position up at insertion time. Caching every position up front
    # goes stale once earlier insertions shift the later columns, which would
    # interleave the sin/cos columns of different source columns.
    slot = frame.columns.get_loc(column)
    frame.insert(slot + 1, f"{column}_sin", np.sin(angle_rad))
    frame.insert(slot + 2, f"{column}_cos", np.cos(angle_rad))
    frame.drop(columns=column, inplace=True)


# Sine/cosine encoding makes 0 degrees and 360 degrees map to the same point.
for direction_column in ("wind_direction_10m", "wind_direction_100m"):
    _encode_direction(df, direction_column)

In [26]:
# Write the processed frame to its own CSV (row index omitted) and report the
# absolute location of the written file.
output_path = Path("../../data/processed/clean_hourly_weather.csv")
df.to_csv(output_path, index=False)
saved_location = output_path.resolve()
print(f"[INFO] Copy saved to: {saved_location}")

[INFO] Copy saved to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\processed\clean_hourly_weather.csv
