In [13]:
# Pre-processing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import os
# Plot defaults for the notebook: seaborn "whitegrid" theme, 14x6 default figure size.
plt.rcParams["figure.figsize"] = (14, 6)
sns.set_theme(style="whitegrid")

# The data path below is relative, so report the working directory it is resolved against.
print("[INFO] Current working directory:", os.getcwd())

# Load the raw daily CSV, parsing the "date" column as datetimes, and report its shape.
file_path = Path("../../data/raw/openmeteo_historical_daily.csv")
df = pd.read_csv(file_path, parse_dates=["date"])
print("[INFO] Data loaded:", df.shape)

[INFO] Current working directory: c:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\notebooks\pre_process
[INFO] Data loaded: (14976, 67)


In [14]:
# Drop columns that carry no data (per the original note). The list is
# hard-coded rather than derived from the frame; a label that is not present
# in df raises KeyError.
drop_cols = [
    "cape_mean", "cape_max", "cape_min",
    "precipitation_probability_min", "precipitation_probability_mean",
    "updraft_max", "visibility_mean", "visibility_min",
    "visibility_max", "soil_moisture_0_to_10cm_mean",
]
# Rebind df to the returned frame instead of using inplace=True, so the step
# is an explicit assignment and does not silently mutate any other reference
# to the loaded frame.
df = df.drop(columns=drop_cols)


In [15]:
# Missing-value report: per-column null count and that count as a percentage
# of the number of rows (rounded to 2 decimals).
missing = df.isnull().sum()
missing_percent = (missing / len(df) * 100).round(2)

# Keep only columns that actually have nulls, worst offenders first.
missing_df = (
    pd.DataFrame({"missing_count": missing, "missing_%": missing_percent})
    .loc[lambda report: report["missing_count"] > 0]
    .sort_values(by="missing_count", ascending=False)
)

print("\n[INFO] Missing values:")
print(missing_df)


[INFO] Missing values:
Empty DataFrame
Columns: [missing_count, missing_%]
Index: []


In [16]:
# The message reports rows ("dòng"), so count the rows that contain at least
# one missing value. The previous expression, df.isnull().sum().sum(), counted
# individual missing cells, which over-reports whenever a row has several nulls.
print("[INFO] Số dòng còn thiếu: ", (df.isnull().sum(axis=1) > 0).sum())


[INFO] Số dòng còn thiếu:  0


In [17]:
# Remove the weather code and the sunrise/sunset columns from the frame.
# A label that is not present in df raises KeyError.
drop_cols = ["weather_code", "sunset", "sunrise"]
# Rebind df to the returned frame instead of mutating it with inplace=True.
df = df.drop(columns=drop_cols)

In [18]:
# Make sure "date" holds datetimes before the .dt accessor is used on it.
df = df.assign(date=pd.to_datetime(df["date"]))


In [19]:
# Add the month number taken from "date" as a new "month" column.
df = df.assign(month=df["date"].dt.month)


In [20]:
def month_to_season(month):
    """Map a month number to a season code.

    Codes: 1 = spring (months 3-5), 2 = summer (6-8), 3 = autumn (9-11),
    4 = winter (12, 1, 2).

    Returns:
        The season code, or None when ``month`` matches none of the
        twelve month numbers.
    """
    season_table = (
        (4, (12, 1, 2)),    # Winter
        (1, (3, 4, 5)),     # Spring
        (2, (6, 7, 8)),     # Summer
        (3, (9, 10, 11)),   # Autumn
    )
    for season_code, season_months in season_table:
        if month in season_months:
            return season_code
    return None

# Derive the season code for every row from its month number.
df = df.assign(season=df["month"].apply(month_to_season))


In [21]:
# Drop the "month" column; rebind df to the returned frame rather than
# mutating it with inplace=True.
df = df.drop(columns="month")


In [22]:
import numpy as np


# Cyclical encoding of the season code: map it to an angle with period 4 and
# store its sine and cosine, then drop the raw "season" column.
df = (
    df.assign(
        season_sin=lambda frame: np.sin(2 * np.pi * frame['season'] / 4),
        season_cos=lambda frame: np.cos(2 * np.pi * frame['season'] / 4),
    )
    .drop(columns='season')
)

In [23]:
# Move the two season columns to positions 1 and 2, i.e. directly after the
# first column (the date column, per the original note).
cols = list(df.columns)
for _slot, _label in enumerate(("season_sin", "season_cos"), start=1):
    cols.remove(_label)         # take the label out of its current position
    cols.insert(_slot, _label)  # and put it back at its target position
df = df[cols]


In [24]:
# Save the processed frame as a new CSV (without the index) for the cleaning stage.
output_path = Path("../../data/processed/clean_daily_weather.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"[INFO] Copy saved to: {output_path.resolve()}")

[INFO] Copy saved to: C:\Users\ADMIN\MyProject\School_Projects\WeatherForecastML-DM\data\processed\clean_daily_weather.csv
