In [None]:
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive at /content/drive; the input folder and output CSV used later in this notebook live under it.
drive.mount('/content/drive')



In [None]:
# Province slugs listed in id order: the slug at position k (1-based) has id k.
# NOTE(review): no cell visible in this notebook reads PROVINCE_MAP any more.
_PROVINCE_SLUGS_IN_ID_ORDER = (
    "hanoi", "hue", "laichau", "dienbien", "sonla",                    # 1-5
    "langson", "quangninh", "thanhhoa", "nghean", "hatinh",            # 6-10
    "caobang", "tuyenquang", "hagiang", "laocai", "yenbai",            # 11-15
    "thainguyen", "backan", "phutho", "vinhphuc", "hoabinh",           # 16-20
    "bacninh", "bacgiang", "hungyen", "thaibinh", "haiphong",          # 21-25
    "haiduong", "ninhbinh", "hanam", "namdinh", "quangtri",            # 26-30
    "quangbinh", "danang", "quangnam", "quangngai", "kontum",          # 31-35
    "gialai", "binhdinh", "khanhhoa", "ninhthuan", "lamdong",          # 36-40
    "daknong", "binhthuan", "daklak", "phuyen", "hochiminh",           # 41-45
    "binhduong", "vungtau", "dongnai", "binhphuoc", "tayninh",         # 46-50
    "longan", "cantho", "soctrang", "haugiang", "vinhlong",            # 51-55
    "bentre", "travinh", "dongthap", "tiengiang", "camau",             # 56-60
    "baclieu", "angiang", "kiengiang",                                 # 61-63
)

# Lookup table: province slug -> integer id (1..63), in the same insertion order as the tuple.
PROVINCE_MAP = {
    slug: province_id
    for province_id, slug in enumerate(_PROVINCE_SLUGS_IN_ID_ORDER, start=1)
}


In [None]:
import pandas as pd
import os

FOLDER_PATH = "/content/drive/MyDrive/DATN/data/daily"   # Adjusted to point to the processed daily data
OUTPUT_PATH = "/content/drive/MyDrive/vietnam_weather_daily_latlong.csv"

# Merge every daily CSV in FOLDER_PATH into one long frame, sorted by
# date then coordinates, and write it to OUTPUT_PATH.
all_dfs = []

# os.listdir returns entries in arbitrary order; sort so files are always read
# in the same sequence from run to run.
for file in sorted(os.listdir(FOLDER_PATH)):
    if not file.endswith(".csv"):  # only CSV files are processed
        continue

    file_path = os.path.join(FOLDER_PATH, file)
    df = pd.read_csv(file_path)

    # Normalise the timestamp column: it must end up named "date" with a
    # datetime dtype, whether the file calls it "time" or already "date".
    if "time" in df.columns:
        df["time"] = pd.to_datetime(df["time"])
        df = df.rename(columns={"time": "date"})
    elif "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"])
    else:
        # Fail here, naming the file, instead of later at sort_values.
        raise KeyError(f"{file_path}: expected a 'time' or 'date' column")

    # No province_id is attached here; rows are identified by the
    # latitude/longitude columns that the sort below relies on.
    all_dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# report the actual problem (wrong folder / no CSVs) instead.
if not all_dfs:
    raise ValueError(f"No .csv files found in {FOLDER_PATH}")

# Stack all files into a single frame.
merged_df = pd.concat(all_dfs, ignore_index=True)

# Order rows as a time series: by date, then by location.
merged_df = merged_df.sort_values(["date", "latitude", "longitude"]).reset_index(drop=True)

# Drop 'province_id' if any input file still carries it.
if 'province_id' in merged_df.columns:
    merged_df = merged_df.drop(columns=['province_id'])

# Save the merged file.
merged_df.to_csv(OUTPUT_PATH, index=False)

# Report how many files were actually merged rather than a fixed 63
# (assumes one CSV per province - TODO confirm).
print(f"✅ Gộp dữ liệu {len(all_dfs)} tỉnh thành công")
print("Shape:", merged_df.shape)

✅ Gộp dữ liệu 63 tỉnh thành công
Shape: (115101, 23)


In [None]:
# Preview the first rows of the merged frame (head() shows 5 rows by default).
merged_df.head()

Unnamed: 0,date,temperature_2m_mean (°C),temperature_2m_max (°C),temperature_2m_min (°C),apparent_temperature_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),dew_point_2m_mean (°C),precipitation_sum (mm),rain_sum (mm),...,wind_gusts_10m_mean (km/h),wind_speed_10m_mean (km/h),winddirection_10m_dominant (°),surface_pressure_mean (hPa),pressure_msl_mean (hPa),daylight_duration (s),sunshine_duration (s),weather_code (wmo code),latitude,longitude
0,2020-01-01,27.9,32.9,23.9,28.6,33.5,24.9,18.1,0.0,0.0,...,34.0,17.1,70,1013.3,1014.1,41751.15,38584.04,3,9.173989,105.12998
1,2020-01-01,27.5,32.0,23.4,28.1,32.7,24.6,18.5,0.0,0.0,...,38.0,19.1,66,1013.7,1014.2,41721.83,38335.0,3,9.314587,105.703125
2,2020-01-01,26.8,30.8,22.2,27.8,31.6,24.1,18.8,0.0,0.0,...,33.4,16.7,67,1013.3,1014.3,41663.12,38320.22,3,9.595782,105.993034
3,2020-01-01,26.9,31.7,22.3,28.3,33.3,24.1,18.7,0.0,0.0,...,28.0,14.3,73,1013.8,1014.3,41648.43,38215.91,3,9.66608,105.53619
4,2020-01-01,26.5,30.2,21.8,28.3,32.1,24.4,19.7,0.0,0.0,...,29.4,15.5,70,1013.7,1014.3,41589.61,38011.42,3,9.947276,106.37795


In [None]:
# Sanity checks on the merged frame: rows per location and missing values per column.

# The merge cell drops 'province_id', so grouping by it raises KeyError on a
# fresh kernel. Use it only if it is present; otherwise identify each location
# by its coordinates.
location_keys = (
    ["province_id"] if "province_id" in merged_df.columns else ["latitude", "longitude"]
)
rows_per_location = merged_df.groupby(location_keys).size()

# Only the last expression of a cell is displayed, so print the first result explicitly.
print(rows_per_location.head())

# Columns with the most missing values first.
merged_df.isna().sum().sort_values(ascending=False).head()

Unnamed: 0,0
date,0
temperature_2m_mean (°C),0
temperature_2m_max (°C),0
temperature_2m_min (°C),0
apparent_temperature_mean (°C),0


In [None]:
import numpy as np

# Cyclical encoding of the position within the year: the day of year is mapped
# to an angle and stored as its sine and cosine.
merged_df["dayofyear"] = merged_df["date"].dt.dayofyear

# Use each row's real year length as the period. With a fixed 365, day 366 of a
# leap year wraps past a full turn and gets exactly the same sin/cos as 1 January.
days_in_year = 365 + merged_df["date"].dt.is_leap_year.astype(int)
year_angle = 2 * np.pi * merged_df["dayofyear"] / days_in_year

merged_df["sin_doy"] = np.sin(year_angle)
merged_df["cos_doy"] = np.cos(year_angle)


In [None]:
# Preview the frame again now that the dayofyear, sin_doy and cos_doy columns have been added.
merged_df.head()

Unnamed: 0,date,temperature_2m_mean (°C),temperature_2m_max (°C),temperature_2m_min (°C),apparent_temperature_mean (°C),apparent_temperature_max (°C),apparent_temperature_min (°C),dew_point_2m_mean (°C),precipitation_sum (mm),rain_sum (mm),...,winddirection_10m_dominant (°),surface_pressure_mean (hPa),pressure_msl_mean (hPa),daylight_duration (s),sunshine_duration (s),weather_code (wmo code),province_id,dayofyear,sin_doy,cos_doy
0,2020-01-01,19.9,23.0,18.3,21.6,23.9,20.1,17.6,4.3,4.3,...,138,1021.7,1023.7,39149.71,21243.86,55,1,1,0.017213,0.999852
1,2020-01-02,21.1,25.4,18.5,23.0,27.0,20.6,18.3,0.6,0.6,...,141,1020.0,1022.1,39167.91,11812.2,51,1,2,0.034422,0.999407
2,2020-01-03,21.5,25.4,19.8,23.6,27.4,21.7,19.2,1.4,1.4,...,126,1018.1,1020.1,39187.71,12373.59,51,1,3,0.05162,0.998667
3,2020-01-04,21.1,23.5,19.4,22.8,25.3,21.2,19.2,4.3,4.3,...,121,1016.2,1018.2,39209.06,8332.28,53,1,4,0.068802,0.99763
4,2020-01-05,21.5,25.6,19.0,23.0,26.5,20.5,19.2,2.6,2.6,...,131,1015.0,1017.0,39231.95,22847.37,53,1,5,0.085965,0.996298
