# Goals
* Menyamakan fitur dari tiap-tiap sumber adapun semua fitur yang telah diseleksi adalah sebagai berikut :
1. `Temperature` (C)
2. `Apparent Temperature / feels like` (C)
3. `Precipitation` (mm)
4. `Dewpoint` (C)
5. `Humidity` (%)
6. `Cloud` (Total)
7. `wind speed` (km/h)
8. `wind degree` (km/h)
9. `wind gust` (km/h)

* Menyamakan satuan tiap fitur, adapun yang perlu dilakukan :
1. Pada weatherapi.com ada pilihan field untuk satuan yang berbeda, yang perlu dilakukan hanya seleksi
2. Pada open-meteo juga ada pilihan query untuk memilih satuan, tidak perlu dilakukan pengubahan satuan ukur
3. Pada openweathermap.com tidak ada pilihan untuk mengubah satuan, yang perlu diganti adalah pada semua fitur `wind`

* Melakukan aggregasi

In [103]:
import os
import sys
import requests

from pyspark.sql import DataFrame
from dotenv import load_dotenv
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, row_number, lit, avg

In [41]:
# Load api key
load_dotenv()
FREEWEATHER_KEY = os.getenv("FREEWEATHER_KEY")
OPENWEATHERMAP_KEY = os.getenv("OPENWEATHERMAP_KEY")

# Tambahkan 2 baris ini agar spark mengenali env python
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [4]:
# Buat spark session terlebih dahulu
spark = SparkSession.builder \
    .appName("rdv-project") \
    .master("local[*]") \
    .getOrCreate()

In [None]:
"""
Malang sendiri sebenarnya tidak memiliki stasiun pengukuran dari BMKG
Oleh karena itu latitude dan longitude di bawah ini menunjukkan 
koordinat dari kota malang seluruhnya
"""
# --------------------------- Parameter API -------------------------

# Latitude & Longitude utk Malang
locations = {
    "Klojen":(-7.969421375342755, 112.6285308513895),
    "Blimbing":(-7.945926995201971, 112.64310740385885),
    "Sukun":(-7.987926536974757, 112.6100743302227),
    "Lowokwaru":(-7.9348455775950795, 112.60665573821122),
    "Lawang":(-7.828716329007881, 112.70181673602366),
    "Singosari":(-7.875350536265517, 112.64837233274164),
    "Kepanjen":(-8.11761411293624, 112.57906911889476),
    "Pakis":(-7.958180248694484, 112.71010117082125),
    "Wagir":(-7.980111689103741, 112.49397160866646),
    "Tumpang":(-8.007270828630284, 112.74768804369178)
}

# List Fitur untuk open-meteo
feature_list = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_gusts_10m",
    "apparent_temperature",
    "cloud_cover",
    "dew_point_2m"
]
features = ",".join(feature_list)

# Timestamp hari ini
today = datetime.today()
today_str = today.strftime("%Y-%m-%d")
timestamp_today = today.timestamp()

In [24]:
weatherapi_psdf = None

for district in locations.keys() :
    latitude = locations[district][0]
    longitude = locations[district][1]

    # recent weather url call for weatherapi
    weatherapi_url = f"http://api.weatherapi.com/v1/current.json?key={FREEWEATHER_KEY}&q={latitude},{longitude}"

    success = False
    attempts = 0
    
    # openmeteo call
    while not success and attempts < 3:
        try:
            print(f"Getting recent data for {district}")
            response = requests.get(weatherapi_url)
            if response.status_code == 200:
                data = response.json()['current']
                num_records = len(data)
                rows = [data]
                psdf = spark.read.json(spark.sparkContext.parallelize(rows))
                psdf = psdf.withColumn("district", lit(district)) \
                        .withColumn("latitude", lit(latitude)) \
                        .withColumn("longitude", lit(longitude))

                if weatherapi_psdf is None:
                    weatherapi_psdf = psdf
                else:
                    weatherapi_psdf = weatherapi_psdf.unionByName(psdf)

                if "_corrupt_record" in weatherapi_psdf.columns:
                    weatherapi_psdf = weatherapi_psdf.drop("_corrupt_record")
                
                success = True

            else :
                print(f"Can't fetch data, status code: {response.status_code}")
                print(f"Retrying... (Attempts {attempts+1}/3)")
                attempts += 1

        except Exception as e:
            print(f"Error while getting recent data for {district}. Message: {e}. Retrying...")
            print(f"Retrying... (Attempts {attempts+1}/3)")
            attempts += 1

Getting recent data for Klojen
Getting recent data for Blimbing
Getting recent data for Sukun
Getting recent data for Lowokwaru
Getting recent data for Lawang
Getting recent data for Singosari
Getting recent data for Kepanjen
Getting recent data for Pakis
Getting recent data for Wagir
Getting recent data for Tumpang


In [25]:
weatherapi_psdf.show()

+-----+--------------------+----------+----------+-----------+-----------+--------+--------+-----------+-----------+--------+------+----------------+------------------+---------+---------+-----------+-----------+------+------+---+------+---------+-----------+--------+--------+--------+-----------+-----------+---------+-------------------+------------------+
|cloud|           condition|dewpoint_c|dewpoint_f|feelslike_c|feelslike_f|gust_kph|gust_mph|heatindex_c|heatindex_f|humidity|is_day|    last_updated|last_updated_epoch|precip_in|precip_mm|pressure_in|pressure_mb|temp_c|temp_f| uv|vis_km|vis_miles|wind_degree|wind_dir|wind_kph|wind_mph|windchill_c|windchill_f| district|           latitude|         longitude|
+-----+--------------------+----------+----------+-----------+-----------+--------+--------+-----------+-----------+--------+------+----------------+------------------+---------+---------+-----------+-----------+------+------+---+------+---------+-----------+--------+--------+---

In [30]:
openmeteo_psdf = None

for district in locations.keys() :
    latitude = locations[district][0]
    longitude = locations[district][1]
    today = datetime.today()
    end_date = today.strftime("%Y-%m-%d")
    start_date = end_date

    # recent weather url call for openmeteo
    openmeteo_url = f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&hourly={features}&start_date={start_date}&end_date={end_date}"

    success = False
    attempts = 0

    # openmeteo call
    while not success and attempts < 3:
        try:
            print(f"Getting recent data for {district}")
            response = requests.get(openmeteo_url)
            if response.status_code == 200:
                data = response.json()['hourly']
                num_records = len(data['time'])
                rows = [
                    {key: data[key][i] for key in data}
                    for i in range(num_records)
                ]
                psdf = spark.read.json(spark.sparkContext.parallelize(rows))
                psdf = psdf.withColumn("district", lit(district)) \
                        .withColumn("latitude", lit(latitude)) \
                        .withColumn("longitude", lit(longitude))

                if openmeteo_psdf is None:
                    openmeteo_psdf = psdf
                else:
                    openmeteo_psdf = openmeteo_psdf.unionByName(psdf)

                if "_corrupt_record" in psdf.columns:
                    psdf = psdf.drop("_corrupt_record")
                
                success = True

            else :
                print(f"Can't fetch data, status code: {response.status_code}")
                print(f"Retrying... (Attempts {attempts+1}/3)")
                attempts += 1

        except Exception as e:
            print(f"Error while getting recent data for {district}. Message: {e}. Retrying...")
            print(f"Retrying... (Attempts {attempts+1}/3)")
            attempts += 1

Getting recent data for Klojen
Getting recent data for Blimbing
Getting recent data for Sukun
Getting recent data for Lowokwaru
Getting recent data for Lawang
Getting recent data for Singosari
Getting recent data for Kepanjen
Getting recent data for Pakis
Getting recent data for Wagir
Getting recent data for Tumpang


In [32]:
openmeteo_psdf.show()

+--------------------+-----------+------------+-------------+--------------------+--------------+----------------+------------------+--------------+--------------+--------+------------------+-----------------+
|apparent_temperature|cloud_cover|dew_point_2m|precipitation|relative_humidity_2m|temperature_2m|            time|wind_direction_10m|wind_gusts_10m|wind_speed_10m|district|          latitude|        longitude|
+--------------------+-----------+------------+-------------+--------------------+--------------+----------------+------------------+--------------+--------------+--------+------------------+-----------------+
|                27.5|         93|        22.1|          0.0|                  95|          23.0|2025-05-21T00:00|               297|           9.7|           3.2|  Klojen|-7.969421375342755|112.6285308513895|
|                29.4|        100|        22.1|          0.0|                  84|          25.0|2025-05-21T01:00|               241|           8.3|           3

In [83]:
openweathermap_psdf = None

for district in locations.keys():
    latitude = locations[district][0]
    longitude = locations[district][1]
    
    openweathermap_url = f"https://api.openweathermap.org/data/2.5/weather?lat={latitude}&lon={longitude}&appid={OPENWEATHERMAP_KEY}&units=metric"

    success = False
    attempts = 0

    while not success and attempts < 3:
        try:
            print(f"Getting recent data for {district}")
            response = requests.get(openweathermap_url)
            if response.status_code == 200:
                data = response.json()

                flat_weather = {
                    "temperature_c": data['main'].get("temp"),
                    "feels_like_c": data['main'].get("feels_like"),
                    "humidity_pct": data['main'].get("humidity"),
                    "cloud_total_pct": data['clouds'].get("all"),
                    "wind_speed_kmph": round(data['wind'].get("speed", 0) * 3.6, 2),
                    "wind_deg": data['wind'].get("deg"),
                    "wind_gust_kmph": round(data['wind'].get("gust", 0) * 3.6, 2)
                }

                psdf = spark.read.json(spark.sparkContext.parallelize([flat_weather]))
                psdf = psdf.withColumn("district", lit(district)) \
                           .withColumn("latitude", lit(latitude)) \
                           .withColumn("longitude", lit(longitude))

                if openweathermap_psdf is None:
                    openweathermap_psdf = psdf
                else:
                    openweathermap_psdf = openweathermap_psdf.unionByName(psdf)

                if "_corrupt_record" in openweathermap_psdf.columns:
                    openweathermap_psdf = openweathermap_psdf.drop("_corrupt_record")

                success = True

            else:
                print(f"Can't fetch data, status code: {response.status_code}")
                print(f"Retrying... (Attempts {attempts+1}/3)")
                attempts += 1

        except Exception as e:
            print(f"Error while getting recent data for {district}. Message: {e}. Retrying...")
            attempts += 1

Getting recent data for Klojen
Getting recent data for Blimbing
Getting recent data for Sukun
Getting recent data for Lowokwaru
Getting recent data for Lawang
Getting recent data for Singosari
Getting recent data for Kepanjen
Getting recent data for Pakis
Getting recent data for Wagir
Getting recent data for Tumpang


In [82]:
openweathermap_psdf.show()

+---------------+------------+------------+-------------+--------+--------------+---------------+--------+------------------+-----------------+
|cloud_total_pct|feels_like_c|humidity_pct|temperature_c|wind_deg|wind_gust_kmph|wind_speed_kmph|district|          latitude|        longitude|
+---------------+------------+------------+-------------+--------+--------------+---------------+--------+------------------+-----------------+
|             98|       37.23|          98|        30.23|     222|          4.43|           3.89|  Klojen|-7.969421375342755|112.6285308513895|
+---------------+------------+------------+-------------+--------+--------------+---------------+--------+------------------+-----------------+



# Aggregat

In [93]:
standard_col_map = {
    # Temperature
    "temperature_2m": "temperature_c",
    "temp_c": "temperature_c",
    "temperature_c": "temperature_c",

    # Feels like
    "feelslike_c": "feels_like_c",
    "feels_like_c": "feels_like_c",

    # Humidity
    "relative_humidity_2m": "humidity_pct",
    "humidity": "humidity_pct",
    "humidity_pct": "humidity_pct",

    # Wind speed
    "wind_speed_10m": "wind_speed_kmph",
    "wind_kph": "wind_speed_kmph",
    "wind_speed_kmph": "wind_speed_kmph",

    # Wind gust
    "wind_gusts_10m": "wind_gust_kmph",
    "gust_kph": "wind_gust_kmph",
    "wind_gust_kmph": "wind_gust_kmph",

    # Wind degree
    "wind_direction_10m": "wind_deg",
    "wind_degree": "wind_deg",
    "wind_deg": "wind_deg",

    # Cloud
    "cloud_cover": "cloud_total_pct",
    "cloud": "cloud_total_pct",
    "cloud_total_pct": "cloud_total_pct",

    # Lokasi
    "district": "district",
    "latitude": "latitude",
    "longitude": "longitude"
}

In [94]:
def standardize_columns(df, col_map):
    renamed_cols = [col(c).alias(col_map[c]) for c in df.columns if c in col_map]
    return df.select(*renamed_cols)

In [98]:
openmeteo_psdf = standardize_columns(openmeteo_psdf, standard_col_map)
weatherapi_psdf = standardize_columns(weatherapi_psdf, standard_col_map)
openweathermap_psdf = standardize_columns(openweathermap_psdf, standard_col_map)

In [100]:
openmeteo_psdf.show()

+---------------+------------+-------------+--------+--------------+---------------+--------+------------------+-----------------+
|cloud_total_pct|humidity_pct|temperature_c|wind_deg|wind_gust_kmph|wind_speed_kmph|district|          latitude|        longitude|
+---------------+------------+-------------+--------+--------------+---------------+--------+------------------+-----------------+
|             93|          95|         23.0|     297|           9.7|            3.2|  Klojen|-7.969421375342755|112.6285308513895|
|            100|          84|         25.0|     241|           8.3|            3.7|  Klojen|-7.969421375342755|112.6285308513895|
|            100|          74|         26.9|     203|           8.3|            2.7|  Klojen|-7.969421375342755|112.6285308513895|
|            100|          68|         28.4|     163|           9.4|            3.8|  Klojen|-7.969421375342755|112.6285308513895|
|            100|          65|         29.2|     159|          15.1|            7.0

In [101]:
weatherapi_psdf.show()

+---------------+------------+--------------+------------+-------------+--------+---------------+---------+-------------------+------------------+
|cloud_total_pct|feels_like_c|wind_gust_kmph|humidity_pct|temperature_c|wind_deg|wind_speed_kmph| district|           latitude|         longitude|
+---------------+------------+--------------+------------+-------------+--------+---------------+---------+-------------------+------------------+
|             83|        28.3|          10.8|          82|         25.8|     223|            3.6|   Klojen| -7.969421375342755| 112.6285308513895|
|             83|        28.3|          10.8|          82|         25.8|     223|            3.6| Blimbing| -7.945926995201971|112.64310740385885|
|             83|        28.3|          10.8|          82|         25.8|     223|            3.6|    Sukun| -7.987926536974757| 112.6100743302227|
|             83|        28.3|          10.8|          82|         25.8|     223|            3.6|Lowokwaru|-7.93484557

In [102]:
openweathermap_psdf.show()

+---------------+------------+------------+-------------+--------+--------------+---------------+---------+-------------------+------------------+
|cloud_total_pct|feels_like_c|humidity_pct|temperature_c|wind_deg|wind_gust_kmph|wind_speed_kmph| district|           latitude|         longitude|
+---------------+------------+------------+-------------+--------+--------------+---------------+---------+-------------------+------------------+
|             98|       37.23|          98|        30.23|     222|          4.43|           3.89|   Klojen| -7.969421375342755| 112.6285308513895|
|             99|       36.99|          99|        29.99|     240|          3.71|           3.06| Blimbing| -7.945926995201971|112.64310740385885|
|             98|       36.96|          98|        29.96|     210|          4.75|           4.14|    Sukun| -7.987926536974757| 112.6100743302227|
|             98|       36.78|          98|        29.78|     224|          3.96|            3.1|Lowokwaru|-7.93484557

In [104]:
def aggregate_common_columns(dfs: list[DataFrame], group_cols: list[str] = ["district", "latitude", "longitude"]) -> DataFrame:
    """
    Menggabungkan dan mengagregasi kolom-kolom yang sama dari beberapa DataFrame PySpark.
    
    Args:
        dfs (List[DataFrame]): Daftar DataFrame PySpark.
        group_cols (List[str]): Kolom yang dijadikan dasar pengelompokan/agregasi.
    
    Returns:
        DataFrame: DataFrame hasil agregasi rata-rata kolom umum.
    """
    if len(dfs) < 2:
        raise ValueError("Minimal dua DataFrame diperlukan.")

    # 1. Ambil kolom yang sama di semua DataFrame
    common_cols = set(dfs[0].columns)
    for df in dfs[1:]:
        common_cols &= set(df.columns)
    common_cols = list(common_cols)

    if not all(col in common_cols for col in group_cols):
        raise ValueError(f"Semua group_cols ({group_cols}) harus ada di kolom umum.")

    # 2. Select hanya kolom yang sama
    dfs_selected = [df.select(common_cols) for df in dfs]

    # 3. Union semua DataFrame
    union_df = dfs_selected[0]
    for df in dfs_selected[1:]:
        union_df = union_df.unionByName(df)

    # 4. Agregasi
    value_cols = [col for col in common_cols if col not in group_cols]
    agg_exprs = [avg(col).alias(f"avg_{col}") for col in value_cols]
    result_df = union_df.groupBy(*group_cols).agg(*agg_exprs)

    return result_df

In [105]:
final_psdf = aggregate_common_columns([openmeteo_psdf, openweathermap_psdf, weatherapi_psdf])
final_psdf.show()

+---------+-------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+
| district|           latitude|         longitude|      avg_wind_deg|avg_cloud_total_pct|avg_wind_speed_kmph| avg_temperature_c| avg_humidity_pct|avg_wind_gust_kmph|
+---------+-------------------+------------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+
|   Klojen| -7.969421375342755| 112.6285308513895| 226.8846153846154|  91.38461538461539|  3.791923076923077| 25.16653846153846|85.92307692307692|  8.96653846153846|
| Blimbing| -7.945926995201971|112.64310740385885|227.57692307692307|  91.42307692307692| 3.7600000000000002|25.091923076923077|85.96153846153847| 8.938846153846153|
|    Sukun| -7.987926536974757| 112.6100743302227|226.42307692307693|  91.38461538461539| 3.8015384615384615|25.275384615384617|85.92307692307692| 8.978846153846153|
|Low