# --- HÜCRE 1: Konfigürasyon Yükleme ---

In [3]:
%run nb_functions

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 23, Finished, Available, Finished)

nb_config Loaded
nb_config Loaded
nb_logging Loaded


# Logger başlat

In [4]:
# Create the notebook-level logger.
# NOTE(review): get_logger is presumably defined by the `%run nb_functions` cell — confirm.
logger = get_logger(__name__)
# Record that the setup cells finished.
logger.info("Blok 1: Parameters ve Functions yüklendi.")

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 24, Finished, Available, Finished)

[INFO] __main__ - Blok 1: Parameters ve Functions yüklendi.


# --- HÜCRE 2: Parametreler ---

In [5]:
# Load-mode switch; passed as `full_load` to load_data_into_delta_table in the load cell.
FULL_LOAD = False  # True: delete everything and rewrite from scratch; False: write over existing data (merge)
# Source label: hashed into the generated primary key and written to the SOURCE_COL_NAME column.
_SOURCE = "smart_city"

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 25, Finished, Available, Finished)

# Config - Kaynak ve Hedef

In [6]:
# Per-dataset ETL configuration, keyed by config name. Keys as used by the cells below:
#   source.src_directory  - sub-directory appended to PATH_BRONZE when reading.
#   simplify_structure    - ordered steps; "method" is one of "explode", "flatten"
#                           (both take "col") or "zip_explode" (takes "parent" and "children").
#   datatype_mapping      - column -> type name, handed to set_datatype.
#   columns_to_select     - columns handed to select_columns_safe as the final projection.
#   primary_col           - "lokaal_id_cols" are joined with "_" and hashed together with
#                           "objecttype" and _SOURCE into the column "primary_col_name".
#   sink.sink_directory   - sub-directory appended to PATH_SILVER; also used to build the
#                           registered table name "silver_<sink_directory>".
source_configs = {
    "energy_config": {
        "source": {"src_directory": "energy"},
        "simplify_structure": [
            {"method": "explode", "col": "Prices"},
            {"method": "flatten", "col": "Prices"}
        ],
        "datatype_mapping": {"price": "double", "readingdate": "timestamp"},
        "columns_to_select": ["_id", "_source", "_datetime_import", "readingdate", "price"],
        "primary_col": {
            "primary_col_name": "_id",
            "lokaal_id_cols": ["readingdate"],
            "objecttype": "energy_price"
        },
        "sink": {"sink_directory": "energy"}
    },
    "weather_config": {
        "source": {"src_directory": "weather"},
        "simplify_structure": [
            {"method": "zip_explode", "parent": "hourly", "children": ["time", "temperature_2m", "wind_speed_10m", "direct_radiation"]}
        ],
        "datatype_mapping": {"temperature_2m": "double", "wind_speed_10m": "double", "direct_radiation": "double", "time": "timestamp"},
        "columns_to_select": ["_id", "_source", "_datetime_import", "time", "temperature_2m", "wind_speed_10m", "direct_radiation"],
        "primary_col": {
            "primary_col_name": "_id",
            "lokaal_id_cols": ["time"],
            "objecttype": "weather_forecast"
        },
        "sink": {"sink_directory": "weather"}
    },
    "air_quality_config": {
        "source": {"src_directory": "quality"},
        "simplify_structure": [
            {"method": "zip_explode", "parent": "hourly", "children": ["time", "carbon_monoxide", "nitrogen_dioxide", "pm10"]}
        ],
        # nitrogen_dioxide is cast explicitly like the other measurements so its type does not
        # depend on what JSON schema inference happens to pick for a given file.
        "datatype_mapping": {"carbon_monoxide": "double", "nitrogen_dioxide": "double", "pm10": "double", "time": "timestamp"},
        "columns_to_select": ["_id", "_source", "_datetime_import", "time", "carbon_monoxide", "nitrogen_dioxide", "pm10"],
        "primary_col": {
            "primary_col_name": "_id",
            "lokaal_id_cols": ["time"],
            "objecttype": "air_quality"
        },
        "sink": {"sink_directory": "quality"}
    }
}

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 26, Finished, Available, Finished)

# --- HÜCRE 3: Extract (Okuma) ---

In [7]:
logger.info("Blok 2: Extract Başlıyor")
# Holds the DataFrames read from bronze, keyed by config name; filled by the extract loop.
dataframe_collection = {}

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 27, Finished, Available, Finished)

[INFO] __main__ - Blok 2: Extract Başlıyor


In [8]:
# Extract: read each configured bronze directory as JSON and keep the non-empty results
# in dataframe_collection (keyed by config name).
for name, config in source_configs.items():
    # PATH_BRONZE comes from nb_config
    src_path = f"{PATH_BRONZE}/{config['source']['src_directory']}"

    try:
        # Attempt the read without checking the folder first; a failure is caught below.
        # multiline=true matters for JSON arrays (documents spanning several lines).
        df = spark.read.format("json").option("multiline", "true").load(src_path)

        # Count once and reuse the value: each count() is a separate Spark action.
        record_count = df.count()
        if record_count > 0:
            dataframe_collection[name] = df
            logger.info(f"{name}: {record_count} kayıt okundu. Yol: {src_path}")
        else:
            logger.warning(f"{name}: Dosya okundu ama boş.")

    except Exception as e:
        # The exception text must be part of the message itself: passing it as an extra
        # positional argument makes the logging module try to %-format a message that has
        # no placeholder, so the error detail never reaches the log.
        logger.error(f"{name}: Okuma hatası. Yol: {src_path} - {e}")

# Stop the notebook early when no source produced any data.
if not dataframe_collection:
    mssparkutils.notebook.exit("Hiçbir veri okunamadı.")

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 28, Finished, Available, Finished)

[INFO] __main__ - energy_config: 1 kayıt okundu. Yol: abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/bronze/smart_city/energy
[INFO] __main__ - weather_config: 1 kayıt okundu. Yol: abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/bronze/smart_city/weather
[INFO] __main__ - air_quality_config: 1 kayıt okundu. Yol: abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/bronze/smart_city/quality


# --- HÜCRE 4: Transform ---

In [9]:
logger.info("Blok 3: Transform Başlıyor")
# Holds the transformed DataFrames, keyed by config name; filled by the transform loop.
transformed_collection = {}

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 29, Finished, Available, Finished)

[INFO] __main__ - Blok 3: Transform Başlıyor


In [10]:
# Transform: apply the configured steps to every extracted DataFrame and store the result
# in transformed_collection under the same config name.
for name, df in dataframe_collection.items():
    config = source_configs[name]
    logger.info(f"Transforming: {name}")

    # 1. Structure simplification (explode / flatten), applied in the configured order.
    if 'simplify_structure' in config:
        for step in config['simplify_structure']:
            method = step["method"]
            if method == "explode":
                df = explode_col(df, step["col"])
            elif method == "flatten":
                df = flatten_struct_col(df, step["col"])
            elif method == "zip_explode":
                df = zip_explode_and_flatten(df, step["parent"], step["children"])
            else:
                # A misspelled method used to be skipped silently, letting un-simplified
                # data flow on; make the skip visible in the log.
                logger.warning(f"{name}: unknown simplify_structure method '{method}' skipped")

    # 2. Datatype casting
    if 'datatype_mapping' in config:
        df = set_datatype(df, config['datatype_mapping'])

    # 3. Clean column names
    df = clean_column_names(df)

    # 4. Meta columns & primary key
    if 'primary_col' in config:
        pk_config = config['primary_col']
        # Build the local ID: the configured columns, cast to string and joined with "_".
        cols_to_concat = [col(c).cast("string") for c in pk_config["lokaal_id_cols"]]
        df = df.withColumn("lokaal_id_temp", concat_ws("_", *cols_to_concat))

        # Build the hashed ID from object type, source and local ID, then drop the temp column.
        df = df.withColumn(
            pk_config["primary_col_name"],
            hash_column(lit(pk_config["objecttype"]), lit(_SOURCE), col("lokaal_id_temp"))
        ).drop("lokaal_id_temp")

    # Metadata: tag every row with the source label.
    df = df.withColumn(SOURCE_COL_NAME, lit(_SOURCE))
    # If there is no import-date column yet, add the current timestamp.
    if IMPORTDATE_COL_NAME not in df.columns:
        df = df.withColumn(IMPORTDATE_COL_NAME, current_timestamp())

    # 5. Column selection
    if 'columns_to_select' in config:
        df = select_columns_safe(df, config['columns_to_select'])

    transformed_collection[name] = df

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 30, Finished, Available, Finished)

[INFO] __main__ - Transforming: energy_config
[INFO] __main__ - Transforming: weather_config
[INFO] __main__ - Transforming: air_quality_config


# --- HÜCRE 5: Load (Yazma) ---

In [11]:
# Mark the start of the load stage in the log.
logger.info("Blok 4: Load Başlıyor (Silver Layer)")

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 31, Finished, Available, Finished)

[INFO] __main__ - Blok 4: Load Başlıyor (Silver Layer)


In [12]:
# Enable Delta schema auto-merge (the table is updated when a new column arrives).
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 32, Finished, Available, Finished)

In [13]:
# Load: write every transformed DataFrame to its silver location and register it as a table.
for name, df in transformed_collection.items():
    config = source_configs[name]
    
    # Target path: Files/Silver/smart_city/Energy etc.
    sink_dir = config['sink']['sink_directory']
    sink_path = f"{PATH_SILVER}/{sink_dir}"
    
    # NOTE(review): 'primary_col' is mandatory here (KeyError if absent) although the
    # transform cell treats it as optional — confirm every config defines it.
    pk_name = config['primary_col']['primary_col_name']
    
    logger.info(f"Yazılıyor: {name} -> {sink_path}")
    
    # Use the load helper from nb_functions; FULL_LOAD selects the load mode.
    load_data_into_delta_table(
        data=df,
        sink_path=sink_path,
        full_load=FULL_LOAD,
        primary_col_name=pk_name
    )
    
    # Optional: register the table in the Lakehouse (external table over Files).
    table_name = f"silver_{sink_dir.lower()}"
    create_lakehouse_table(table_name, sink_path)

# Runs once, after all datasets have been written.
logger.info("ETL Süreci Tamamlandı: Bronze -> Silver")

StatementMeta(, d25e2afa-55d9-42d0-b99f-31b6dd14be71, 33, Finished, Available, Finished)

[INFO] __main__ - Yazılıyor: energy_config -> abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/silver/smart_city/energy
SUCCESS: Merged into abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/silver/smart_city/energy
Table 'silver_energy' registered from location 'abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/silver/smart_city/energy'
[INFO] __main__ - Yazılıyor: weather_config -> abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/silver/smart_city/weather
SUCCESS: Merged into abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/silver/smart_city/weather
Table 'silver_weather' registered from location 'abfss://smart_city@onelake.dfs.fabric.microsoft.com/smart_city_lakehouse.Lakehouse/Files/silver/smart_city/weather'
[INFO] __main__ - Yazılıyor: air_quality_config -> abfss://smart_city@onelake.d