In [0]:
# Auto-reload imported libraries (the .py files under includes, in this case) so that edits are picked up automatically.
# For production, decide whether to remove this to avoid any problems (restarting whenever the library changes)
# or to keep it so that library changes are picked up automatically.
%load_ext autoreload
%autoreload 2

# To disable autoreload; run %autoreload 0

In [0]:
%pip install msal requests #PENDING this should be installed on the cluster
%pip install boto3 #PENDING this should be installed on the cluster

In [0]:
from msal import ConfidentialClientApplication
from datetime import datetime
from io import StringIO, BytesIO
from pyspark.sql import SparkSession

import requests
import os
import json
import boto3
import sys
import pandas as pd

sys.path.append("/Workspace/BI-OVC")

# This works because there is an __init__.py inside both includes and config
from includes.file_functions import check_filename, imprimir 
from includes import control_functions
import config.config as cfg


In [0]:
# Reuse the active Spark session if there is one, otherwise create it under the "bi-ovc-test" app name.
spark = (
    SparkSession.builder
    .appName("bi-ovc-test")
    .getOrCreate()
)

In [0]:
# LOG START

# Identifiers for this run; they are reused by later cells for the control-table lookup and the end-of-run log call.
process_setup_name = 'Load BI OVC'
process_setup_step_name = 'sharepoint to s3'
source_system_code = 'SAP1C'
sys_modified_by_name = 'NBK - Load Finance - SP to S3'

# Register the start of the run and keep the returned id so the run can be closed at the end.
process_run_id = control_functions.log_process_run_start(
    process_setup_name,
    process_setup_step_name,
    source_system_code,
    sys_modified_by_name,
)

In [0]:
# GET DATA FROM CONTROL TABLE
df = control_functions.get_process_setup_parameters(process_setup_name, process_setup_step_name)

# Collect the needed columns once instead of running a separate collect() per column.
setup_rows = df.select(
    'process_setup_source_file_name',
    'process_setup_source_file_extension',
    'process_setup_source_file_name_mask',
    'process_setup_target_bucket_name',
    'process_setup_target_bucket_folder_key',
).collect()

# Fail with an explicit message rather than a bare IndexError when no setup row is returned.
if not setup_rows:
    raise ValueError(
        f"No process setup parameters found for '{process_setup_name}' / '{process_setup_step_name}'"
    )

# As before, only the first returned row is used.
setup_row = setup_rows[0]

sourceFileNamePrefix = setup_row['process_setup_source_file_name']
sourceFileExtension = setup_row['process_setup_source_file_extension']
sourceFileNameMask = setup_row['process_setup_source_file_name_mask']
raw_bucket = setup_row['process_setup_target_bucket_name']
bucketFolderKey = setup_row['process_setup_target_bucket_folder_key']


In [0]:
# ======================
# 1. Configuration
# ======================

# Read the three Azure AD application values from the Databricks secret scope, in the same order as before.
tenant_id, client_id, client_secret = (
    dbutils.secrets.get(scope="ibs-sharepoint-databricks-secret", key=secret_key)
    for secret_key in ("tenant_id", "client_id", "client_secret")
)

# SharePoint host and site path come from the project config module.
hostname, site_relative = cfg.hostname, cfg.site_relative

# ======================
# 2. Get token (Bearer)
# ======================
authority = f"https://login.microsoftonline.com/{tenant_id}"
scope = ["https://graph.microsoft.com/.default"]

# Client-credentials flow: the application authenticates with its own id and secret.
app = ConfidentialClientApplication(
    client_id,
    authority=authority,
    client_credential=client_secret,
)
token = app.acquire_token_for_client(scopes=scope)

# Without an access_token entry the result cannot be used; surface the whole result in the error.
if "access_token" not in token:
    raise Exception(f"Error getting token: {token}")

access_token = token["access_token"]

# Authorization header reused by every Graph request below.
headers = dict(Authorization=f"Bearer {access_token}")

# ======================
# 3. Get siteId
# ======================

url_site = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_relative}"

# A timeout keeps a stalled connection from hanging the run indefinitely.
r = requests.get(url_site, headers=headers, timeout=60)

# Stop on an HTTP error instead of continuing with an unusable response.
if not r.ok:
    raise Exception(f"Error fetching site (HTTP {r.status_code}): {r.text}")

site = r.json()
site_id = site.get("id")

# Without an id the following requests would be sent to ".../sites/None/...".
if not site_id:
    raise Exception(f"Error fetching site: {site}")

# ======================
# 4. Get drives and pick the one named cfg.sp_file_folder
# ======================
url_drives = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"

# A timeout keeps a stalled connection from hanging the run indefinitely.
drives_response = requests.get(url_drives, headers=headers, timeout=60).json()

if 'value' not in drives_response:
    raise Exception(f"Error fetching drives: {drives_response}")
drives = drives_response

# Use a default so a missing drive produces a readable error instead of a bare StopIteration.
drive_id = next(
    (d["id"] for d in drives["value"] if d.get("name") == cfg.sp_file_folder),
    None,
)
if drive_id is None:
    available_drives = [d.get("name") for d in drives["value"]]
    raise Exception(
        f"Drive '{cfg.sp_file_folder}' not found. Available drives: {available_drives}"
    )

# ======================
# 5. Get files in folder
# ======================

folder = "BI_OVC_TST"  # PENDING: discuss whether to keep this here or move it higher up, since it is only used here
url_children = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/root:/{folder}:/children"

# The children listing is paged: keep following "@odata.nextLink" until no further page is returned,
# otherwise files beyond the first page would be silently skipped.
folder_items = []
next_url = url_children
while next_url:
    # A timeout keeps a stalled connection from hanging the run indefinitely.
    resp = requests.get(next_url, headers=headers, timeout=60).json()

    # A response without "value" is an error payload; fail instead of treating it as an empty folder.
    if "value" not in resp:
        raise Exception(f"Error fetching files in folder '{folder}': {resp}")

    folder_items.extend(resp["value"])
    next_url = resp.get("@odata.nextLink")

# Keep only the items whose name starts with the configured prefix (case-insensitive).
file_prefix = sourceFileNamePrefix
file_prefix_lower = file_prefix.lower()
files = [f for f in folder_items if f.get("name", "").lower().startswith(file_prefix_lower)]


In [0]:
# ===============================================
# 6. Get files
# ===============================================

# Only keep files matching the mask: check_filename receives the mask plus the
# lower-cased prefix, extension and candidate file name.
files_matched = [
    item
    for item in files
    if check_filename(
        sourceFileNameMask,
        sourceFileNamePrefix.lower(),
        sourceFileExtension.lower(),
        item.get("name", "").lower(),
    )
]

In [0]:
# ======================
# 7. Move from Sharepoint to S3
# ======================

s3_bucket = raw_bucket
s3_base_path = bucketFolderKey

logs = []

# Initialise the S3 client

# A default session is created only to read the environment's default region
session = boto3.session.Session()

s3_client = boto3.client(
    "s3",
    aws_access_key_id=dbutils.secrets.get(scope = "ibs-sharepoint-databricks-secret", key = "aws_access_key_id"),
    aws_secret_access_key=dbutils.secrets.get(scope = "ibs-sharepoint-databricks-secret", key = "aws_secret_access_key"),
    region_name=session.region_name
)

for f in files_matched:
    file_name = f.get("name")
    download_url = f.get("@microsoft.graph.downloadUrl")

    # Skip (and report) entries that cannot be downloaded or named.
    if not file_name or not download_url:
        print(f"⚠️ FILE NOT LOADED - PLEASE CHECK: {file_name}")
        continue

    # The object keeps the original file name. Using it directly also handles names
    # without a "." which previously raised IndexError and aborted the whole loop.
    target_file_name = file_name
    target_s3_key = f"{s3_base_path}/{target_file_name}"

    try:
        # Download the file into memory; (connect, read) timeouts keep a stalled
        # transfer from hanging the run indefinitely.
        response = requests.get(download_url, timeout=(10, 300))
        response.raise_for_status()
        file_bytes = response.content

        # Upload to S3 directly as binary
        s3_client.put_object(Bucket=s3_bucket, Key=target_s3_key, Body=file_bytes)
        print(f"✅ File Loaded: s3://{s3_bucket}/{target_s3_key}")

    except Exception as e:
        # Best effort: report the failure and continue with the remaining files.
        print(f"❌ ERROR LOADING FILE {file_name}: {e}")


In [0]:
# LOG END
# Close the run using the id returned by log_process_run_start.
control_functions.log_process_run_end(
    process_run_id,
    sys_modified_by_name,
)