In [0]:
# Automatically reloads the library (here, the .py files under includes) whenever it changes.
# For production, decide whether to remove this to avoid any problems and restart whenever the
# library changes, or to keep it so that changes are picked up automatically.
%load_ext autoreload
%autoreload 2

# To disable autoreload; run %autoreload 0

In [0]:
%pip install msal requests #PENDING this should be installed on the cluster
%pip install boto3 #PENDING this should be installed on the cluster

In [0]:
from msal import ConfidentialClientApplication
from datetime import datetime
from io import StringIO, BytesIO
from pyspark.sql import SparkSession

import requests
import os
import json
import boto3
import sys
import pandas as pd

sys.path.append("/Workspace/BI-OVC")

# This works because there is an __init__.py inside includes and inside config
from includes.file_functions import check_filename, imprimir 
from includes import control_functions


In [0]:
# Get (or create) the Spark session used by this notebook, under the app name "bi-ovc-test".
spark = (
    SparkSession.builder
    .appName("bi-ovc-test")
    .getOrCreate()
)

In [0]:
# --- Generic parameters ---

# Values passed to the control_functions logging and lookup calls further down.
process_source_name = "Load BI OVC"
process_step_name = "sharepoint to s3"
source_system_code = "SAPBR"
sys_modified_by_name = "NBK - Load Finance - SP to S3"

# Column updated in the run log, and the code written to it to flag an error.
sys_status_column_name = "sys_status_code"
error_status_code = "E"

# Status flag; replaced by the status returned from the log-start call.
fn_status = True


In [0]:
# LOG START
# Register the start of this run; the call returns the run id and a status flag.
process_run_id, fn_status = control_functions.log_process_run_start(
    process_source_name,
    process_step_name,
    source_system_code,
    sys_modified_by_name,
)

In [0]:
# Write the error status code to the run-log row as soon as a run id exists.
# NOTE(review): this update does not depend on fn_status, so the row appears to be kept
# in error status until the run is closed at the end of the notebook - confirm against
# control_functions before changing it.
# The None guard avoids a TypeError if the log-start call did not produce a run id.
if process_run_id is not None and process_run_id > 0:
    control_functions.log_process_run_update_value(
        process_run_id,
        sys_modified_by_name,
        sys_status_column_name,
        error_status_code,
    )

# Stop the notebook when the log-start call reported a failure.
# The explicit `== False` comparison is kept: the return contract of
# log_process_run_start is not visible here, so other falsy values are not
# treated as a failure.
if fn_status == False:
    print("❌ ERROR STARTING PROCESS")
    # There is no active exception at this point, so a bare `raise` would only surface as
    # "RuntimeError: No active exception to reraise". Raise an explicit, descriptive error.
    raise RuntimeError("ERROR STARTING PROCESS")

In [0]:
# File parameters defined in this notebook: the file name mask and the expected extension.
source_file_name_mask = "{fileName}_{yyyymmdd}_{hhmmss}.{fileExtension}"
source_file_extension = "txt"

In [0]:
# GET DATA FROM CONTROL TABLE
# Look up the setup parameters for this process source / step.
try:
    df = control_functions.get_process_source_parameters(
        process_source_name,
        process_step_name,
    )
except Exception as exc:
    # Write the error status code to the run log, report the problem, then re-raise
    # the original exception so the notebook stops.
    control_functions.log_process_run_update_value(
        process_run_id,
        sys_modified_by_name,
        sys_status_column_name,
        error_status_code,
    )
    print(f"❌ ERROR GETTING PROCESS SETUP PARAMETERS: {exc}")
    raise


In [0]:
# PARAMETERS FROM CONTROL TABLE
# Fetch the first setup row once instead of running one select/collect per column,
# and fail with a clear message when the control table returned no row
# (previously this surfaced as an IndexError on `collect()[0]`).
_parameter_columns = [
    'source_file_name_prefix',
    'source_sharepoint_host_name',
    'source_sharepoint_site_relative',
    'source_sharepoint_drive',
    'source_sharepoint_file_path',
    'target_bucket_name',
    'target_bucket_folder_key',
]

_setup_row = df.select(*_parameter_columns).first()
if _setup_row is None:
    raise ValueError(
        f"No setup parameters found for process '{process_source_name}' "
        f"and step '{process_step_name}'"
    )

# File name prefix used to pre-filter the SharePoint listing.
source_file_name_prefix = _setup_row['source_file_name_prefix']

# SharePoint location of the source files.
sharepoint_hostname = _setup_row['source_sharepoint_host_name']
source_sharepoint_site_relative = _setup_row['source_sharepoint_site_relative']
source_sharepoint_drive = _setup_row['source_sharepoint_drive']
source_sharepoint_file_path = _setup_row['source_sharepoint_file_path']

# S3 destination.
raw_bucket = _setup_row['target_bucket_name']
bucketFolderKey = _setup_row['target_bucket_folder_key']

In [0]:
# ======================
# 1. Configuration
# ======================

SECRET_SCOPE = "ibs-sharepoint-databricks-secret"
GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0"
# requests applies no timeout by default; without one a stalled connection hangs the notebook.
GRAPH_TIMEOUT_SECONDS = 60

tenant_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="tenant_id")
client_id = dbutils.secrets.get(scope=SECRET_SCOPE, key="client_id")
client_secret = dbutils.secrets.get(scope=SECRET_SCOPE, key="client_secret")


def _graph_get_json(url, request_headers):
    """GET a Microsoft Graph URL and return its decoded JSON body.

    Args:
        url: Full URL to request.
        request_headers: HTTP headers to send (carries the bearer token).

    Returns:
        The decoded JSON payload.

    Raises:
        RuntimeError: If the body is not JSON, the HTTP status is not successful,
            or the payload contains an "error" entry.
    """
    response = requests.get(url, headers=request_headers, timeout=GRAPH_TIMEOUT_SECONDS)
    try:
        payload = response.json()
    except ValueError as exc:
        raise RuntimeError(
            f"Non-JSON response from {url} (HTTP {response.status_code}): {response.text[:500]}"
        ) from exc
    if not response.ok or (isinstance(payload, dict) and "error" in payload):
        raise RuntimeError(f"Error calling {url} (HTTP {response.status_code}): {payload}")
    return payload


# ======================
# 2. Get token (Bearer)
# ======================
authority = f"https://login.microsoftonline.com/{tenant_id}"
scope = ["https://graph.microsoft.com/.default"]

app = ConfidentialClientApplication(client_id, authority=authority, client_credential=client_secret)
token = app.acquire_token_for_client(scopes=scope)

if "access_token" not in token:
    raise Exception(f"Error getting token: {token}")
access_token = token["access_token"]
headers = {"Authorization": f"Bearer {access_token}"}

# ======================
# 3. Get siteId
# ======================

url_site = f"{GRAPH_BASE_URL}/sites/{sharepoint_hostname}:/{source_sharepoint_site_relative}"
site = _graph_get_json(url_site, headers)
site_id = site.get("id")

# Without a site id every following URL would be built around "None" and fail obscurely.
if not site_id:
    raise RuntimeError(f"SharePoint site id not found in response: {site}")

# =========================================================================
# 4. Get drives and pick the requested one (source_sharepoint_drive)
# =========================================================================

url_drives = f"{GRAPH_BASE_URL}/sites/{site_id}/drives"
drives = _graph_get_json(url_drives, headers)

if "value" not in drives:
    raise Exception(f"Error fetching drives: {drives}")

# Use a default so a missing drive gives a descriptive error instead of a bare StopIteration.
drive_id = next(
    (d["id"] for d in drives["value"] if d.get("name") == source_sharepoint_drive),
    None,
)
if drive_id is None:
    available_drives = [d.get("name") for d in drives["value"]]
    raise RuntimeError(
        f"Drive '{source_sharepoint_drive}' not found in site. Available drives: {available_drives}"
    )

# =======================
# 5. Get files in Folder
# =======================

url_children = f"{GRAPH_BASE_URL}/sites/{site_id}/drives/{drive_id}/root:/{source_sharepoint_file_path}:/children"

# The listing is paged: keep following "@odata.nextLink" until no further page is returned,
# otherwise files beyond the first page are silently ignored.
folder_items = []
next_page_url = url_children
while next_page_url:
    page = _graph_get_json(next_page_url, headers)
    folder_items.extend(page.get("value", []))
    next_page_url = page.get("@odata.nextLink")

# Keep only the entries whose name starts with the configured prefix (case-insensitive).
file_prefix = source_file_name_prefix
files = [f for f in folder_items if f.get("name", "").lower().startswith(file_prefix.lower())]


In [0]:
# ===============================================
# 6. Get files matching the mask
# ===============================================

# Keep only the listed entries whose name passes check_filename for the configured mask.
# Prefix, extension and file name are all lower-cased before the check.
files_matched = [
    item
    for item in files
    if check_filename(
        source_file_name_mask,
        source_file_name_prefix.lower(),
        source_file_extension.lower(),
        item.get("name", "").lower(),
    )
]

In [0]:
# ================================
# 7. Move from Sharepoint to S3
# ================================

s3_bucket = raw_bucket
s3_base_path = bucketFolderKey

# requests applies no timeout by default; without one a stalled download hangs the notebook.
DOWNLOAD_TIMEOUT_SECONDS = 300

# One record per file that failed to transfer (file name and error text).
logs = []

# Initialize S3 client

# Get environment region
session = boto3.session.Session()

s3_client = boto3.client(
    "s3",
    aws_access_key_id=dbutils.secrets.get(scope="ibs-sharepoint-databricks-secret", key="aws_access_key_id"),
    aws_secret_access_key=dbutils.secrets.get(scope="ibs-sharepoint-databricks-secret", key="aws_secret_access_key"),
    region_name=session.region_name,
)

for f in files_matched:
    file_name = f.get("name")
    download_url = f.get("@microsoft.graph.downloadUrl")

    # Entries without a name or a download URL cannot be transferred; warn and move on.
    if not file_name or not download_url:
        print(f"⚠️ FILE NOT LOADED - PLEASE CHECK: {file_name}")
        continue

    # The file keeps its original name in S3.
    target_file_name = file_name
    target_s3_key = f"{s3_base_path}/{target_file_name}"

    try:
        # Download file in memory
        response = requests.get(download_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        response.raise_for_status()
        file_bytes = response.content

        # Upload to S3 as binary
        s3_client.put_object(Bucket=s3_bucket, Key=target_s3_key, Body=file_bytes)
        print(f"✅ File Loaded: s3://{s3_bucket}/{target_s3_key}")

    except Exception as e:
        # Best effort per file: record the failure and continue with the remaining files.
        print(f"❌ ERROR LOADING FILE {file_name}: {e}")
        logs.append({"file_name": file_name, "error": str(e)})

# Every file has been attempted. If any failed, flag the run as errored and stop,
# so the run is not closed as successful with files missing from S3.
if logs:
    failed_file_names = ", ".join(entry["file_name"] for entry in logs)
    control_functions.log_process_run_update_value(
        process_run_id,
        sys_modified_by_name,
        sys_status_column_name,
        error_status_code,
    )
    print(f"❌ {len(logs)} FILE(S) NOT LOADED: {failed_file_names}")
    raise RuntimeError(f"{len(logs)} file(s) could not be loaded to S3: {failed_file_names}")


In [0]:
# LOG END
# Close the run in the process log; the returned value is checked in the next cell.
x = control_functions.log_process_run_end(
    process_run_id,
    sys_modified_by_name,
)

In [0]:
# When closing the run returned False, write the error status code to the run-log row.
log_end_failed = x == False
if log_end_failed:
    control_functions.log_process_run_update_value(
        process_run_id,
        sys_modified_by_name,
        sys_status_column_name,
        error_status_code,
    )
