## Librerías

In [None]:
!pip install kubernetes
!pip install pandas
!pip install python-dotenv


In [1]:
from kubernetes import client, config
from kubernetes.stream import stream
import pandas as pd
import re
import subprocess
import requests
import urllib3
import os
import json
from dotenv import load_dotenv
#deshabilita alertas
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
pd.set_option('display.max_colwidth', False)

## Funciones Helpers

In [14]:
def get_bearer_token():
    """Return the access token stored in the ``USER_ACCESS_TOKEN`` environment variable.

    Raises:
        KeyError: If ``USER_ACCESS_TOKEN`` is not set in the environment.
    """
    token = os.environ["USER_ACCESS_TOKEN"]
    return token

In [9]:
def is_valid_numeric_id(value):
    """Tell whether ``value`` is a non-negative numeric identifier.

    Args:
        value: Candidate identifier; any type is accepted.

    Returns:
        bool: ``True`` for an ``int`` that is >= 0 or for a non-empty ``str``
        made up only of digit characters; ``False`` for anything else.
    """
    if isinstance(value, int):
        return value >= 0
    if isinstance(value, str):
        # str.isdigit() is False for "", signed or decimal text, and names
        # such as "jupyterlab".
        return value.isdigit()
    return False

In [12]:
# Project names already resolved, keyed by project id, so that each id costs
# at most one cpdctl invocation (same approach as the space-name lookup).
_project_name_cache = {}


def get_project_name(project_id):
    """Return the name of a project as reported by ``cpdctl project get``.

    The result is memoised per ``project_id`` because callers look up the same
    project for many rows and every miss spawns a ``./cpdctl`` subprocess.

    Args:
        project_id: Project identifier passed to ``--project-id``.

    Returns:
        str: Text following ``Name:`` on the first matching output line,
        stripped of surrounding whitespace.

    Raises:
        RuntimeError: If cpdctl exits with a non-zero status, or if its output
            contains no line starting with ``Name:``.
    """
    if project_id in _project_name_cache:
        return _project_name_cache[project_id]

    command = ["./cpdctl", "project", "get", "--project-id", project_id]

    try:
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Error ejecutando cpdctl: {e.stderr.strip()}") from e

    for line in result.stdout.splitlines():
        if line.startswith("Name:"):
            project_name = line.split(":", 1)[1].strip()
            _project_name_cache[project_id] = project_name
            return project_name
    raise RuntimeError("Project name not found in cpdctl output.")

In [36]:
# Space names already resolved, keyed by space id.
_space_name_cache = {}


def get_space_name(space_id):
    """Return the name of a space as reported by ``cpdctl space get``.

    Results are kept in ``_space_name_cache`` so each space id triggers at most
    one ``./cpdctl`` invocation.

    Args:
        space_id: Space identifier passed to ``--space-id``.

    Returns:
        str: Text following ``Name:`` on the first matching output line,
        stripped of surrounding whitespace.

    Raises:
        RuntimeError: If cpdctl exits with a non-zero status, or if no
            non-empty name can be read from its output.
    """
    try:
        return _space_name_cache[space_id]
    except KeyError:
        pass  # not resolved yet; ask cpdctl below

    try:
        completed = subprocess.run(
            ["./cpdctl", "space", "get", "--space-id", space_id],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Error executing cpdctl for space id {space_id}: {e.stderr.strip()}") from e

    # Only the first "Name:" line counts; later ones are ignored.
    resolved = next(
        (
            row.split(":", 1)[1].strip()
            for row in completed.stdout.splitlines()
            if row.startswith("Name:")
        ),
        None,
    )
    if not resolved:
        raise RuntimeError(f"Space name not found in cpdctl output for space id {space_id}")

    _space_name_cache[space_id] = resolved
    return resolved

def add_space_name_column(df, space_id_column="Espacio"):
    """Attach a "Nombre de Espacio" column holding the name of each row's space.

    Args:
        df: pandas DataFrame containing the space ids; it is modified in place.
        space_id_column: Name of the column that holds the space ids.

    Returns:
        The same DataFrame object, now carrying the extra column.
    """
    space_names = df[space_id_column].apply(get_space_name)
    df["Nombre de Espacio"] = space_names
    return df

In [6]:
def get_projects_data(bearer_token, timeout=30):
    """Fetch the projects workload listing from the zen-watchdog API.

    The base address is read from the notebook-level ``cpd_url`` variable.

    Args:
        bearer_token: Access token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # cpd_url may end with "/"; strip it so the path does not start with "//".
    base_url = cpd_url.rstrip("/")
    zen_watchdog_url = f"{base_url}/zen-watchdog/v3/resources/workloads/policies/pretty/projects"
    headers = {
        'Authorization': f'Bearer {bearer_token}',
        'Accept': 'application/json'
    }
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is acceptable for the target cluster.
    response = requests.get(zen_watchdog_url, headers=headers, verify=False, timeout=timeout)
    response.raise_for_status()
    return response.json()

# Fetch the project listing and keep only the id and name of each project.
# NOTE(review): bearer_token and cpd_url are assigned in the "Configuración cpdctl"
# cell, which appears later in the notebook; on a fresh top-to-bottom run this
# cell would fail with NameError.
project_data = get_projects_data(bearer_token)
projects = [
    {field: entry[field] for field in ("ID", "Name")}
    for entry in project_data["requestObj"]["Items"]
]
projects

[{'ID': '35aafd3b-069f-4f30-b5ce-08a1f092eb52', 'Name': 'project_54'},
 {'ID': '8710e5ad-a59e-48d3-a546-0ac9ebfbcc69', 'Name': 'project_50'},
 {'ID': '9e4de2c0-0377-4082-85e4-08c7162c6ad9', 'Name': 'project_58'},
 {'ID': 'a629fb97-8860-4d46-a6ae-ae46bedafbbb', 'Name': 'project_88'},
 {'ID': 'a81e36d6-63e9-4e88-8f97-e87db245107d', 'Name': 'project_32'},
 {'ID': 'ddcb9037-9f3b-4b2c-bd09-42d6b1fe03a3', 'Name': 'project_95'},
 {'ID': '3c294018-bc65-4b0e-a1ce-516519d77eed', 'Name': 'project_30'},
 {'ID': '5f0228ef-d11a-43c9-a9ce-c8eda0b5f803', 'Name': 'project_39'},
 {'ID': '5fc5af1b-c1bd-4d0c-b3da-5ef301541cbe', 'Name': 'project_17'},
 {'ID': '6e1d142b-f6cc-4900-b187-67f6edcddc2d', 'Name': 'project_19'},
 {'ID': 'c2e7c933-f713-4572-93f3-58620e867e22', 'Name': 'project_96'},
 {'ID': 'f2264801-bfc2-4bf7-90ba-66320b9d0f66', 'Name': 'project_4'},
 {'ID': '6ef508cc-526e-40d6-8b1e-6ef3c190f4e5', 'Name': 'project_26'},
 {'ID': '3ebe4eeb-a5ed-4f82-a170-21cece29a3ce', 'Name': 'project_38'},
 {'ID':

In [7]:
# Resolve the username that belongs to a user_id
def get_username(user_id, bearer_token, timeout=30):
    """Look up a username through the user-management API.

    The base address is read from the notebook-level ``cpd_instance_url``
    variable.

    Args:
        user_id: Identifier placed in the request path.
        bearer_token: Access token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``username`` field of the JSON response, or ``'Unknown'`` when the
        response has no such field.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The base URL may end with "/"; strip it so the path does not start with "//".
    base_url = cpd_instance_url.rstrip("/")
    user_mgmt_url = f"{base_url}/usermgmt/v1/usermgmt/user/{user_id}?any_status=true&include_session_info=true"
    headers = {'Authorization': f'Bearer {bearer_token}', 'Accept': 'application/json'}
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is acceptable for the target cluster.
    response = requests.get(user_mgmt_url, headers=headers, verify=False, timeout=timeout)
    response.raise_for_status()
    user_data = response.json()
    return user_data.get('username', 'Unknown')

## Configuración cpdctl

In [None]:
# Download the latest linux/amd64 cpdctl release archive from GitHub, unpack it
# into the working directory, then delete the archive. The helper functions in
# this notebook invoke the tool as "./cpdctl".
# NOTE(review): "latest" is unpinned, so a rerun may fetch a different cpdctl version.
!curl -LOs "https://github.com/IBM/cpdctl/releases/latest/download/cpdctl_linux_amd64.tar.gz"
!tar -xzf cpdctl_linux_amd64.tar.gz
!rm -f cpdctl_linux_amd64.tar.gz

In [5]:
# Base URL used to build the zen-watchdog and usermgmt request URLs.
cpd_url="https://cpd-cpd.apps.cp4d-education-install.cp.fyre.ibm.com/"
# Load variables from a local .env file, if one exists, before reading the token.
# python-dotenv is installed and imported by this notebook but was never invoked;
# by default load_dotenv() does not override variables that are already set.
load_dotenv()
# None when USER_ACCESS_TOKEN is defined neither in the environment nor in .env.
bearer_token=os.getenv("USER_ACCESS_TOKEN")

## Configuración kubeconfig

In [2]:
# Load the kubeconfig file and create the CoreV1 API client used by later cells.
config.load_kube_config("mlops_kubeconfig.yaml")
core_v1_api = client.CoreV1Api()
namespace = "cpd"  # Replace as needed
label_selector = "app=asset-files-api"
# Directories inside the pod that the project scan iterates over.
base_paths = ["/mnt/asset_file_api/projects",]
# Intermediate and final output files
output_file_intermediate = "file_sizes.csv"
# NOTE(review): output_file_with_names is not referenced anywhere else in the visible notebook.
output_file_with_names = "file_sizes_with_names.csv"
output_file_final = "data_proyectos.csv"



# Find the pods matching the label selector; fail early if there are none.
pods = core_v1_api.list_namespaced_pod(namespace, label_selector=label_selector)
if not pods.items:
    raise RuntimeError("No se encontro el pod")

# Use the first matching pod; later cells exec their shell commands in it.
pod_name = pods.items[0].metadata.name
print(f"Usando pod: {pod_name}")

Usando pod: asset-files-api-5f9df8d656-tk7bd


## Data de Spaces

In [37]:
# Look the pod up again so this cell does not depend on pod_name from an earlier cell.
pods = core_v1_api.list_namespaced_pod(namespace, label_selector=label_selector)
if not pods.items:
    raise RuntimeError("No se encontró el pod")

pod_name = pods.items[0].metadata.name
print(f"Usando pod: {pod_name}")
spaces_base_dir = "/mnt/asset_file_api/spaces/"


# Shell function that walks a directory recursively and prints one line per
# regular file: "<path> <size in KB, rounded up>".
list_files_command = (
    "list_files() { "
    "  for entry in \"$1\"/*; do "
    "    if [ -d \"$entry\" ]; then "
    "      list_files \"$entry\"; "
    "    elif [ -f \"$entry\" ]; then "
    "      size=$(stat -c \"%s\" \"$entry\"); "
    "      size_kb=$(( (size + 1023) / 1024 )); "
    "      echo \"$entry $size_kb\"; "
    "    fi; "
    "  done; "
    "}; "
    "list_files " + spaces_base_dir
)

# Run the listing inside the pod; stdout and stderr both end up in resp.
resp = stream(
    core_v1_api.connect_get_namespaced_pod_exec,
    name=pod_name,
    namespace=namespace,
    command=["/bin/sh", "-c", list_files_command],
    stderr=True, stdin=False,
    stdout=True, tty=False
)


file_data = []
for line in resp.split("\n"):
    # Split on the last space only, so paths that contain spaces stay intact.
    parts = line.strip().rsplit(" ", 1)
    if len(parts) != 2:
        continue
    file_path, file_size = parts
    if not file_size.isdigit():
        # Shell diagnostics share the stream with the listing; without this
        # guard such a line would make int() raise ValueError and abort the cell.
        print(f"Linea no reconocida, omitiendo: {line.strip()}")
        continue
    # Empty components (leading "/" and the "//" after spaces_base_dir) are
    # dropped, so index 3 is the directory directly under .../spaces/.
    path_parts = [p for p in file_path.split("/") if p]
    space_id = path_parts[3] if len(path_parts) > 3 else "Desconocido"
    file_data.append([space_id, file_path, int(file_size)])

spaces = pd.DataFrame(file_data, columns=["Espacio", "Ruta", "Tamaño (KB)"])
spaces=add_space_name_column(spaces)


Usando pod: asset-files-api-5f9df8d656-tk7bd


In [38]:
# Display the spaces DataFrame built in the previous cell.
spaces

Unnamed: 0,Espacio,Ruta,Tamaño (KB),Nombre de Espacio
0,85b29280-73c5-4aad-881c-4387695eadba,/mnt/asset_file_api/spaces//85b29280-73c5-4aad-881c-4387695eadba/assets/09653c11-bfe7-4f57-b089-a5ea0a997fb4/4c573e5e-fa4c-4619-b154-2d97a0590c27/381e15c0-550d-4921-b398-d1cfcf58f4e2,105,Fyre_space
1,85b29280-73c5-4aad-881c-4387695eadba,/mnt/asset_file_api/spaces//85b29280-73c5-4aad-881c-4387695eadba/assets/data_asset/credentials.txt,1,Fyre_space


In [45]:
# Save the per-file space data. index=False matches the other CSV exports in this
# notebook and keeps the meaningless RangeIndex out of the file as an unnamed column.
spaces.to_csv("data_espacios.csv", index=False)

## Obtención de datos de proyectos

In [46]:
# Main logic: measure the entries under each project directory inside the pod,
# then enrich every row with the project name and the owner's username.
cpd_instance_url = cpd_url  # get_username() reads this notebook-level name
try:
    all_file_details = []
    for path in base_paths:
        print(f"Procesando ruta: {path}")

        # NOTE(review): `du -k` without -s also prints one line per nested
        # directory, with cumulative sizes, so summing file_size double-counts.
        # Confirm whether `du -sk` (one line per entry) was intended.
        command = [
            "bash", "-c",
            f"shopt -s nullglob; for d in {path}/*/*/* {path}/*/*/.[!.]*; do if [ -e \"$d\" ]; then du -k \"$d\"; fi; done"
        ]

        # Run inside the pod; stdout and stderr both end up in response.
        response = stream(
            core_v1_api.connect_get_namespaced_pod_exec,
            pod_name,
            namespace,
            command=command,
            stderr=True, stdin=False, stdout=True, tty=False,
        )

        for line in response.splitlines():
            # du prints "<size_kb> <path>". Requiring a numeric first field keeps
            # diagnostics such as "du: cannot access ..." from becoming rows.
            match = re.match(r"^(\d+)\s+(.+)$", line)
            if not match:
                continue
            file_size, file_path = match.groups()
            parts = file_path.split("/")
            # For <path>/<project_id>/<user_id>/..., with the base path used here,
            # parts[4] is the project id and parts[5] the user id.
            if "projects" in path and len(parts) >= 6:
                project_id = parts[4]
                user_id = parts[5]
                if is_valid_numeric_id(user_id):
                    file_name = parts[-1]
                    all_file_details.append({
                        "id": project_id,
                        "user_id": user_id,
                        "file_name": file_name,
                        "file_size": int(file_size),
                        "path": file_path  # Full path of the entry inside the pod
                    })
                else:
                    print(f"Invalido id de usuario: {user_id}, omitiendo.")

    # Explicit columns keep the frame usable, and the lookups below from raising
    # KeyError, when no entry was collected.
    columns = ["id", "user_id", "file_name", "file_size", "path"]
    df = pd.DataFrame(all_file_details, columns=columns)
    print(f"Se guardaron {len(df)} registros.")
    df.to_csv(output_file_intermediate, index=False)
    print(f"Data de paso guaradada en {output_file_intermediate}")

    # Resolve each distinct project once instead of once per row; every lookup
    # may spawn a cpdctl subprocess.
    project_names = {pid: get_project_name(pid) for pid in df["id"].unique()}
    df["proyecto"] = df["id"].map(project_names)

    # Same for usernames: one HTTP request per distinct user id.
    bearer_token = get_bearer_token()
    usernames = {uid: get_username(uid, bearer_token) for uid in df["user_id"].unique()}
    df["username"] = df["user_id"].map(usernames)

    df.to_csv(output_file_final, index=False)
    print(f"Data final guardada en {output_file_final}")

except Exception as e:
    # Errors are reported but not re-raised, so the cleanup below always runs.
    # NOTE(review): after a failure output_file_final may be missing or stale.
    print(f"Ocurrio un error: {e}")

# Remove the intermediate CSV, using the configured name rather than a literal
# so the cleanup stays in sync with the file written above.
if os.path.exists(output_file_intermediate):
    os.remove(output_file_intermediate)
    print(f"Se borro el archivo temporal {output_file_intermediate} ")
else:
    print(f"{output_file_intermediate} no existe.")


Procesando ruta: /mnt/asset_file_api/projects
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de usuario: jupyterlab, omitiendo.
Invalido id de

In [47]:
# Reload the final project data from the path configured in output_file_final,
# so this cell keeps reading the file the main cell writes if that path changes.
proyectos=pd.read_csv(output_file_final)

In [48]:
# Display the project data loaded from the final CSV.
# NOTE(review): the rendered output includes usernames; review before sharing.
proyectos

Unnamed: 0,id,user_id,file_name,file_size,path,proyecto,username
0,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,ask_pass.sh,1,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/ask_pass.sh,galicia-project,jlara
1,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,.METADATA,0,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/assets/.METADATA,galicia-project,jlara
2,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,assets,0,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/assets,galicia-project,jlara
3,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,assettypes,1,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/assettypes,galicia-project,jlara
4,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,info,0,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/.git/objects/info,galicia-project,jlara
5,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,pack,34045,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/.git/objects/pack,galicia-project,jlara
6,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,objects,34045,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/.git/objects,galicia-project,jlara
7,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,hooks,29,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/.git/hooks,galicia-project,jlara
8,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,heads,1,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/.git/logs/refs/heads,galicia-project,jlara
9,61795412-76fb-4a86-be1e-71fa41300fef,1000331005,origin,1,/mnt/asset_file_api/projects/61795412-76fb-4a86-be1e-71fa41300fef/1000331005/git/.git/logs/refs/remotes/origin,galicia-project,jlara
