In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit
import joblib

def agrupar_frames(df, window_size=50, step=25):
    """
    Agrupa los frames en ventanas de tamaño window_size, avanzando en pasos de 'step',
    calculando para cada ventana y para cada columna numérica (excepto 'video', 'frame' y 'organ'):
        - Media (mean)
        - Máximo (max)
        - Mínimo (min)
        - Rango (range = max - min)
        - Asimetría (skewness)
        - Desviación estándar (std)
    
    Parámetros:
        df (pd.DataFrame): DataFrame con las columnas, incluyendo 'video', 'frame' y 'organ'.
        window_size (int): Tamaño de la ventana (por defecto 50).
        step (int): Tamaño del salto entre ventanas (por defecto 25).
    
    Retorna:
        pd.DataFrame: Nuevo DataFrame con columnas 'video', 'organ', 'frame_start', 'frame_end'
                      y las estadísticas calculadas para cada ventana.
    """
    filas_resultado = []
    columnas_calcular = [col for col in df.columns if col not in ["video", "frame", "organ"]]
    
    for (video, organ), grupo in df.groupby(["video", "organ"]):
        grupo = grupo.sort_values("frame")
        n = len(grupo)
        
        for start in range(0, n, step):
            ventana = grupo.iloc[start:start + window_size]
            # Si la ventana tiene menos de 5 frames, se salta
            if len(ventana) < 5:
                continue
        
            frame_start = ventana["frame"].iloc[0]
            frame_end = ventana["frame"].iloc[-1]
            resultado = {"video": video, "organ": organ,
                         "frame_start": frame_start, "frame_end": frame_end}
        
            for col in columnas_calcular:
                serie = ventana[col]
                resultado[f"mean_{col}"] = serie.mean()
                resultado[f"max_{col}"] = serie.max()
                resultado[f"min_{col}"] = serie.min()
                resultado[f"range_{col}"] = serie.max() - serie.min()
                resultado[f"skew_{col}"] = serie.skew()
                resultado[f"std_{col}"] = serie.std()
        
            filas_resultado.append(resultado)

    
    return pd.DataFrame(filas_resultado)


def guardar_csv(datos, ruta, index=False):
    """
    Guarda un DataFrame como archivo CSV en la ruta indicada.

    Parámetros:
    ----------
    datos : cualquier cosa (se espera un DataFrame)
        Objeto a guardar como CSV.
    ruta : str
        Ruta completa del archivo CSV (incluyendo el nombre del archivo).
    index : bool, opcional
        Si se desea guardar el índice del DataFrame (por defecto False).

    Retorna:
    -------
    None
    """
    ruta_csv = Path(ruta)
    ruta_csv.parent.mkdir(parents=True, exist_ok=True)
    datos.to_csv(ruta_csv, index=index)
    print(f"Archivo guardado en: {ruta_csv.resolve()}")


In [None]:
BASE_DIR = Path.cwd().parent.parent

df_frame_path_50 = BASE_DIR / 'data_storage' / 'csv_by_frame' / 'Frame-data-50.csv'
df_frame_path_60 = BASE_DIR / 'data_storage' / 'csv_by_frame' / 'Frame-data-60.csv'

df_frame_50 = pd.read_csv(df_frame_path_50)
df_frame_60 = pd.read_csv(df_frame_path_60)

target_path = BASE_DIR / 'data_storage' / 'target_csv' / 'target_labels_clean.csv'
df_target = pd.read_csv(target_path)

In [5]:
df_50 = agrupar_frames(df_frame_50)
df_60 = agrupar_frames(df_frame_60)

In [9]:
guardar_csv(df_50, BASE_DIR / 'data_storage' / 'csv_by_window' / 'Data-50.csv')
guardar_csv(df_60, BASE_DIR / 'data_storage' / 'csv_by_window' / 'Data-60.csv')

Archivo guardado en: C:\Users\javie\Universidad\tfg\pop-detection-ml\data_storage\csv_by_window\Data-50.csv
Archivo guardado en: C:\Users\javie\Universidad\tfg\pop-detection-ml\data_storage\csv_by_window\Data-60.csv


In [None]:
#[cystocele, cystourethrocele, uterine_prolapse, cervical_elongation, rectocele, any_prolapse]

def agregar_columna(df, df_prolapse, columna):
    """
    Agrega a df la columna 'columna' proveniente de df_prolapse, 
    realizando el merge entre df['video'] y df_prolapse['case'].
    Convierte los valores booleanos a enteros (False=0, True=1).

    Parámetros:
      df: DataFrame principal (tiene la columna 'video').
      df_prolapse: DataFrame con la columna 'case' y las columnas de interés.
      columna: Nombre de la columna a agregar (ej. 'any_prolapse').

    Retorna:
      df con la nueva columna añadida.
    """
    if columna not in df_prolapse.columns:
        raise ValueError(f"La columna '{columna}' no se encuentra en df_prolapse.")
    
    df_aux = df_prolapse[['case', columna]].copy()
    df_aux[columna] = df_aux[columna].astype(int)
    
    df_merged = df.merge(df_aux, left_on='video', right_on='case', how='left')
    
    df_merged.drop(columns='case', inplace=True)
    
    return df_merged




In [10]:
df_any_prolapse_50 = agregar_columna(df_50, df_target, 'any_prolapse')
df_any_prolapse_60 = agregar_columna(df_60, df_target, 'any_prolapse')
df_cystocele_50 = agregar_columna(df_50, df_target, 'cystocele')
df_cystocele_60 = agregar_columna(df_60, df_target, 'cystocele')
df_cystourethrocele_50 = agregar_columna(df_50, df_target, 'cystourethrocele')
df_cystourethrocele_60 = agregar_columna(df_60, df_target, 'cystourethrocele')
df_uterine_prolapse_50 = agregar_columna(df_50, df_target, 'uterine_prolapse')
df_uterine_prolapse_60 = agregar_columna(df_60, df_target, 'uterine_prolapse')
df_rectocele_50 = agregar_columna(df_50, df_target, 'rectocele')
df_rectocele_60 = agregar_columna(df_60, df_target, 'rectocele')
df_cervical_elongation_50 = agregar_columna(df_50, df_target, 'cervical_elongation')
df_cervical_elongation_60 = agregar_columna(df_60, df_target, 'cervical_elongation')

In [None]:
def obtener_videos_sin_prolapso(df):
    """
    Obtiene los valores únicos de la columna 'video' donde 'any_prolapse' es NaN.
    
    :param df: DataFrame de Pandas.
    :return: Lista de valores únicos de la columna 'video'.
    """
    return df[df['any_prolapse'].isna()]['video'].unique()

def eliminar_videos(df, lista_videos):
    """
    Elimina del DataFrame las filas cuyo valor en la columna 'video' esté en la lista proporcionada
    y guarda el resultado en un archivo CSV.
    
    :param df: DataFrame de Pandas.
    :param lista_videos: Lista de valores a eliminar en la columna 'video'.
    :param archivo_salida: Nombre del archivo CSV de salida (por defecto 'videos_filtrados.csv').
    :return: DataFrame sin las filas cuyo 'video' esté en la lista.
    """
    df_filtrado = df[~df['video'].isin(lista_videos)].reset_index(drop=True)
    
    return df_filtrado


In [12]:
eliminar = obtener_videos_sin_prolapso(df_any_prolapse_50)
df_any_prolapse_50 = eliminar_videos(df_any_prolapse_50, eliminar)
df_any_prolapse_60 = eliminar_videos(df_any_prolapse_60, eliminar)
df_cystocele_50 = eliminar_videos(df_cystocele_50, eliminar)
df_cystocele_60 = eliminar_videos(df_cystocele_60, eliminar)
df_cystourethrocele_50 = eliminar_videos(df_cystourethrocele_50, eliminar)
df_cystourethrocele_60 = eliminar_videos(df_cystourethrocele_60, eliminar)
df_uterine_prolapse_50 = eliminar_videos(df_uterine_prolapse_50, eliminar)
df_uterine_prolapse_60 = eliminar_videos(df_uterine_prolapse_60, eliminar)
df_rectocele_50 = eliminar_videos(df_rectocele_50, eliminar)
df_rectocele_60 = eliminar_videos(df_rectocele_60, eliminar)
df_cervical_elongation_50 = eliminar_videos(df_cervical_elongation_50, eliminar)
df_cervical_elongation_60 = eliminar_videos(df_cervical_elongation_60, eliminar)

In [17]:
def best_seed_train_test_split(df, target_col="any_prolapse", n_pruebas=600, test_size=0.2):
    
    video_labels = (df.groupby("video")[target_col].first().reset_index())

    seeds = np.arange(1, n_pruebas + 1)
    best_seed = None
    best_diff = float("inf")
    best_train_videos = None
    best_test_videos = None

    for seed in seeds:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)


        for train_index, test_index in sss.split(video_labels["video"], video_labels[target_col]):
            train_videos = video_labels.loc[train_index, "video"]
            test_videos = video_labels.loc[test_index, "video"]

        df_train_temp = df[df["video"].isin(train_videos)]
        df_test_temp = df[df["video"].isin(test_videos)]

        train_ratio = df_train_temp[target_col].value_counts(normalize=True).sort_index()
        test_ratio = df_test_temp[target_col].value_counts(normalize=True).sort_index()

        diff = np.sum(np.abs(train_ratio - test_ratio))

        if diff < best_diff:
            best_diff = diff
            best_seed = seed
            best_train_videos = train_videos
            best_test_videos = test_videos

    df_train = df[df["video"].isin(best_train_videos)]
    df_test = df[df["video"].isin(best_test_videos)]

    print(f"Mejor seed encontrada")

    return df_train, df_test, best_seed

In [18]:
df_train_any_prolapse_50, df_test_any_prolapse_50, chosen_seed_any_prolapse_50 = best_seed_train_test_split(df_any_prolapse_50, target_col="any_prolapse", n_pruebas=600, test_size=0.2)
df_train_any_prolapse_60, df_test_any_prolapse_60, chosen_seed_any_prolapse_60 = best_seed_train_test_split(df_any_prolapse_60, target_col="any_prolapse", n_pruebas=600, test_size=0.2)
df_train_cystocele_50, df_test_cystocele_50, chosen_seed_cystocele_50 = best_seed_train_test_split(df_cystocele_50, target_col="cystocele", n_pruebas=600, test_size=0.2)
df_train_cystocele_60, df_test_cystocele_60, chosen_seed_cystocele_60 = best_seed_train_test_split(df_cystocele_60, target_col="cystocele", n_pruebas=600, test_size=0.2)
df_train_cystourethrocele_50, df_test_cystourethrocele_50, chosen_seed_cystourethrocele_50 = best_seed_train_test_split(df_cystourethrocele_50, target_col="cystourethrocele", n_pruebas=600, test_size=0.2)
df_train_cystourethrocele_60, df_test_cystourethrocele_60, chosen_seed_cystourethrocele_60 = best_seed_train_test_split(df_cystourethrocele_60, target_col="cystourethrocele", n_pruebas=600, test_size=0.2)
df_train_uterine_prolapse_50, df_test_uterine_prolapse_50, chosen_seed_uterine_prolapse_50 = best_seed_train_test_split(df_uterine_prolapse_50, target_col="uterine_prolapse", n_pruebas=600, test_size=0.2)
df_train_uterine_prolapse_60, df_test_uterine_prolapse_60, chosen_seed_uterine_prolapse_60 = best_seed_train_test_split(df_uterine_prolapse_60, target_col="uterine_prolapse", n_pruebas=600, test_size=0.2)
df_train_rectocele_50, df_test_rectocele_50, chosen_seed_rectocele_50 = best_seed_train_test_split(df_rectocele_50, target_col="rectocele", n_pruebas=600, test_size=0.2)
df_train_rectocele_60, df_test_rectocele_60, chosen_seed_rectocele_60 = best_seed_train_test_split(df_rectocele_60, target_col="rectocele", n_pruebas=600, test_size=0.2)
df_train_cervical_elongation_50, df_test_cervical_elongation_50, chosen_seed_cervical_elongation_50 = best_seed_train_test_split(df_cervical_elongation_50, target_col="cervical_elongation", n_pruebas=600, test_size=0.2)
df_train_cervical_elongation_60, df_test_cervical_elongation_60, chosen_seed_cervical_elongation_60 = best_seed_train_test_split(df_cervical_elongation_60, target_col="cervical_elongation", n_pruebas=600, test_size=0.2)


Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada
Mejor seed encontrada


In [21]:

joblib.dump((df_train_any_prolapse_50, df_test_any_prolapse_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'any_prolapse'/ 'train_test_split.pkl')
joblib.dump((df_train_any_prolapse_60, df_test_any_prolapse_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'any_prolapse'/ 'train_test_split_60.pkl')
joblib.dump((df_train_cystocele_50, df_test_cystocele_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystocele'/ 'train_test_split.pkl')
joblib.dump((df_train_cystocele_60, df_test_cystocele_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystocele'/ 'train_test_split_60.pkl')
joblib.dump((df_train_cystourethrocele_50, df_test_cystourethrocele_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystourethrocele'/ 'train_test_split.pkl')
joblib.dump((df_train_cystourethrocele_60, df_test_cystourethrocele_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystourethrocele'/ 'train_test_split_60.pkl')
joblib.dump((df_train_uterine_prolapse_50, df_test_uterine_prolapse_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'uterine_prolapse'/ 'train_test_split.pkl')
joblib.dump((df_train_uterine_prolapse_60, df_test_uterine_prolapse_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'uterine_prolapse'/ 'train_test_split_60.pkl')
joblib.dump((df_train_rectocele_50, df_test_rectocele_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'rectocele'/ 'train_test_split.pkl')
joblib.dump((df_train_rectocele_60, df_test_rectocele_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'rectocele'/ 'train_test_split_60.pkl')
joblib.dump((df_train_cervical_elongation_50, df_test_cervical_elongation_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cervical_elongation'/ 'train_test_split.pkl')
joblib.dump((df_train_cervical_elongation_60, df_test_cervical_elongation_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cervical_elongation'/ 'train_test_split_60.pkl')


['c:\\Users\\javie\\Universidad\\tfg\\pop-detection-ml\\data_storage\\train_test_splits\\cervical_elongation\\train_test_split_60.pkl']

In [22]:
def separar_X_y_groups(df, target_col, columns_to_drop=None, group_col="video"):
    if columns_to_drop is None:
        columns_to_drop = []

    all_cols_to_drop = set(columns_to_drop + [target_col])
    
    y = df[target_col]
    
    groups = df[group_col]
    
    X = df.drop(columns=all_cols_to_drop, errors='ignore')
    
    return X, y, groups

In [23]:
cols_to_remove = ["video", "frame_start", "frame_end"]
X_train_any_prolapse_50, y_train_any_prolapse_50, groups_train_any_prolapse_50 = separar_X_y_groups(df_train_any_prolapse_50,target_col="any_prolapse",columns_to_drop=cols_to_remove,group_col="video")
X_train_any_prolapse_60, y_train_any_prolapse_60, groups_train_any_prolapse_60 = separar_X_y_groups(df_train_any_prolapse_60,target_col="any_prolapse",columns_to_drop=cols_to_remove,group_col="video")
X_train_cystocele_50, y_train_cystocele_50, groups_train_cystocele_50 = separar_X_y_groups(df_train_cystocele_50,target_col="cystocele",columns_to_drop=cols_to_remove,group_col="video")
X_train_cystocele_60, y_train_cystocele_60, groups_train_cystocele_60 = separar_X_y_groups(df_train_cystocele_60,target_col="cystocele",columns_to_drop=cols_to_remove,group_col="video")
X_train_cystourethrocele_50, y_train_cystourethrocele_50, groups_train_cystourethrocele_50 = separar_X_y_groups(df_train_cystourethrocele_50,target_col="cystourethrocele",columns_to_drop=cols_to_remove,group_col="video")
X_train_cystourethrocele_60, y_train_cystourethrocele_60, groups_train_cystourethrocele_60 = separar_X_y_groups(df_train_cystourethrocele_60,target_col="cystourethrocele",columns_to_drop=cols_to_remove,group_col="video")
X_train_uterine_prolapse_50, y_train_uterine_prolapse_50, groups_train_uterine_prolapse_50 = separar_X_y_groups(df_train_uterine_prolapse_50,target_col="uterine_prolapse",columns_to_drop=cols_to_remove,group_col="video")
X_train_uterine_prolapse_60, y_train_uterine_prolapse_60, groups_train_uterine_prolapse_60 = separar_X_y_groups(df_train_uterine_prolapse_60,target_col="uterine_prolapse",columns_to_drop=cols_to_remove,group_col="video")
X_train_rectocele_50, y_train_rectocele_50, groups_train_rectocele_50 = separar_X_y_groups(df_train_rectocele_50,target_col="rectocele",columns_to_drop=cols_to_remove,group_col="video")
X_train_rectocele_60, y_train_rectocele_60, groups_train_rectocele_60 = separar_X_y_groups(df_train_rectocele_60,target_col="rectocele",columns_to_drop=cols_to_remove,group_col="video")
X_train_cervical_elongation_50, y_train_cervical_elongation_50, groups_train_cervical_elongation_50 = separar_X_y_groups(df_train_cervical_elongation_50,target_col="cervical_elongation",columns_to_drop=cols_to_remove,group_col="video")
X_train_cervical_elongation_60, y_train_cervical_elongation_60, groups_train_cervical_elongation_60 = separar_X_y_groups(df_train_cervical_elongation_60,target_col="cervical_elongation",columns_to_drop=cols_to_remove,group_col="video")


In [24]:
joblib.dump((X_train_any_prolapse_50, y_train_any_prolapse_50, groups_train_any_prolapse_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'any_prolapse'/ 'train_data.pkl')
joblib.dump((X_train_any_prolapse_60, y_train_any_prolapse_60, groups_train_any_prolapse_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'any_prolapse'/ 'train_data_60.pkl')
joblib.dump((X_train_cystocele_50, y_train_cystocele_50, groups_train_cystocele_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystocele'/ 'train_data.pkl')
joblib.dump((X_train_cystocele_60, y_train_cystocele_60, groups_train_cystocele_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystocele'/ 'train_data_60.pkl')
joblib.dump((X_train_cystourethrocele_50, y_train_cystourethrocele_50, groups_train_cystourethrocele_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystourethrocele'/ 'train_data.pkl')
joblib.dump((X_train_cystourethrocele_60, y_train_cystourethrocele_60, groups_train_cystourethrocele_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cystourethrocele'/ 'train_data_60.pkl')
joblib.dump((X_train_uterine_prolapse_50, y_train_uterine_prolapse_50, groups_train_uterine_prolapse_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'uterine_prolapse'/ 'train_data.pkl')
joblib.dump((X_train_uterine_prolapse_60, y_train_uterine_prolapse_60, groups_train_uterine_prolapse_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'uterine_prolapse'/ 'train_data_60.pkl')
joblib.dump((X_train_rectocele_50, y_train_rectocele_50, groups_train_rectocele_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'rectocele'/ 'train_data.pkl')
joblib.dump((X_train_rectocele_60, y_train_rectocele_60, groups_train_rectocele_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'rectocele'/ 'train_data_60.pkl')
joblib.dump((X_train_cervical_elongation_50, y_train_cervical_elongation_50, groups_train_cervical_elongation_50), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cervical_elongation'/ 'train_data.pkl')
joblib.dump((X_train_cervical_elongation_60, y_train_cervical_elongation_60, groups_train_cervical_elongation_60), BASE_DIR / 'data_storage'  / 'train_test_splits' / 'cervical_elongation'/ 'train_data_60.pkl')



['c:\\Users\\javie\\Universidad\\tfg\\pop-detection-ml\\data_storage\\train_test_splits\\cervical_elongation\\train_data_60.pkl']