# Preprocessing
Authors: Joel Enrique Díaz Villanueva, Fernando Lopez Barbosa

Organization: Universidad de Monterrey   

Created: 28 November 2025   

---

In [None]:
# Notebook setup: switch to the project checkout and import the helpers used by every cell below.
import os, sys
# The working directory is changed before the FeatureHelpers2 import, so this order must be kept.
os.chdir("C:/Users/USER/PEF/Terrain-Traversability-Analysis")
sys.path.append(os.getcwd())  # presumably what makes FeatureHelpers2 importable — confirm
import glob
from FeatureHelpers2 import new_df, is_good_frame, find_subdirectories_os
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): the per-class cells call `pl.read_csv`, but no name `pl` is imported here.
# Presumably `import polars as pl` is missing — confirm and add it before running.

## Adoquín

In [None]:
# Select and export the best frames of every recording folder of one terrain class.
#
# For each sub-folder of the class directory: read every CSV, keep the frames
# accepted by `is_good_frame`, rank them by the standard deviation of their
# '# TIMESTAMP (ns)' column (largest first) and write the `new_df` version of
# the top TOP_N frames into the class output directory.
class_name = 'adoquin'

# Maximum number of frames exported per recording folder.
TOP_N = 10
# Set to True to plot each exported frame (X vs Y, colored by Z) while debugging.
SHOW_PLOTS = False

class_root = rf'c:\Users\USER\Desktop\Everything\{class_name}'
# Plain raw string: doubling the backslashes inside an r-string (as before)
# puts literal "\\" pairs into the path.
output_dir = os.path.join(
    r'C:\Users\USER\PEF\Terrain-Traversability-Analysis\Preprocessing_10',
    class_name,
)
# The output folder does not depend on the recording, so create it once.
os.makedirs(output_dir, exist_ok=True)

for subdir in find_subdirectories_os(class_root):
    carpeta = rf'{class_root}\{subdir}'  # folder of one recording
    csv_files = glob.glob(os.path.join(carpeta, '**', '*.csv'), recursive=True)

    folder_name = os.path.basename(carpeta.rstrip('\\').rstrip('_'))
    print("="*40)
    print(f"\nFolder Name: {folder_name}")
    print(f"\nFound a total of {len(csv_files)} CSV files in the directory.")

    # One (timestamp std, transformed frame, source file name) entry per approved frame.
    approved = []
    for csv_path in csv_files:
        # NOTE(review): `pl` is not imported in the visible setup cell —
        # presumably `import polars as pl`; confirm it is in scope.
        df = pl.read_csv(csv_path, skip_rows=1)
        if not is_good_frame(df):
            continue
        approved.append(
            (df['# TIMESTAMP (ns)'].std(), new_df(df), os.path.basename(csv_path))
        )

    print(f"Approved frames: {len(approved)}")
    if not approved:
        continue

    # Rank by timestamp standard deviation, largest first. Sorting positions
    # (not a mixed [std, index] float array) keeps the indices as integers.
    scores = np.array([score for score, _, _ in approved])
    best_positions = scores.argsort()[::-1][:TOP_N]

    for rank, position in enumerate(best_positions, 1):
        _, frame, source_name = approved[position]
        print(f'\n--- # {rank} ---')
        print(f'Original File: {source_name}')
        print(f'Points: {frame.height}')

        if SHOW_PLOTS:
            plt.figure(figsize=(6, 6))
            plt.scatter(frame["X"], frame["Y"], s=1, c=frame["Z"], cmap="viridis")
            plt.xlabel("X (m)")
            plt.ylabel("Y (m)")
            plt.title(f"# {rank} - {source_name}\n(X vs Y) colored by Z")
            plt.colorbar(label="Height Z (m)")
            plt.xlim(-3, 3)
            plt.ylim(0, 3)
            plt.grid(True, alpha=0.3)
            plt.show()

        output_file = os.path.join(output_dir, f'{os.path.splitext(source_name)[0]}.csv')
        frame.write_csv(output_file)
        # Report the name actually written; the old message appended a second ".csv".
        print(f'Saved as: {os.path.basename(output_file)}')

    print(f"Total: {len(best_positions)} frames saved in {output_dir}")
    print("="*40)


Folder Name: adoquin_atras_de_rectoria

Found a total of 74 CSV files in the directory.
Approved frames: 50

--- # 1 ---
Original File: adoquin_atras_de_rectoria_61.csv
Points: 23336
Saved as: adoquin_atras_de_rectoria_61.csv.csv

--- # 2 ---
Original File: adoquin_atras_de_rectoria_2.csv
Points: 23330
Saved as: adoquin_atras_de_rectoria_2.csv.csv

--- # 3 ---
Original File: adoquin_atras_de_rectoria_72.csv
Points: 23328
Saved as: adoquin_atras_de_rectoria_72.csv.csv

--- # 4 ---
Original File: adoquin_atras_de_rectoria_33.csv
Points: 23327
Saved as: adoquin_atras_de_rectoria_33.csv.csv

--- # 5 ---
Original File: adoquin_atras_de_rectoria_15.csv
Points: 23337
Saved as: adoquin_atras_de_rectoria_15.csv.csv

--- # 6 ---
Original File: adoquin_atras_de_rectoria_6.csv
Points: 23331
Saved as: adoquin_atras_de_rectoria_6.csv.csv

--- # 7 ---
Original File: adoquin_atras_de_rectoria_46.csv
Points: 23333
Saved as: adoquin_atras_de_rectoria_46.csv.csv

--- # 8 ---
Original File: adoquin_atras

## Asfalto

In [3]:
# Select and export the best frames of every recording folder of one terrain class.
#
# For each sub-folder of the class directory: read every CSV, keep the frames
# accepted by `is_good_frame`, rank them by the standard deviation of their
# '# TIMESTAMP (ns)' column (largest first) and write the `new_df` version of
# the top TOP_N frames into the class output directory.
class_name = 'asfalto'

# Maximum number of frames exported per recording folder.
TOP_N = 10
# Set to True to plot each exported frame (X vs Y, colored by Z) while debugging.
SHOW_PLOTS = False

class_root = rf'c:\Users\USER\Desktop\Everything\{class_name}'
# Plain raw string: doubling the backslashes inside an r-string (as before)
# puts literal "\\" pairs into the path.
output_dir = os.path.join(
    r'C:\Users\USER\PEF\Terrain-Traversability-Analysis\Preprocessing_10',
    class_name,
)
# The output folder does not depend on the recording, so create it once.
os.makedirs(output_dir, exist_ok=True)

for subdir in find_subdirectories_os(class_root):
    carpeta = rf'{class_root}\{subdir}'  # folder of one recording
    csv_files = glob.glob(os.path.join(carpeta, '**', '*.csv'), recursive=True)

    folder_name = os.path.basename(carpeta.rstrip('\\').rstrip('_'))
    print("="*40)
    print(f"\nFolder Name: {folder_name}")
    print(f"\nFound a total of {len(csv_files)} CSV files in the directory.")

    # One (timestamp std, transformed frame, source file name) entry per approved frame.
    approved = []
    for csv_path in csv_files:
        # NOTE(review): `pl` is not imported in the visible setup cell —
        # presumably `import polars as pl`; confirm it is in scope.
        df = pl.read_csv(csv_path, skip_rows=1)
        if not is_good_frame(df):
            continue
        approved.append(
            (df['# TIMESTAMP (ns)'].std(), new_df(df), os.path.basename(csv_path))
        )

    print(f"Approved frames: {len(approved)}")
    if not approved:
        continue

    # Rank by timestamp standard deviation, largest first. Sorting positions
    # (not a mixed [std, index] float array) keeps the indices as integers.
    scores = np.array([score for score, _, _ in approved])
    best_positions = scores.argsort()[::-1][:TOP_N]

    for rank, position in enumerate(best_positions, 1):
        _, frame, source_name = approved[position]
        print(f'\n--- # {rank} ---')
        print(f'Original File: {source_name}')
        print(f'Points: {frame.height}')

        if SHOW_PLOTS:
            plt.figure(figsize=(6, 6))
            plt.scatter(frame["X"], frame["Y"], s=1, c=frame["Z"], cmap="viridis")
            plt.xlabel("X (m)")
            plt.ylabel("Y (m)")
            plt.title(f"# {rank} - {source_name}\n(X vs Y) colored by Z")
            plt.colorbar(label="Height Z (m)")
            plt.xlim(-3, 3)
            plt.ylim(0, 3)
            plt.grid(True, alpha=0.3)
            plt.show()

        output_file = os.path.join(output_dir, f'{os.path.splitext(source_name)[0]}.csv')
        frame.write_csv(output_file)
        # Report the name actually written; the old message appended a second ".csv".
        print(f'Saved as: {os.path.basename(output_file)}')

    print(f"Total: {len(best_positions)} frames saved in {output_dir}")
    print("="*40)


Folder Name: asfalto_estacionamiento_biblio

Found a total of 75 CSV files in the directory.
Approved frames: 73

--- # 1 ---
Original File: asfalto_estacionamiento_biblio_57.csv
Points: 20964
Saved as: asfalto_estacionamiento_biblio_57.csv.csv

--- # 2 ---
Original File: asfalto_estacionamiento_biblio_15.csv
Points: 20980
Saved as: asfalto_estacionamiento_biblio_15.csv.csv

--- # 3 ---
Original File: asfalto_estacionamiento_biblio_28.csv
Points: 20985
Saved as: asfalto_estacionamiento_biblio_28.csv.csv

--- # 4 ---
Original File: asfalto_estacionamiento_biblio_69.csv
Points: 20954
Saved as: asfalto_estacionamiento_biblio_69.csv.csv

--- # 5 ---
Original File: asfalto_estacionamiento_biblio_43.csv
Points: 20974
Saved as: asfalto_estacionamiento_biblio_43.csv.csv

--- # 6 ---
Original File: asfalto_estacionamiento_biblio_39.csv
Points: 20966
Saved as: asfalto_estacionamiento_biblio_39.csv.csv

--- # 7 ---
Original File: asfalto_estacionamiento_biblio_24.csv
Points: 20977
Saved as: asfa

## Concreto

In [None]:
# Select and export the best frames of every recording folder of one terrain class.
#
# For each sub-folder of the class directory: read every CSV, keep the frames
# accepted by `is_good_frame`, rank them by the standard deviation of their
# '# TIMESTAMP (ns)' column (largest first) and write the `new_df` version of
# the top TOP_N frames into the class output directory.
class_name = 'concreto'

# Maximum number of frames exported per recording folder.
TOP_N = 10
# Set to True to plot each exported frame (X vs Y, colored by Z) while debugging.
SHOW_PLOTS = False

class_root = rf'c:\Users\USER\Desktop\Everything\{class_name}'
# Plain raw string: doubling the backslashes inside an r-string (as before)
# puts literal "\\" pairs into the path.
output_dir = os.path.join(
    r'C:\Users\USER\PEF\Terrain-Traversability-Analysis\Preprocessing_10',
    class_name,
)
# The output folder does not depend on the recording, so create it once.
os.makedirs(output_dir, exist_ok=True)

for subdir in find_subdirectories_os(class_root):
    carpeta = rf'{class_root}\{subdir}'  # folder of one recording
    csv_files = glob.glob(os.path.join(carpeta, '**', '*.csv'), recursive=True)

    folder_name = os.path.basename(carpeta.rstrip('\\').rstrip('_'))
    print("="*40)
    print(f"\nFolder Name: {folder_name}")
    print(f"\nFound a total of {len(csv_files)} CSV files in the directory.")

    # One (timestamp std, transformed frame, source file name) entry per approved frame.
    approved = []
    for csv_path in csv_files:
        # NOTE(review): `pl` is not imported in the visible setup cell —
        # presumably `import polars as pl`; confirm it is in scope.
        df = pl.read_csv(csv_path, skip_rows=1)
        if not is_good_frame(df):
            continue
        approved.append(
            (df['# TIMESTAMP (ns)'].std(), new_df(df), os.path.basename(csv_path))
        )

    print(f"Approved frames: {len(approved)}")
    if not approved:
        continue

    # Rank by timestamp standard deviation, largest first. Sorting positions
    # (not a mixed [std, index] float array) keeps the indices as integers.
    scores = np.array([score for score, _, _ in approved])
    best_positions = scores.argsort()[::-1][:TOP_N]

    for rank, position in enumerate(best_positions, 1):
        _, frame, source_name = approved[position]
        print(f'\n--- # {rank} ---')
        print(f'Original File: {source_name}')
        print(f'Points: {frame.height}')

        if SHOW_PLOTS:
            plt.figure(figsize=(6, 6))
            plt.scatter(frame["X"], frame["Y"], s=1, c=frame["Z"], cmap="viridis")
            plt.xlabel("X (m)")
            plt.ylabel("Y (m)")
            plt.title(f"# {rank} - {source_name}\n(X vs Y) colored by Z")
            plt.colorbar(label="Height Z (m)")
            plt.xlim(-3, 3)
            plt.ylim(0, 3)
            plt.grid(True, alpha=0.3)
            plt.show()

        output_file = os.path.join(output_dir, f'{os.path.splitext(source_name)[0]}.csv')
        frame.write_csv(output_file)
        # Report the name of the file that was actually written.
        print(f'Saved as: {os.path.basename(output_file)}')

    print(f"Total: {len(best_positions)} frames saved in {output_dir}")
    print("="*40)


Folder Name: concreto_ccu

Found a total of 71 CSV files in the directory.
Approved frames: 71

--- # 1 ---
Original File: cemento_ccu_5.csv
Points: 21702
Saved as: cemento_ccu_5.csv.csv

--- # 2 ---
Original File: cemento_ccu_26.csv
Points: 21688
Saved as: cemento_ccu_26.csv.csv

--- # 3 ---
Original File: cemento_ccu_25.csv
Points: 21696
Saved as: cemento_ccu_25.csv.csv

--- # 4 ---
Original File: cemento_ccu_30.csv
Points: 21696
Saved as: cemento_ccu_30.csv.csv

--- # 5 ---
Original File: cemento_ccu_7.csv
Points: 21698
Saved as: cemento_ccu_7.csv.csv

--- # 6 ---
Original File: cemento_ccu_61.csv
Points: 21694
Saved as: cemento_ccu_61.csv.csv

--- # 7 ---
Original File: cemento_ccu_68.csv
Points: 21835
Saved as: cemento_ccu_68.csv.csv

--- # 8 ---
Original File: cemento_ccu_64.csv
Points: 21685
Saved as: cemento_ccu_64.csv.csv

--- # 9 ---
Original File: cemento_ccu_49.csv
Points: 21692
Saved as: cemento_ccu_49.csv.csv

--- # 10 ---
Original File: cemento_ccu_66.csv
Points: 21682


## Grava

In [5]:
# Select and export the best frames of every recording folder of one terrain class.
#
# For each sub-folder of the class directory: read every CSV, keep the frames
# accepted by `is_good_frame`, rank them by the standard deviation of their
# '# TIMESTAMP (ns)' column (largest first) and write the `new_df` version of
# the top TOP_N frames into the class output directory.
class_name = 'grava'

# Maximum number of frames exported per recording folder.
TOP_N = 10
# Set to True to plot each exported frame (X vs Y, colored by Z) while debugging.
SHOW_PLOTS = False

class_root = rf'c:\Users\USER\Desktop\Everything\{class_name}'
# Plain raw string: doubling the backslashes inside an r-string (as before)
# puts literal "\\" pairs into the path.
output_dir = os.path.join(
    r'C:\Users\USER\PEF\Terrain-Traversability-Analysis\Preprocessing_10',
    class_name,
)
# The output folder does not depend on the recording, so create it once.
os.makedirs(output_dir, exist_ok=True)

for subdir in find_subdirectories_os(class_root):
    carpeta = rf'{class_root}\{subdir}'  # folder of one recording
    csv_files = glob.glob(os.path.join(carpeta, '**', '*.csv'), recursive=True)

    folder_name = os.path.basename(carpeta.rstrip('\\').rstrip('_'))
    print("="*40)
    print(f"\nFolder Name: {folder_name}")
    print(f"\nFound a total of {len(csv_files)} CSV files in the directory.")

    # One (timestamp std, transformed frame, source file name) entry per approved frame.
    approved = []
    for csv_path in csv_files:
        # NOTE(review): `pl` is not imported in the visible setup cell —
        # presumably `import polars as pl`; confirm it is in scope.
        df = pl.read_csv(csv_path, skip_rows=1)
        if not is_good_frame(df):
            continue
        approved.append(
            (df['# TIMESTAMP (ns)'].std(), new_df(df), os.path.basename(csv_path))
        )

    print(f"Approved frames: {len(approved)}")
    if not approved:
        continue

    # Rank by timestamp standard deviation, largest first. Sorting positions
    # (not a mixed [std, index] float array) keeps the indices as integers.
    scores = np.array([score for score, _, _ in approved])
    best_positions = scores.argsort()[::-1][:TOP_N]

    for rank, position in enumerate(best_positions, 1):
        _, frame, source_name = approved[position]
        print(f'\n--- # {rank} ---')
        print(f'Original File: {source_name}')
        print(f'Points: {frame.height}')

        if SHOW_PLOTS:
            plt.figure(figsize=(6, 6))
            plt.scatter(frame["X"], frame["Y"], s=1, c=frame["Z"], cmap="viridis")
            plt.xlabel("X (m)")
            plt.ylabel("Y (m)")
            plt.title(f"# {rank} - {source_name}\n(X vs Y) colored by Z")
            plt.colorbar(label="Height Z (m)")
            plt.xlim(-3, 3)
            plt.ylim(0, 3)
            plt.grid(True, alpha=0.3)
            plt.show()

        output_file = os.path.join(output_dir, f'{os.path.splitext(source_name)[0]}.csv')
        frame.write_csv(output_file)
        # Report the name actually written; the old message appended a second ".csv".
        print(f'Saved as: {os.path.basename(output_file)}')

    print(f"Total: {len(best_positions)} frames saved in {output_dir}")
    print("="*40)


Folder Name: grava_25deseptiembre

Found a total of 82 CSV files in the directory.
Approved frames: 70

--- # 1 ---
Original File: grava_25deseptiembre_33.csv
Points: 24483
Saved as: grava_25deseptiembre_33.csv.csv

--- # 2 ---
Original File: grava_25deseptiembre_44.csv
Points: 24472
Saved as: grava_25deseptiembre_44.csv.csv

--- # 3 ---
Original File: grava_25deseptiembre_35.csv
Points: 24479
Saved as: grava_25deseptiembre_35.csv.csv

--- # 4 ---
Original File: grava_25deseptiembre_53.csv
Points: 24483
Saved as: grava_25deseptiembre_53.csv.csv

--- # 5 ---
Original File: grava_25deseptiembre_12.csv
Points: 24476
Saved as: grava_25deseptiembre_12.csv.csv

--- # 6 ---
Original File: grava_25deseptiembre_7.csv
Points: 24480
Saved as: grava_25deseptiembre_7.csv.csv

--- # 7 ---
Original File: grava_25deseptiembre_74.csv
Points: 24477
Saved as: grava_25deseptiembre_74.csv.csv

--- # 8 ---
Original File: grava_25deseptiembre_40.csv
Points: 24479
Saved as: grava_25deseptiembre_40.csv.csv

-

## Pasto

In [6]:
# Select and export the best frames of every recording folder of one terrain class.
#
# For each sub-folder of the class directory: read every CSV, keep the frames
# accepted by `is_good_frame`, rank them by the standard deviation of their
# '# TIMESTAMP (ns)' column (largest first) and write the `new_df` version of
# the top TOP_N frames into the class output directory.
class_name = 'pasto'

# Maximum number of frames exported per recording folder.
TOP_N = 10
# Set to True to plot each exported frame (X vs Y, colored by Z) while debugging.
SHOW_PLOTS = False

class_root = rf'c:\Users\USER\Desktop\Everything\{class_name}'
# Plain raw string: doubling the backslashes inside an r-string (as before)
# puts literal "\\" pairs into the path.
output_dir = os.path.join(
    r'C:\Users\USER\PEF\Terrain-Traversability-Analysis\Preprocessing_10',
    class_name,
)
# The output folder does not depend on the recording, so create it once.
os.makedirs(output_dir, exist_ok=True)

for subdir in find_subdirectories_os(class_root):
    carpeta = rf'{class_root}\{subdir}'  # folder of one recording
    csv_files = glob.glob(os.path.join(carpeta, '**', '*.csv'), recursive=True)

    folder_name = os.path.basename(carpeta.rstrip('\\').rstrip('_'))
    print("="*40)
    print(f"\nFolder Name: {folder_name}")
    print(f"\nFound a total of {len(csv_files)} CSV files in the directory.")

    # One (timestamp std, transformed frame, source file name) entry per approved frame.
    approved = []
    for csv_path in csv_files:
        # NOTE(review): `pl` is not imported in the visible setup cell —
        # presumably `import polars as pl`; confirm it is in scope.
        df = pl.read_csv(csv_path, skip_rows=1)
        if not is_good_frame(df):
            continue
        approved.append(
            (df['# TIMESTAMP (ns)'].std(), new_df(df), os.path.basename(csv_path))
        )

    print(f"Approved frames: {len(approved)}")
    if not approved:
        continue

    # Rank by timestamp standard deviation, largest first. Sorting positions
    # (not a mixed [std, index] float array) keeps the indices as integers.
    scores = np.array([score for score, _, _ in approved])
    best_positions = scores.argsort()[::-1][:TOP_N]

    for rank, position in enumerate(best_positions, 1):
        _, frame, source_name = approved[position]
        print(f'\n--- # {rank} ---')
        print(f'Original File: {source_name}')
        print(f'Points: {frame.height}')

        if SHOW_PLOTS:
            plt.figure(figsize=(6, 6))
            plt.scatter(frame["X"], frame["Y"], s=1, c=frame["Z"], cmap="viridis")
            plt.xlabel("X (m)")
            plt.ylabel("Y (m)")
            plt.title(f"# {rank} - {source_name}\n(X vs Y) colored by Z")
            plt.colorbar(label="Height Z (m)")
            plt.xlim(-3, 3)
            plt.ylim(0, 3)
            plt.grid(True, alpha=0.3)
            plt.show()

        output_file = os.path.join(output_dir, f'{os.path.splitext(source_name)[0]}.csv')
        frame.write_csv(output_file)
        # Report the name actually written; the old message appended a second ".csv".
        print(f'Saved as: {os.path.basename(output_file)}')

    print(f"Total: {len(best_positions)} frames saved in {output_dir}")
    print("="*40)


Folder Name: cancha_de_futbol_1

Found a total of 74 CSV files in the directory.
Approved frames: 72

--- # 1 ---
Original File: cancha_de_futbol_1_17.csv
Points: 21881
Saved as: cancha_de_futbol_1_17.csv.csv

--- # 2 ---
Original File: cancha_de_futbol_1_5.csv
Points: 21877
Saved as: cancha_de_futbol_1_5.csv.csv

--- # 3 ---
Original File: cancha_de_futbol_1_68.csv
Points: 21871
Saved as: cancha_de_futbol_1_68.csv.csv

--- # 4 ---
Original File: cancha_de_futbol_1_55.csv
Points: 21867
Saved as: cancha_de_futbol_1_55.csv.csv

--- # 5 ---
Original File: cancha_de_futbol_1_10.csv
Points: 21892
Saved as: cancha_de_futbol_1_10.csv.csv

--- # 6 ---
Original File: cancha_de_futbol_1_29.csv
Points: 21886
Saved as: cancha_de_futbol_1_29.csv.csv

--- # 7 ---
Original File: cancha_de_futbol_1_43.csv
Points: 21882
Saved as: cancha_de_futbol_1_43.csv.csv

--- # 8 ---
Original File: cancha_de_futbol_1_39.csv
Points: 21888
Saved as: cancha_de_futbol_1_39.csv.csv

--- # 9 ---
Original File: cancha_d