In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split

In [None]:
# Directory that is listed with os.listdir and from which the per-file CSVs are read.
INPUT_PATH = "Broken_terrains_datasets"

# Shared seed: used for NumPy's RNG, DataFrame.sample and train_test_split in this notebook.
RANDOM_STATE = 42

In [None]:
# Reproducibility: seed NumPy's global RNG and keep a dedicated generator around.
np.random.seed(RANDOM_STATE)
random_state = np.random.RandomState(RANDOM_STATE)

# Show every column when a DataFrame is rendered.
pd.options.display.max_columns = None

In [None]:
# Directory listing of the input folder (kept for inspection).
files = os.listdir(INPUT_PATH)

# The inputs are addressed by number: 0.txt, 1.txt, ..., 999.txt.
input_paths = [
    os.path.join(INPUT_PATH, f"{file_number}" + ".txt")
    for file_number in range(1000)
]

# Each file is a ';'-separated table using '.' as the decimal mark.
dfs = [pd.read_csv(path, sep=';', decimal='.') for path in input_paths]

In [None]:
# Stack all per-file frames into one table with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Rows where none of the three X_C_Neighbor columns holds the 'undefined' marker.
neighbors_defined = (
    df['X_C_Neighbor1'].ne('undefined')
    & df['X_C_Neighbor2'].ne('undefined')
    & df['X_C_Neighbor3'].ne('undefined')
)

# Rows where Z_N and the three n*_zn columns are all different from zero.
z_components_nonzero = (
    df['Z_N'].ne(0)
    & df['n1_zn'].ne(0)
    & df['n2_zn'].ne(0)
    & df['n3_zn'].ne(0)
)

# Rows whose DOC value is strictly below 0.90.
doc_below_threshold = df['DOC'].lt(0.90)

# Keep rows passing every condition and renumber them from zero.
filtered_df = df[
    neighbors_defined & z_components_nonzero & doc_below_threshold
].reset_index(drop=True)

In [None]:
# Input columns: one value per neighbor (1..3) for each metric / suffix combination.
euclidean_n = [
    'EuclideanNeighbor1_N',
    'EuclideanNeighbor2_N',
    'EuclideanNeighbor3_N',
]
euclidean_d = [
    'EuclideanNeighbor1_D',
    'EuclideanNeighbor2_D',
    'EuclideanNeighbor3_D',
]
cosine_n = [
    'CosineNeighbor1_N',
    'CosineNeighbor2_N',
    'CosineNeighbor3_N',
]
cosine_d = [
    'CosineNeighbor1_D',
    'CosineNeighbor2_D',
    'CosineNeighbor3_D',
]
angle_n = [
    'AngleNeighbor1_N',
    'AngleNeighbor2_N',
    'AngleNeighbor3_N',
]
angle_d = [
    'AngleNeighbor1_D',
    'AngleNeighbor2_D',
    'AngleNeighbor3_D',
]

# Output columns, always in the order: max, min, intermediate.
euclidean_n_sorted = [
    'Euclidean_N_Max',
    'Euclidean_N_Min',
    'Euclidean_N_Intermediate',
]
euclidean_d_sorted = [
    'Euclidean_D_Max',
    'Euclidean_D_Min',
    'Euclidean_D_Intermediate',
]
cosine_n_sorted = [
    'Cosine_N_Max',
    'Cosine_N_Min',
    'Cosine_N_Intermediate',
]
cosine_d_sorted = [
    'Cosine_D_Max',
    'Cosine_D_Min',
    'Cosine_D_Intermediate',
]
angle_n_sorted = [
    'Angle_N_Max',
    'Angle_N_Min',
    'Angle_N_Intermediate',
]
angle_d_sorted = [
    'Angle_D_Max',
    'Angle_D_Min',
    'Angle_D_Intermediate',
]

# Pair every input column group with its output column group: (inputs, outputs).
sorting_pairs = list(zip(
    [euclidean_n, euclidean_d, cosine_n, cosine_d, angle_n, angle_d],
    [
        euclidean_n_sorted,
        euclidean_d_sorted,
        cosine_n_sorted,
        cosine_d_sorted,
        angle_n_sorted,
        angle_d_sorted,
    ],
))

In [None]:
def sort_values(row: pd.Series, output_columns: list) -> pd.Series:
    """
    Order three Neighbor values and return them as max, min, intermediate.

    The intermediate value is read directly from the sorted row rather than
    derived as ``sum - max - min``, so it is returned exactly as it appears in
    the input (no floating-point rounding error is introduced).

    Parameters
    ----------
    row : pd.Series
        A pandas Series containing exactly three Neighbor values.
    output_columns : list
        A list of three column names for the output Series, in the order:
        maximum value, minimum value, intermediate value.

    Returns
    -------
    pd.Series
        A pandas Series indexed by ``output_columns`` holding, in this order,
        the maximum, the minimum, and the intermediate value of ``row``.

    Raises
    ------
    ValueError
        If ``row`` does not contain exactly three values.

    Notes
    -----
    ``np.sort`` places NaN last, so a row containing a missing value yields
    NaN in the maximum slot instead of a silently incorrect intermediate.
    """
    if len(row) != 3:
        raise ValueError(
            f"sort_values expects exactly 3 values, got {len(row)}"
        )
    # Ascending sort of three values gives: smallest, middle, largest.
    min_val, remaining_val, max_val = np.sort(row.to_numpy())
    return pd.Series([max_val, min_val, remaining_val], index=output_columns)

In [None]:
# One frame per (input columns, output columns) pair: each row's three values
# are passed through sort_values and come back under the output column names.
sorted_dfs = [
    filtered_df.loc[:, list(source_cols)].apply(
        sort_values,
        axis=1,
        args=(list(target_cols),),
    )
    for source_cols, target_cols in sorting_pairs
]

In [None]:
# Assemble the modelling table column-wise: the six X/Y/Z columns first, then
# every max/min/intermediate frame, then the file number and the Fault label.
sorted_df = pd.concat(
    [
        filtered_df[['X_N', 'Y_N', 'Z_N', 'X_D', 'Y_D', 'Z_D']],
        *sorted_dfs,
        filtered_df[['File_number', 'Fault']],
    ],
    axis=1,
)

In [None]:
# Work on a copy so the balancing step starts from an independent frame.
df_for_downsampling = sorted_df.copy()

# Split the rows by label value.
class_0 = df_for_downsampling[df_for_downsampling['Fault'] == -1]
class_1 = df_for_downsampling[df_for_downsampling['Fault'] == 1]

# Count each class from its own subset. value_counts() orders by frequency,
# not by label, so unpacking it positionally would silently assume that
# label -1 is the majority class.
class_count_0, class_count_1 = len(class_0), len(class_1)

# Print the shape of each class.
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

# Downsample to the size of the smaller class. Class 0 is always drawn through
# sample(); class 1 is only sampled when it is the larger one, so the result is
# unchanged whenever label -1 is the majority (or the classes are equal).
minority_count = min(class_count_0, class_count_1)
class_0_under = class_0.sample(minority_count, random_state=RANDOM_STATE)
class_1_under = (
    class_1.sample(minority_count, random_state=RANDOM_STATE)
    if class_count_1 > minority_count
    else class_1
)

undersampled_df = pd.concat([class_0_under, class_1_under], axis=0)

In [None]:
# Features: everything except the label and the file identifier.
X = undersampled_df.drop(columns=['Fault', 'File_number'])

# Target: relabel on an explicit copy so the assignment neither writes through
# to `undersampled_df` nor triggers chained-assignment warnings, and the cell
# gives the same result when re-run.
y = undersampled_df['Fault'].copy()
y.loc[y == -1] = 0  # Change labels from -1, 1 to 0, 1

# Hold out 25% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

In [None]:
# to_csv does not create missing directories; make sure the target exists so
# the export also works on a fresh checkout.
os.makedirs('raw_variables', exist_ok=True)

# Write each feature to its own single-column CSV (no index, no header),
# once for the training split and once for the test split.
for col in X_train.columns:
    X_train[[col]].to_csv(f'raw_variables/X_train_{col}.csv', index=False, header=False)
    X_test[[col]].to_csv(f'raw_variables/X_test_{col}.csv', index=False, header=False)