In [19]:
import os

import numpy as np
import pandas as pd
from docx import Document
from scipy import stats
from sklearn.model_selection import  train_test_split
from statsmodels.stats.diagnostic import lilliefors

In [None]:
INPUT_PATH = "Broken_terrains_datasets"

ALPHA = 0.05

RANDOM_STATE = 42

In [21]:
pd.set_option('display.max_columns', None)

np.random.seed(RANDOM_STATE)
random_state = np.random.RandomState(RANDOM_STATE)

In [22]:
files = os.listdir(INPUT_PATH)
dfs = [pd.read_csv(os.path.join(INPUT_PATH, str(file) + ".txt"), decimal='.', sep=';') for file in range(1000)]

In [23]:
df = pd.concat(
    dfs,
    ignore_index=True
)

In [24]:
filtered_df=df[
    (df.X_C_Neighbor1!='undefined') 
    & (df.X_C_Neighbor2!='undefined')
    & (df.X_C_Neighbor3!='undefined') 
    & (df.Z_N!=0) 
    & (df.n1_zn!=0) 
    & (df.n2_zn!=0) 
    & (df.n3_zn!=0) 
    & (df.DOC<0.90)  
].reset_index(drop=True)

In [25]:
euclidean_n = ['EuclideanNeighbor1_N', 'EuclideanNeighbor2_N','EuclideanNeighbor3_N']
euclidean_d = ['EuclideanNeighbor1_D', 'EuclideanNeighbor2_D','EuclideanNeighbor3_D']
cosine_n = ['CosineNeighbor1_N', 'CosineNeighbor2_N','CosineNeighbor3_N']
cosine_d = ['CosineNeighbor1_D', 'CosineNeighbor2_D','CosineNeighbor3_D']
angle_n = ['AngleNeighbor1_N', 'AngleNeighbor2_N','AngleNeighbor3_N']
angle_d = ['AngleNeighbor1_D', 'AngleNeighbor2_D','AngleNeighbor3_D']

euclidean_n_sorted = ['Euclidean_N_Max', 'Euclidean_N_Min', 'Euclidean_N_Intermediate']
euclidean_d_sorted = ['Euclidean_D_Max', 'Euclidean_D_Min', 'Euclidean_D_Intermediate']
cosine_n_sorted = ['Cosine_N_Max', 'Cosine_N_Min', 'Cosine_N_Intermediate']
cosine_d_sorted = ['Cosine_D_Max', 'Cosine_D_Min', 'Cosine_D_Intermediate']
angle_n_sorted = ['Angle_N_Max', 'Angle_N_Min', 'Angle_N_Intermediate']
angle_d_sorted = ['Angle_D_Max', 'Angle_D_Min', 'Angle_D_Intermediate']

sorting_pairs = [
    (euclidean_n, euclidean_n_sorted),
    (euclidean_d, euclidean_d_sorted),
    (cosine_n, cosine_n_sorted),
    (cosine_d, cosine_d_sorted),
    (angle_n, angle_n_sorted),
    (angle_d, angle_d_sorted)
]

In [26]:
def sort_values(row: pd.Series, output_columns: list) -> pd.Series:
    """
    Sort Neighbor values in descending order and return a Series with max, intermediate, and min values.

    Parameters
    ----------
    row : pd.Series
        A pandas Series containing Neighbor values.
    output_columns : list
        A list of column names for the output Series.
        Maximum value, intermediate value, minimum value.

    Returns
    -------
    pd.Series
        A pandas Series with the maximum, intermediate, and minimum values.
    """
    max_val = row.max()
    min_val = row.min()
    remaining_val = row.sum() - max_val - min_val
    return pd.Series([max_val, min_val, remaining_val], index=output_columns)

In [27]:
sorted_dfs = [
    filtered_df[list(cols)].apply(sort_values, axis=1, output_columns=list(sorted_cols))
    for cols, sorted_cols in sorting_pairs
]

In [28]:
sorted_df=pd.concat([
    filtered_df[['X_N']],
    filtered_df[['Y_N']],
    filtered_df[['Z_N']],
    filtered_df[['X_D']],
    filtered_df[['Y_D']],
    filtered_df[['Z_D']],   
    *sorted_dfs,
    filtered_df[['File_number']],
    filtered_df[['Fault']]   
    ], 
    axis=1
)

In [29]:
df_for_downsampling = sorted_df.copy()
class_count_0, class_count_1 = df_for_downsampling['Fault'].value_counts()
class_0 = df_for_downsampling[df_for_downsampling['Fault'] == -1]
class_1 = df_for_downsampling[df_for_downsampling['Fault'] == 1]# print the shape of the class
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

class_0_under = class_0.sample(class_count_1, random_state=RANDOM_STATE)

undersampled_df = pd.concat([class_0_under, class_1], axis=0)


class 0: (132886, 26)
class 1: (12411, 26)


In [30]:
X = undersampled_df.drop(columns=['Fault', 'File_number'])
y = undersampled_df['Fault']
y[y == -1] = 0  # Change labels from -1, 1 to 0, 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y == -1] = 0  # Change labels from -1, 1 to 0, 1


In [55]:
def check_normality(df: pd.DataFrame, alpha: float = 0.05) -> dict[str, bool]:
    """
    Check the normality of the data using:
    - Lilliefors test
    - Shapiro-Wilk test
    - Jarque-Bera test

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the data to be tested.
    alpha : float, optional
        Significance level for the tests (default is 0.05).

    Returns
    -------
    pd.DataFrame
        A DataFrame with the results of the normality tests.
    """
    result = {}

    for col in df.columns:
        data = df[col].values
        lilliefors_stat, lilliefors_p_value = lilliefors(data)
        shapiro_stat, shapiro_p_value = stats.shapiro(data)
        jarque_bera_stat, jarque_bera_p_value = stats.jarque_bera(data)

        result[col] = {
            "Lilliefors stat": lilliefors_stat,
            "Lilliefors p value": lilliefors_p_value,
            "Lilliefors normal": lilliefors_p_value > alpha,
            "Shapiro stat": shapiro_stat,
            "Shapiro p value": shapiro_p_value,
            "Shapiro normal": shapiro_p_value > alpha,
            "Jarque-Bera stat": jarque_bera_stat,
            "Jarque-Bera p value": jarque_bera_p_value,
            "Jarque-Bera normal": jarque_bera_p_value > alpha,
        }
    
    result_df = pd.DataFrame(result).T
    result_df = result_df.astype({
        "Lilliefors stat": "float",
        "Lilliefors p value": "float",
        "Lilliefors normal": "bool",
        "Shapiro stat": "float",
        "Shapiro p value": "float",
        "Shapiro normal": "bool",
        "Jarque-Bera stat": "float",
        "Jarque-Bera p value": "float",
        "Jarque-Bera normal": "bool",
    })
    float_cols = result_df.select_dtypes(include=['float']).columns
    result_df[float_cols] = result_df[float_cols].map(lambda x: f"{x:.4f}")
    result_df.index.name = "Feature"
    return result_df

In [56]:
datasets = {
    "X_train": pd.DataFrame(X_train, columns=undersampled_df.drop(columns=['Fault', 'File_number']).columns),
    "X_test": pd.DataFrame(X_test, columns=undersampled_df.drop(columns=['Fault', 'File_number']).columns),
}

In [57]:
normality_results = {}
for name, dataset in datasets.items():
    normality_results[name] = check_normality(dataset, alpha=ALPHA)

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [58]:
def check_variance(df_1: pd.DataFrame, df_1_name: str, df_2: pd.DataFrame, df_2_name: str, alpha: float = 0.05) -> pd.DataFrame:
    """
    Check the equality of variances between two DataFrames using:
    - Brown-Forsythe test

    Parameters
    ----------
    df_1 : pd.DataFrame
        The first input DataFrame.
    df_1_name : str
        The name of the first DataFrame.
    df_2 : pd.DataFrame
        The second input DataFrame.
    df_2_name : str
        The name of the second DataFrame.
    alpha : float, optional
        Significance level for the test (default is 0.05).

    Returns
    -------
    pd.DataFrame
        A DataFrame with the results of the variance equality tests.
    """
    result = {}

    for col in df_1.columns:
        data1 = df_1[col].values
        data2 = df_2[col].values
        brown_forsythe_stat, brown_forsythe_p_value = stats.levene(data1, data2, center="median")

        result[col] = {
            f"var {df_1_name}": np.var(data1, ddof=1),
            f"var {df_2_name}": np.var(data2, ddof=1),
            f"var diff {df_1_name} minus {df_2_name}": np.var(data1, ddof=1) - np.var(data2, ddof=1),
            "Brown-Forsythe stat": brown_forsythe_stat,
            "Brown-Forsythe p value": brown_forsythe_p_value,
            "Brown-Forsythe equal variance": brown_forsythe_p_value > alpha,
        }

    result_df = pd.DataFrame(result).T
    result_df = result_df.astype({
        f"var {df_1_name}": "float",
        f"var {df_2_name}": "float",
        f"var diff {df_1_name} minus {df_2_name}": "float",
        "Brown-Forsythe stat": "float",
        "Brown-Forsythe p value": "float",
        "Brown-Forsythe equal variance": "bool",
    })
    float_cols = result_df.select_dtypes(include=['float']).columns
    result_df[float_cols] = result_df[float_cols].map(lambda x: f"{x:.4f}")
    result_df.index.name = "Feature"
    return result_df

In [59]:
datasets_variance = {
    "X train vs X test": (datasets["X_train"], "X train", datasets["X_test"], "X test"),
}

In [60]:
variance_test_results = {}
for name, (df1, df1_name, df2, df2_name) in datasets_variance.items():
    variance_test_results[name] = check_variance(df1, df1_name, df2, df2_name, alpha=ALPHA)

In [61]:
def check_means(df_1: pd.DataFrame, df_1_name: str, df_2: pd.DataFrame, df_2_name: str, alpha: float = 0.05) -> pd.DataFrame:
    """
    Check the equality of means between two DataFrames using:
    - Student's t-test

    Parameters
    ----------
    df_1 : pd.DataFrame
        The first input DataFrame.
    df_1_name : str
        The name of the first DataFrame.
    df_2 : pd.DataFrame
        The second input DataFrame.
    df_2_name : str
        The name of the second DataFrame.
    alpha : float, optional
        Significance level for the tests (default is 0.05).

    Returns
    -------
    pd.DataFrame
        A DataFrame with the results of the mean equality tests.
    """
    result = {}

    for col in df_1.columns:
        data1 = df_1[col].values
        data2 = df_2[col].values

        t_stat, t_p_value = stats.ttest_ind(data1, data2, equal_var=True)

        result[col] = {
            f"mean {df_1_name}": np.mean(data1),
            f"mean {df_2_name}": np.mean(data2),
            f"mean diff {df_1_name} minus {df_2_name}": np.mean(data1) - np.mean(data2),
            "t stat": t_stat,
            "t p value": t_p_value,
            "equal means": t_p_value > alpha,
        }

    result_df = pd.DataFrame(result).T
    result_df = result_df.astype({
        f"mean {df_1_name}": "float",
        f"mean {df_2_name}": "float",
        f"mean diff {df_1_name} minus {df_2_name}": "float",
        "t stat": "float",
        "t p value": "float",
        "equal means": "bool",
    })

    float_cols = result_df.select_dtypes(include=['float']).columns
    result_df[float_cols] = result_df[float_cols].map(lambda x: f"{x:.4f}")
    result_df.index.name = "Feature"
    return result_df

In [62]:
mean_test_results = {}
for name, (df1, df1_name, df2, df2_name) in datasets_variance.items():
    mean_test_results[name] = check_means(df1, df1_name, df2, df2_name, alpha=ALPHA)

In [None]:
def add_df_to_doc(doc: Document, df: pd.DataFrame, title: str):
    """
    Add a heading and a table to the Word document based on the DataFrame.

    Parameters
    ----------
    doc : Document
        The Word document object.
    df : pd.DataFrame
        The DataFrame to be added as a table.
        The index name of the DataFrame will be used as the first column header.
    title : str
        The title for the table.

    Returns
    -------
    None
    """
    doc.add_heading(title, level=2)
    table = doc.add_table(rows=df.shape[0] + 1, cols=df.shape[1] + 1)
    table.style = 'Table Grid'

    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = df.index.name
    
    run = hdr_cells[0].paragraphs[0].runs[0]
    run.font.bold = True

    for i, col_name in enumerate(df.columns):
        cell = hdr_cells[i + 1]
        cell.text = str(col_name)
        cell.paragraphs[0].runs[0].font.bold = True

    for i, row_idx in enumerate(df.index):
        row_cells = table.rows[i + 1].cells
        row_cells[0].text = str(row_idx)

        for j, col_name in enumerate(df.columns):
            val = df.loc[row_idx, col_name]
            row_cells[j + 1].text = str(val)

    doc.add_page_break()

tables_to_print = [
    (f"Normality Test Results (alpha={ALPHA}) - {name}", normality_results[name])
    for name in normality_results
]

tables_to_print += [
    (f"Variance Equality Test Results (alpha={ALPHA}) - {name}", variance_test_results[name])
    for name in variance_test_results
]

tables_to_print += [
    (f"Mean Equality Test Results (alpha={ALPHA}) - {name}", mean_test_results[name])
    for name in mean_test_results
]

doc = Document()
doc.add_heading('Statistics', 0)

for title, df in tables_to_print:
    add_df_to_doc(doc, df, title)

output_filename = 'Parameters.docx'
doc.save(output_filename)