In [None]:
import os 
import pandas as pd
import numpy as np

# Dataset root. The relative paths used further down ("Accelerometer",
# "Audio", "Location", "Labels") are resolved against this directory, and the
# per-user CSV files are written into it.
# Set the EXTRASENSORY_DIR environment variable to run the notebook on another
# machine; the original location remains the default. Any other absolute path
# in the notebook must point inside the same root.
os.chdir(os.environ.get("EXTRASENSORY_DIR", r"C:\Users\attou\OneDrive\Desktop\ExtraSensory Brut"))

# Features to extract
def extract_features(df, start, end):
    """Compute per-column summary statistics for one window of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric signal columns (the callers in this notebook pass the three
        accelerometer axes ``X``, ``Y``, ``Z``).
    start, end : int
        Positional row bounds of the window; ``end`` is exclusive.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per (statistic, input column)
        pair, named ``"<stat>_<col>"`` and ordered statistic-major
        (all ``mean_*`` columns first, then all ``std_*`` columns, ...).

    Raises
    ------
    ValueError
        If the requested slice selects no rows.
    """
    # Re-base the index to 0..n-1 so that idxmax/idxmin report the sample
    # offset *inside the window*. With the file-level index the two "latency"
    # features would simply grow with the window's position in the file.
    window = df.iloc[start:end].reset_index(drop=True)
    if window.empty:
        raise ValueError(
            f"empty window: rows [{start}:{end}) select no data "
            f"from a frame of length {len(df)}"
        )

    # Shared intermediates, computed once and reused by several statistics.
    lowest = window.min()
    highest = window.max()
    q25 = window.quantile(0.25)
    q75 = window.quantile(0.75)
    centered = window - window.mean()
    squared = window ** 2
    first_diff = window.diff()

    # NOTE: the insertion order of this dict defines the output column order.
    stats = {
        "mean": window.mean(),
        "std": window.std(),
        "min": lowest,
        "max": highest,
        "median": window.median(),
        "iqr": q75 - q25,
        "range": highest - lowest,
        "mad": centered.abs().mean(),  # mean absolute deviation around the mean
        "skew": window.skew(),
        "kurtosis": window.kurtosis(),
        "energy": squared.sum(),
        "rms": squared.mean().pow(0.5),
        "variance": window.var(),
        "max_latency": window.idxmax(),  # offset of the first maximum
        "min_latency": window.idxmin(),  # offset of the first minimum
        "third_moment": (centered ** 3).mean(),
        "fourth_moment": (centered ** 4).mean(),
        "mean_abs_first_diff": first_diff.abs().mean(),
        "mean_abs_second_diff": first_diff.diff().abs().mean(),
        "percentile_25": q25,
        "percentile_75": q75,
        # Shannon entropy (in bits) of the empirical distribution of values.
        "value_entropy": window.apply(lambda x: -sum(p * np.log2(p) for p in x.value_counts(normalize=True) if p > 0))
    }

    # Stack the per-statistic Series end to end, then lay them out as one row.
    features = pd.concat(list(stats.values()), axis=0).to_frame().T
    features.columns = [f"{stat}_{col}" for stat in stats for col in window.columns]
    return features

# Collect the per-user sub-folder names found under "Accelerometer".
# The folder is resolved against the current working directory (set by the
# os.chdir call earlier in this cell) instead of repeating the absolute path,
# so it always agrees with the relative paths used by the processing loops.
folder = os.path.abspath("Accelerometer")
# Sorted because os.listdir returns entries in arbitrary order.
dirs = sorted(d for d in os.listdir(folder) if os.path.isdir(os.path.join(folder, d)))

# Window size of 20 s

In [None]:
# Window length in samples: 20 s of signal at 40 Hz.
window_size = 20 * 40

In [13]:
# For every user folder: build windowed accelerometer features and per-file
# audio statistics, load the location and label tables, merge everything on
# `timestamp`, and write one CSV per user into the working directory.
# Relies on `dirs`, `extract_features` and `window_size` from earlier cells.
for user in dirs:
    print(user)

    # ------------------------------------------------------------------
    # Accelerometer: one feature row per complete, non-overlapping window
    # ------------------------------------------------------------------
    path = os.path.join("Accelerometer", user)
    # Sorted so the row order of the output does not depend on os.listdir.
    file_names = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    all_dfs = []

    for file in file_names:
        df = pd.read_csv(os.path.join(path, file), sep=' ', header=None)
        df = df.drop(columns=[0])  # first column is not used as a signal
        df.columns = ['X', 'Y', 'Z']

        file_dfs = [
            extract_features(df, start, start + window_size)
            for start in range(0, len(df) - window_size + 1, window_size)
        ]
        if not file_dfs:
            # File shorter than one window: pd.concat([]) would raise.
            continue

        df_features = pd.concat(file_dfs, ignore_index=True)
        # The part of the file name before the first dot is the merge key.
        df_features['timestamp'] = file.split('.')[0]
        all_dfs.append(df_features)

    if not all_dfs:
        # No file holds a complete window; nothing to merge for this user.
        print(f"no accelerometer file with at least {window_size} samples, user skipped")
        print("*"*80)
        continue

    data_ACC = pd.concat(all_dfs, ignore_index=True)
    print(data_ACC.shape)

    # ------------------------------------------------------------------
    # Audio: mean / std / min / max of every column, one row per file
    # ------------------------------------------------------------------
    path = os.path.join("Audio", user)
    file_names = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    all_features = []
    all_timestamps = []

    for file in file_names:
        file_path = os.path.join(path, file)
        df = pd.read_csv(file_path, header=None)
        df = df.dropna(axis=1)  # discard every column that contains a NaN

        X = df.values  # shape: (n_frames, n_coefficients)

        # Statistics: mean, std, min, max
        stats = [
            X.mean(axis=0),
            X.std(axis=0),
            X.min(axis=0),
            X.max(axis=0)
        ]
        features = np.concatenate(stats)

        all_features.append(features)
        all_timestamps.append(file.split('.')[0])

    # Column names, stat-major to match the concatenation order above.
    # The coefficient count comes from the collected features rather than
    # from the loop-leaked `X`, which is stale or undefined when a user has
    # no audio file. The fallback of 13 is the width stated in the original
    # comment on `X`.
    stat_names = ['mean', 'std', 'min', 'max']
    n_coeffs = len(all_features[0]) // len(stat_names) if all_features else 13
    columns = []
    for stat in stat_names:
        columns += [f"mfcc{i}_{stat}" for i in range(n_coeffs)]

    # Build the DataFrame
    data_audio = pd.DataFrame(all_features, columns=columns)
    # Explicit object dtype keeps the merge key string-typed even when empty.
    data_audio['timestamp'] = pd.Series(all_timestamps, dtype=object)
    print(data_audio.shape)

    # ------------------------------------------------------------------
    # Location
    # ------------------------------------------------------------------
    location_path = "Location"
    filename = user + ".absolute_locations.csv.gz"
    full_path = os.path.join(location_path, filename)

    data_loc = pd.read_csv(full_path, compression='gzip')
    print(data_loc.shape)

    # ------------------------------------------------------------------
    # Labels: drop phone-related columns and the label source
    # ------------------------------------------------------------------
    location_path = r"Labels"
    filename = user + ".original_labels.csv.gz"
    full_path = os.path.join(location_path, filename)

    data_label = pd.read_csv(full_path, compression='gzip')
    cols_to_drop = [col for col in data_label.columns if 'PHONE' in col.upper()]
    data_label = data_label.drop(columns=cols_to_drop)
    data_label.columns = data_label.columns.str.replace('^original_label:', '', regex=True)
    data_label = data_label.drop(columns="label_source")
    print(data_label.shape)

    # ------------------------------------------------------------------
    # Fusion: left joins keep every accelerometer window
    # ------------------------------------------------------------------
    data_merged = pd.merge(data_ACC, data_audio, on='timestamp', how='left')

    # Fixed-width integer key: plain `int` is platform-dependent.
    data_merged['timestamp'] = data_merged['timestamp'].astype('int64')
    data_loc['timestamp'] = data_loc['timestamp'].astype('int64')
    data_final = pd.merge(data_merged, data_loc, on='timestamp', how='left')

    data_final_2 = pd.merge(data_final, data_label, on='timestamp', how='left')

    # Name the file after the window length actually used (samples / 40 Hz),
    # so a cell run with another cell's window_size cannot mislabel its output.
    data_final_2.to_csv(f"{user}__{window_size / 40:g}s.csv")
    print("*"*80)

00EABED2-271D-49D8-B599-1D4A09240601
(2287, 67)
(2228, 53)
(2287, 3)
(2287, 105)
********************************************************************************
098A72A5-E3E5-4F54-A152-BBDA0DF7B694
(6808, 67)
(6735, 53)
(6813, 3)
(6813, 105)
********************************************************************************
0A986513-7828-4D53-AA1F-E02D6DF9561B
(3960, 67)
(3956, 53)
(3960, 3)
(3960, 105)
********************************************************************************
0BFC35E2-4817-4865-BFA7-764742302A2D
(3090, 67)
(3086, 53)
(3108, 3)
(3108, 105)
********************************************************************************
0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
(7513, 67)
(7510, 53)
(7521, 3)
(7521, 105)
********************************************************************************
1155FF54-63D3-4AB2-9863-8385D0BD0A13
(2685, 67)
(2674, 53)
(2685, 3)
(2685, 105)
********************************************************************************
11B5EC4D-4133-4289-B475-4E73

# Window size of 10 s

In [14]:
# Window length in samples: 10 s of signal at 40 Hz.
window_size = 10 * 40

In [15]:
# For every user folder: build windowed accelerometer features and per-file
# audio statistics, load the location and label tables, merge everything on
# `timestamp`, and write one CSV per user into the working directory.
# Relies on `dirs`, `extract_features` and `window_size` from earlier cells.
for user in dirs:
    print(user)

    # ------------------------------------------------------------------
    # Accelerometer: one feature row per complete, non-overlapping window
    # ------------------------------------------------------------------
    path = os.path.join("Accelerometer", user)
    # Sorted so the row order of the output does not depend on os.listdir.
    file_names = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    all_dfs = []

    for file in file_names:
        df = pd.read_csv(os.path.join(path, file), sep=' ', header=None)
        df = df.drop(columns=[0])  # first column is not used as a signal
        df.columns = ['X', 'Y', 'Z']

        file_dfs = [
            extract_features(df, start, start + window_size)
            for start in range(0, len(df) - window_size + 1, window_size)
        ]
        if not file_dfs:
            # File shorter than one window: pd.concat([]) would raise.
            continue

        df_features = pd.concat(file_dfs, ignore_index=True)
        # The part of the file name before the first dot is the merge key.
        df_features['timestamp'] = file.split('.')[0]
        all_dfs.append(df_features)

    if not all_dfs:
        # No file holds a complete window; nothing to merge for this user.
        print(f"no accelerometer file with at least {window_size} samples, user skipped")
        print("*"*80)
        continue

    data_ACC = pd.concat(all_dfs, ignore_index=True)
    print(data_ACC.shape)

    # ------------------------------------------------------------------
    # Audio: mean / std / min / max of every column, one row per file
    # ------------------------------------------------------------------
    path = os.path.join("Audio", user)
    file_names = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    all_features = []
    all_timestamps = []

    for file in file_names:
        file_path = os.path.join(path, file)
        df = pd.read_csv(file_path, header=None)
        df = df.dropna(axis=1)  # discard every column that contains a NaN

        X = df.values  # shape: (n_frames, n_coefficients)

        # Statistics: mean, std, min, max
        stats = [
            X.mean(axis=0),
            X.std(axis=0),
            X.min(axis=0),
            X.max(axis=0)
        ]
        features = np.concatenate(stats)

        all_features.append(features)
        all_timestamps.append(file.split('.')[0])

    # Column names, stat-major to match the concatenation order above.
    # The coefficient count comes from the collected features rather than
    # from the loop-leaked `X`, which is stale or undefined when a user has
    # no audio file. The fallback of 13 is the width stated in the original
    # comment on `X`.
    stat_names = ['mean', 'std', 'min', 'max']
    n_coeffs = len(all_features[0]) // len(stat_names) if all_features else 13
    columns = []
    for stat in stat_names:
        columns += [f"mfcc{i}_{stat}" for i in range(n_coeffs)]

    # Build the DataFrame
    data_audio = pd.DataFrame(all_features, columns=columns)
    # Explicit object dtype keeps the merge key string-typed even when empty.
    data_audio['timestamp'] = pd.Series(all_timestamps, dtype=object)
    print(data_audio.shape)

    # ------------------------------------------------------------------
    # Location
    # ------------------------------------------------------------------
    location_path = "Location"
    filename = user + ".absolute_locations.csv.gz"
    full_path = os.path.join(location_path, filename)

    data_loc = pd.read_csv(full_path, compression='gzip')
    print(data_loc.shape)

    # ------------------------------------------------------------------
    # Labels: drop phone-related columns and the label source
    # ------------------------------------------------------------------
    location_path = r"Labels"
    filename = user + ".original_labels.csv.gz"
    full_path = os.path.join(location_path, filename)

    data_label = pd.read_csv(full_path, compression='gzip')
    cols_to_drop = [col for col in data_label.columns if 'PHONE' in col.upper()]
    data_label = data_label.drop(columns=cols_to_drop)
    data_label.columns = data_label.columns.str.replace('^original_label:', '', regex=True)
    data_label = data_label.drop(columns="label_source")
    print(data_label.shape)

    # ------------------------------------------------------------------
    # Fusion: left joins keep every accelerometer window
    # ------------------------------------------------------------------
    data_merged = pd.merge(data_ACC, data_audio, on='timestamp', how='left')

    # Fixed-width integer key: plain `int` is platform-dependent.
    data_merged['timestamp'] = data_merged['timestamp'].astype('int64')
    data_loc['timestamp'] = data_loc['timestamp'].astype('int64')
    data_final = pd.merge(data_merged, data_loc, on='timestamp', how='left')

    data_final_2 = pd.merge(data_final, data_label, on='timestamp', how='left')

    # Name the file after the window length actually used (samples / 40 Hz),
    # so a cell run with another cell's window_size cannot mislabel its output.
    data_final_2.to_csv(f"{user}__{window_size / 40:g}s.csv")
    print("*"*80)

00EABED2-271D-49D8-B599-1D4A09240601
(4574, 67)
(2228, 53)
(2287, 3)
(2287, 105)
********************************************************************************
098A72A5-E3E5-4F54-A152-BBDA0DF7B694
(13616, 67)
(6735, 53)
(6813, 3)
(6813, 105)
********************************************************************************
0A986513-7828-4D53-AA1F-E02D6DF9561B
(7920, 67)
(3956, 53)
(3960, 3)
(3960, 105)
********************************************************************************
0BFC35E2-4817-4865-BFA7-764742302A2D
(6180, 67)
(3086, 53)
(3108, 3)
(3108, 105)
********************************************************************************
0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
(15026, 67)
(7510, 53)
(7521, 3)
(7521, 105)
********************************************************************************
1155FF54-63D3-4AB2-9863-8385D0BD0A13
(5370, 67)
(2674, 53)
(2685, 3)
(2685, 105)
********************************************************************************
11B5EC4D-4133-4289-B475-4E

# Window size of 5 s

In [16]:
# Window length in samples: 5 s of signal at 40 Hz.
window_size = 5 * 40

In [17]:
# For every user folder: build windowed accelerometer features and per-file
# audio statistics, load the location and label tables, merge everything on
# `timestamp`, and write one CSV per user into the working directory.
# Relies on `dirs`, `extract_features` and `window_size` from earlier cells.
for user in dirs:
    print(user)

    # ------------------------------------------------------------------
    # Accelerometer: one feature row per complete, non-overlapping window
    # ------------------------------------------------------------------
    path = os.path.join("Accelerometer", user)
    # Sorted so the row order of the output does not depend on os.listdir.
    file_names = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    all_dfs = []

    for file in file_names:
        df = pd.read_csv(os.path.join(path, file), sep=' ', header=None)
        df = df.drop(columns=[0])  # first column is not used as a signal
        df.columns = ['X', 'Y', 'Z']

        file_dfs = [
            extract_features(df, start, start + window_size)
            for start in range(0, len(df) - window_size + 1, window_size)
        ]
        if not file_dfs:
            # File shorter than one window: pd.concat([]) would raise.
            continue

        df_features = pd.concat(file_dfs, ignore_index=True)
        # The part of the file name before the first dot is the merge key.
        df_features['timestamp'] = file.split('.')[0]
        all_dfs.append(df_features)

    if not all_dfs:
        # No file holds a complete window; nothing to merge for this user.
        print(f"no accelerometer file with at least {window_size} samples, user skipped")
        print("*"*80)
        continue

    data_ACC = pd.concat(all_dfs, ignore_index=True)
    print(data_ACC.shape)

    # ------------------------------------------------------------------
    # Audio: mean / std / min / max of every column, one row per file
    # ------------------------------------------------------------------
    path = os.path.join("Audio", user)
    file_names = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))

    all_features = []
    all_timestamps = []

    for file in file_names:
        file_path = os.path.join(path, file)
        df = pd.read_csv(file_path, header=None)
        df = df.dropna(axis=1)  # discard every column that contains a NaN

        X = df.values  # shape: (n_frames, n_coefficients)

        # Statistics: mean, std, min, max
        stats = [
            X.mean(axis=0),
            X.std(axis=0),
            X.min(axis=0),
            X.max(axis=0)
        ]
        features = np.concatenate(stats)

        all_features.append(features)
        all_timestamps.append(file.split('.')[0])

    # Column names, stat-major to match the concatenation order above.
    # The coefficient count comes from the collected features rather than
    # from the loop-leaked `X`, which is stale or undefined when a user has
    # no audio file. The fallback of 13 is the width stated in the original
    # comment on `X`.
    stat_names = ['mean', 'std', 'min', 'max']
    n_coeffs = len(all_features[0]) // len(stat_names) if all_features else 13
    columns = []
    for stat in stat_names:
        columns += [f"mfcc{i}_{stat}" for i in range(n_coeffs)]

    # Build the DataFrame
    data_audio = pd.DataFrame(all_features, columns=columns)
    # Explicit object dtype keeps the merge key string-typed even when empty.
    data_audio['timestamp'] = pd.Series(all_timestamps, dtype=object)
    print(data_audio.shape)

    # ------------------------------------------------------------------
    # Location
    # ------------------------------------------------------------------
    location_path = "Location"
    filename = user + ".absolute_locations.csv.gz"
    full_path = os.path.join(location_path, filename)

    data_loc = pd.read_csv(full_path, compression='gzip')
    print(data_loc.shape)

    # ------------------------------------------------------------------
    # Labels: drop phone-related columns and the label source
    # ------------------------------------------------------------------
    location_path = r"Labels"
    filename = user + ".original_labels.csv.gz"
    full_path = os.path.join(location_path, filename)

    data_label = pd.read_csv(full_path, compression='gzip')
    cols_to_drop = [col for col in data_label.columns if 'PHONE' in col.upper()]
    data_label = data_label.drop(columns=cols_to_drop)
    data_label.columns = data_label.columns.str.replace('^original_label:', '', regex=True)
    data_label = data_label.drop(columns="label_source")
    print(data_label.shape)

    # ------------------------------------------------------------------
    # Fusion: left joins keep every accelerometer window
    # ------------------------------------------------------------------
    data_merged = pd.merge(data_ACC, data_audio, on='timestamp', how='left')

    # Fixed-width integer key: plain `int` is platform-dependent.
    data_merged['timestamp'] = data_merged['timestamp'].astype('int64')
    data_loc['timestamp'] = data_loc['timestamp'].astype('int64')
    data_final = pd.merge(data_merged, data_loc, on='timestamp', how='left')

    data_final_2 = pd.merge(data_final, data_label, on='timestamp', how='left')

    # Name the file after the window length actually used (samples / 40 Hz),
    # so a cell run with another cell's window_size cannot mislabel its output.
    data_final_2.to_csv(f"{user}__{window_size / 40:g}s.csv")
    print("*"*80)

00EABED2-271D-49D8-B599-1D4A09240601
(9148, 67)
(2228, 53)
(2287, 3)
(2287, 105)
********************************************************************************
098A72A5-E3E5-4F54-A152-BBDA0DF7B694
(27232, 67)
(6735, 53)
(6813, 3)
(6813, 105)
********************************************************************************
0A986513-7828-4D53-AA1F-E02D6DF9561B
(15840, 67)
(3956, 53)
(3960, 3)
(3960, 105)
********************************************************************************
0BFC35E2-4817-4865-BFA7-764742302A2D
(12360, 67)
(3086, 53)
(3108, 3)
(3108, 105)
********************************************************************************
0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
(30052, 67)
(7510, 53)
(7521, 3)
(7521, 105)
********************************************************************************
1155FF54-63D3-4AB2-9863-8385D0BD0A13
(10740, 67)
(2674, 53)
(2685, 3)
(2685, 105)
********************************************************************************
11B5EC4D-4133-4289-B475