# Preprocessing

In [1]:
import pandas as pd

df = pd.read_csv('synchronized_data_with_classes-H.csv', dtype={"timestamp": str, "Classe": str})
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
df['date'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S').dt.date  # Extraire la date sous format 'YYYY-MM-DD'
df.head()

Unnamed: 0,timestamp,x_left,y_left,z_left,x_right,y_right,z_right,Classe,date
0,2024-07-19 20:00:07,-241.647059,-100.941176,-79.843137,-247.571429,23.285714,57.428571,none,2024-07-19
1,2024-07-19 20:00:08,-245.2,-86.32,-76.24,-252.88,18.72,43.04,none,2024-07-19
2,2024-07-19 20:00:09,-248.235294,-80.470588,-71.45098,-244.24,16.24,76.56,none,2024-07-19
3,2024-07-19 20:00:10,-250.48,-89.28,-25.84,-238.0,17.76,92.0,none,2024-07-19
4,2024-07-19 20:00:11,-248.96,-94.16,-6.8,-235.918367,23.591837,93.469388,none,2024-07-19


In [29]:
## Calculate features

import numpy as np

# Calcul de la norme de l'accélération
def calculate_acc_norm(df):
    df['acc_norm'] = np.sqrt(df['x_left']**2 + df['y_left']**2 + df['z_left']**2)
    return df

# df = calculate_acc_norm(df)

# Fonction pour calculer les statistiques
def calculate_statistics(df, col):
    return {
        f'{col}_mean': df[col].mean(),
        f'{col}_std': df[col].std(),
        f'{col}_min': df[col].min(),
        f'{col}_max': df[col].max(),
        f'{col}_range': df[col].max() - df[col].min(),
        f'{col}_var': df[col].var(),
        f'{col}_skew': df[col].skew(),
    }

window_size = 2

# Fenêtrage et calcul des statistiques par fenêtre
def calculate_window_features(df, window_size):
    df = calculate_acc_norm(df)
    features = []
    for i in range(0, len(df) - window_size + 1, window_size):
        window = df.iloc[i:i + window_size]
        
        # Calcul des statistiques pour chaque colonne
        feature_x = calculate_statistics(window, 'x_left')
        feature_y = calculate_statistics(window, 'y_left')
        feature_z = calculate_statistics(window, 'z_left')
        feature_acc_norm = calculate_statistics(window, 'acc_norm')
        
        # Combiner toutes les caractéristiques
        feature = {}
        feature.update(feature_x)
        feature.update(feature_y)
        feature.update(feature_z)
        feature.update(feature_acc_norm)
        
        features.append(feature)
    
    return pd.DataFrame(features)

In [2]:
windowed_features = calculate_window_features(df, window_size)
windowed_features['timestamp'] = df['timestamp'].iloc[::window_size].reset_index(drop=True)
df_windowed = windowed_features.set_index('timestamp')

timestamp_to_class = df[['timestamp', 'Classe']].drop_duplicates().set_index('timestamp')['Classe'].to_dict()

df_windowed['Classe'] = df_windowed.index.map(timestamp_to_class)

# Afficher les premières lignes pour vérifier
df_windowed.head()

Unnamed: 0_level_0,x_left_mean,x_left_std,x_left_min,x_left_max,x_left_range,x_left_var,x_left_skew,y_left_mean,y_left_std,y_left_min,...,z_left_var,z_left_skew,acc_norm_mean,acc_norm_std,acc_norm_min,acc_norm_max,acc_norm_range,acc_norm_var,acc_norm_skew,Classe
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-07-19 20:00:07,-243.423529,2.512309,-245.2,-241.647059,3.552941,6.311696,,-93.630588,10.338733,-100.941176,...,6.491299,,272.341616,2.038987,270.899834,273.783397,2.883564,4.157469,,none
2024-07-19 20:00:09,-249.357647,1.587247,-250.48,-248.235294,2.244706,2.519352,,-84.875294,6.229195,-89.28,...,1040.180766,,268.863003,2.396795,267.168214,270.557793,3.389579,5.744624,,none
2024-07-19 20:00:11,-249.068235,0.153068,-249.176471,-248.96,0.216471,0.02343,,-92.844706,1.860107,-94.16,...,373.12692,,266.948529,0.976278,266.258196,267.638862,1.380666,0.953119,,none
2024-07-19 20:00:13,-255.921569,1.109187,-256.705882,-255.137255,1.568627,1.230296,,-67.411765,2.495671,-69.176471,...,44.290657,,268.141241,0.631639,267.694605,268.587877,0.893273,0.398968,,none
2024-07-19 20:00:15,-255.723137,0.052132,-255.76,-255.686275,0.073725,0.002718,,-69.69098,1.712585,-70.901961,...,134.634397,,267.183632,1.02221,266.46082,267.906443,1.445623,1.044913,,none


In [3]:
df_windowed['Classe'].value_counts()

Classe
none    193676
A        16620
P         3360
K         2700
S         2370
T         2280
H         1290
M          900
Name: count, dtype: int64

In [None]:
import numpy as np
import pandas as pd

# Convertir la colonne 'timestamp' en datetime et trier les données
df_windowed.reset_index(inplace=True)
df_windowed['timestamp'] = pd.to_datetime(df_windowed['timestamp'])
df_windowed = df_windowed.sort_values('timestamp')
df_windowed['date'] = df_windowed['timestamp'].dt.date

df_windowed.head()

In [None]:
df_train = df_windowed[df_windowed['date'] < pd.to_datetime('2024-07-22').date()]
df_test = df_windowed[df_windowed['date'] == pd.to_datetime('2024-07-22').date()]

df_train.drop(columns=['timestamp', 'date'], inplace=True)
df_test.drop(columns=['timestamp', 'date'], inplace=True)

In [7]:
df_test['Classe'].unique()

array(['none', 'T', 'S', 'H', 'A', 'P'], dtype=object)

In [15]:
left = pd.read_csv('2ia-2024-hackathon/left_accs.csv')
right = pd.read_csv('2ia-2024-hackathon/right_accs.csv')

left = left.iloc[::2]
right = right.iloc[::2]

left.columns = ['timestamp', 'x_left', 'y_left', 'z_left']
right.columns = ['timestamp', 'x_right', 'y_right', 'z_right']

left['timestamp'] = pd.to_datetime(left['timestamp'], unit='s')
right['timestamp'] = pd.to_datetime(right['timestamp'], unit='s')

Unnamed: 0,timestamp,x_left,y_left,z_left
0,2024-07-19 20:00:05.079999924,-224,-92,-60
2,2024-07-19 20:00:05.119999886,-240,-96,-72
4,2024-07-19 20:00:05.160000086,-248,-108,-80
6,2024-07-19 20:00:05.200000048,-236,-100,-64
8,2024-07-19 20:00:05.240000010,-244,-104,-68


In [20]:
df = pd.merge_asof(left.sort_values('timestamp'), 
                       right.sort_values('timestamp'), 
                       on='timestamp', 
                       direction='nearest')

# Vérifier les valeurs manquantes dues à des timestamps non alignés
df.interpolate(method='linear', inplace=True)
df['timestamp'] = df['timestamp'].dt.round('10ms')

df.head()


Unnamed: 0,timestamp,x_left,y_left,z_left,x_right,y_right,z_right
0,2024-07-19 20:00:05.080,-224,-92,-60,-240,20,56
1,2024-07-19 20:00:05.120,-240,-96,-72,-240,20,56
2,2024-07-19 20:00:05.160,-248,-108,-80,-240,20,56
3,2024-07-19 20:00:05.200,-236,-100,-64,-240,20,56
4,2024-07-19 20:00:05.240,-244,-104,-68,-240,20,56


In [24]:
from tqdm import tqdm

def calculate_window_features(df, window_size):
    features = []
    # Ajouter la barre de progression
    for i in tqdm(range(0, len(df) - window_size + 1, window_size), desc="Calcul des fenêtres"):
        window = df.iloc[i:i + window_size]
        feature = {}
        # Calcul des statistiques pour chaque axe et norme
        feature.update(calculate_statistics(window, 'x_left'))
        feature.update(calculate_statistics(window, 'y_left'))
        feature.update(calculate_statistics(window, 'z_left'))
        feature.update(calculate_statistics(window, 'x_right'))
        feature.update(calculate_statistics(window, 'y_right'))
        feature.update(calculate_statistics(window, 'z_right'))
        feature['start_time'] = window['timestamp'].iloc[0]
        features.append(feature)
    return pd.DataFrame(features)

In [23]:
# Application avec une taille de fenêtre
window_size = 25  # Exemple : 50 échantillons par fenêtre
features_df = calculate_window_features(df, window_size)

# Affichage des premières lignes des features
features_df.head()

Calcul des fenêtres: 100%|██████████| 655790/655790 [37:03<00:00, 294.92it/s]  


Unnamed: 0,x_left_mean,x_left_std,x_left_var,x_left_min,x_left_max,x_left_range,x_left_q1,x_left_q3,x_left_iqr,x_left_skew,...,z_right_max,z_right_range,z_right_q1,z_right_q3,z_right_iqr,z_right_skew,z_right_kurt,z_right_mad,start_time,end_time
0,-243.04,6.2482,39.04,-256,-224,32,-248.0,-240.0,8.0,0.920807,...,56,0,56.0,56.0,0.0,0.0,0.0,0.0,2024-07-19 20:00:05.080,2024-07-19 20:00:06.040
1,-243.36,3.946306,15.573333,-248,-236,12,-248.0,-240.0,8.0,0.503935,...,56,0,56.0,56.0,0.0,0.0,0.0,0.0,2024-07-19 20:00:06.080,2024-07-19 20:00:07.040
2,-241.76,10.650822,113.44,-264,-220,44,-248.0,-240.0,8.0,0.157713,...,80,44,56.0,56.0,0.0,0.646564,6.009315,3.6864,2024-07-19 20:00:07.080,2024-07-19 20:00:08.040
3,-245.6,12.543258,157.333333,-264,-212,52,-256.0,-240.0,16.0,0.914742,...,64,76,36.0,56.0,20.0,-1.372127,2.127885,14.272,2024-07-19 20:00:08.070,2024-07-19 20:00:09.030
4,-248.16,16.226727,263.306667,-284,-228,56,-256.0,-232.0,24.0,-0.664566,...,92,32,72.0,84.0,12.0,-0.518836,0.37222,5.696,2024-07-19 20:00:09.070,2024-07-19 20:00:10.020


In [24]:
features_df.to_csv('features.csv', index=False)

In [27]:
# rassembler les données par seconde

features_df['timestamp'] = features_df['start_time'].dt.round('1s')
features_df.drop(columns=['start_time', 'end_time'], inplace=True)

features_df.head()

Unnamed: 0,x_left_mean,x_left_std,x_left_var,x_left_min,x_left_max,x_left_range,x_left_q1,x_left_q3,x_left_iqr,x_left_skew,...,z_right_min,z_right_max,z_right_range,z_right_q1,z_right_q3,z_right_iqr,z_right_skew,z_right_kurt,z_right_mad,timestamp
0,-243.04,6.2482,39.04,-256,-224,32,-248.0,-240.0,8.0,0.920807,...,56,56,0,56.0,56.0,0.0,0.0,0.0,0.0,2024-07-19 20:00:05
1,-243.36,3.946306,15.573333,-248,-236,12,-248.0,-240.0,8.0,0.503935,...,56,56,0,56.0,56.0,0.0,0.0,0.0,0.0,2024-07-19 20:00:06
2,-241.76,10.650822,113.44,-264,-220,44,-248.0,-240.0,8.0,0.157713,...,36,80,44,56.0,56.0,0.0,0.646564,6.009315,3.6864,2024-07-19 20:00:07
3,-245.6,12.543258,157.333333,-264,-212,52,-256.0,-240.0,16.0,0.914742,...,-12,64,76,36.0,56.0,20.0,-1.372127,2.127885,14.272,2024-07-19 20:00:08
4,-248.16,16.226727,263.306667,-284,-228,56,-256.0,-232.0,24.0,-0.664566,...,60,92,32,72.0,84.0,12.0,-0.518836,0.37222,5.696,2024-07-19 20:00:09


In [28]:
features_df.to_csv('features_per_second.csv', index=False)

In [30]:
features = pd.read_csv('features_per_second.csv', index_col='timestamp')
features.head()

Unnamed: 0_level_0,x_left_mean,x_left_std,x_left_var,x_left_min,x_left_max,x_left_range,x_left_q1,x_left_q3,x_left_iqr,x_left_skew,...,z_right_var,z_right_min,z_right_max,z_right_range,z_right_q1,z_right_q3,z_right_iqr,z_right_skew,z_right_kurt,z_right_mad
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-07-19 20:00:05,-243.04,6.2482,39.04,-256,-224,32,-248.0,-240.0,8.0,0.920807,...,0.0,56,56,0,56.0,56.0,0.0,0.0,0.0,0.0
2024-07-19 20:00:06,-243.36,3.946306,15.573333,-248,-236,12,-248.0,-240.0,8.0,0.503935,...,0.0,56,56,0,56.0,56.0,0.0,0.0,0.0,0.0
2024-07-19 20:00:07,-241.76,10.650822,113.44,-264,-220,44,-248.0,-240.0,8.0,0.157713,...,53.76,36,80,44,56.0,56.0,0.0,0.646564,6.009315,3.6864
2024-07-19 20:00:08,-245.6,12.543258,157.333333,-264,-212,52,-256.0,-240.0,16.0,0.914742,...,360.64,-12,64,76,36.0,56.0,20.0,-1.372127,2.127885,14.272
2024-07-19 20:00:09,-248.16,16.226727,263.306667,-284,-228,56,-256.0,-232.0,24.0,-0.664566,...,52.16,60,92,32,72.0,84.0,12.0,-0.518836,0.37222,5.696


In [35]:
classes = pd.read_csv('synchronized_data_with_classes-H.csv', dtype={"timestamp": str, "Classe": str})
classes['timestamp'] = pd.to_datetime(classes['timestamp'], format='%Y-%m-%d %H:%M:%S')
features = features.reset_index()

# Convert 'timestamp' in features to datetime64[ns]
features['timestamp'] = pd.to_datetime(features['timestamp'])

# jointure entre features et classes sur timestamp
data = pd.merge(features, classes, on='timestamp', how='inner')

Unnamed: 0,index,timestamp,x_left_mean,x_left_std,x_left_var,x_left_min,x_left_max,x_left_range,x_left_q1,x_left_q3,...,z_right_skew,z_right_kurt,z_right_mad,x_left,y_left,z_left,x_right,y_right,z_right,Classe
0,2,2024-07-19 20:00:07,-241.76,10.650822,113.44,-264,-220,44,-248.0,-240.0,...,0.646564,6.009315,3.6864,-241.647059,-100.941176,-79.843137,-247.571429,23.285714,57.428571,none
1,3,2024-07-19 20:00:08,-245.6,12.543258,157.333333,-264,-212,52,-256.0,-240.0,...,-1.372127,2.127885,14.272,-245.2,-86.32,-76.24,-252.88,18.72,43.04,none
2,4,2024-07-19 20:00:09,-248.16,16.226727,263.306667,-284,-228,56,-256.0,-232.0,...,-0.518836,0.37222,5.696,-248.235294,-80.470588,-71.45098,-244.24,16.24,76.56,none
3,5,2024-07-19 20:00:10,-250.56,27.758002,770.506667,-292,-164,128,-264.0,-244.0,...,-0.345452,0.497594,5.632,-250.48,-89.28,-25.84,-238.0,17.76,92.0,none
4,6,2024-07-19 20:00:11,-249.44,11.423951,130.506667,-272,-216,56,-256.0,-248.0,...,0.498432,-0.348611,18.176,-248.96,-94.16,-6.8,-235.918367,23.591837,93.469388,none


In [36]:
data.tail()

Unnamed: 0,index,timestamp,x_left_mean,x_left_std,x_left_var,x_left_min,x_left_max,x_left_range,x_left_q1,x_left_q3,...,z_right_skew,z_right_kurt,z_right_mad,x_left,y_left,z_left,x_right,y_right,z_right,Classe
451743,451745,2024-07-24 23:59:55,144.8,1.632993,2.666667,144,148,4,144.0,144.0,...,1.296698,-0.353651,1.4592,144.784314,-28.54902,184.078431,-159.755102,27.346939,192.571429,none
451744,451746,2024-07-24 23:59:56,144.8,1.632993,2.666667,144,148,4,144.0,144.0,...,5.0,25.0,0.3072,145.490196,-28.235294,184.0,-159.836735,27.020408,192.489796,none
451745,451747,2024-07-24 23:59:57,145.76,2.026491,4.106667,144,148,4,144.0,148.0,...,2.490746,4.563421,0.8448,145.04,-28.72,184.08,-159.673469,27.428571,192.489796,none
451746,451748,2024-07-24 23:59:58,144.64,1.496663,2.24,144,148,4,144.0,144.0,...,1.597493,0.592885,1.28,145.28,-28.72,184.0,-159.836735,27.265306,192.979592,none
451747,451749,2024-07-24 23:59:59,145.28,1.904381,3.626667,144,148,4,144.0,148.0,...,1.597493,0.592885,1.28,145.176471,-28.784314,184.078431,-159.68,27.6,192.72,none


In [37]:
data.to_csv('data.csv', index=False)