<center><h1>Projet Robotique</h1></center>
<center><h2>Préparation des données</h2></center>

# Importation des bibliothèques

In [15]:
import os 
import pandas as pd

from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Extraction des données

In [16]:
def get_index(string):
    """
    Locate the first alphabetic character at or after position 9 of a string.

    The first nine characters (indices 0-8) are never inspected.

    Args:
    string (str): Text to scan.

    Returns:
    int: Index of the first alphabetic character found from index 9 onwards,
    or -1 when no such character exists.
    """

    # Lazily walk the candidate positions and stop at the first match.
    alphabetic_positions = (
        position
        for position in range(9, len(string))
        if string[position].isalpha()
    )
    return next(alphabetic_positions, -1)

def extract_data(data_dir='OpportunityUCIDataset/dataset'):
    """
    Extracts data from the .dat files of a dataset folder into one DataFrame.

    Files whose name contains 'Drill' are skipped. Column names are read from
    the 'column_names.txt' file located in the same folder: every line that
    contains 'Column' contributes one name (the text starting at the position
    returned by get_index, up to the first ';').

    Args:
    data_dir (str): Folder holding the .dat files and column_names.txt.
        Defaults to 'OpportunityUCIDataset/dataset'.

    Returns:
    pandas.DataFrame: All selected files stacked row-wise with a fresh
    0..n-1 index. An empty DataFrame carrying only the column names is
    returned when no file matches.
    """

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order of the result is the same on every run and every machine.
    data_files = sorted(
        name for name in os.listdir(data_dir)
        if name.endswith('.dat') and 'Drill' not in name
    )

    # Read column names from column_names.txt file
    columns = []
    with open(os.path.join(data_dir, "column_names.txt"), 'r') as f:
        for line in f.read().splitlines():
            if 'Column' in line:
                columns.append(line[get_index(line):].split(";")[0])

    # Load each file once and concatenate a single time at the end; growing
    # a DataFrame with concat inside the loop re-copies all previous rows.
    frames = []
    for file in data_files:
        # Raw string: '\s' is not a valid escape in a normal string literal.
        proc_data = pd.read_table(os.path.join(data_dir, file), header=None, sep=r'\s+')
        proc_data.columns = columns
        frames.append(proc_data)

    if not frames:
        # pd.concat refuses an empty list; keep the "empty but labelled" result.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

In [17]:
# Load every non-Drill recording into a single DataFrame.
data_collection = extract_data()

# Last expression of the cell: display the combined frame.
data_collection

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,-95.0,1060.0,21.0,-257.0,997.0,-10.0,31.0,995.0,8.0,...,5898.0,2751.0,1480.0,0,0,0,0,0,0,0
1,33,-81.0,1057.0,10.0,-273.0,995.0,9.0,7.0,989.0,-9.0,...,5899.0,2748.0,1478.0,0,0,0,0,0,0,0
2,67,-92.0,1035.0,57.0,-294.0,985.0,0.0,26.0,998.0,-26.0,...,5900.0,2745.0,1476.0,0,0,0,0,0,0,0
3,100,-88.0,1032.0,92.0,-240.0,1014.0,32.0,7.0,995.0,-39.0,...,5902.0,2743.0,1475.0,0,0,0,0,0,0,0
4,133,-95.0,1027.0,87.0,-196.0,1002.0,38.0,38.0,1000.0,-96.0,...,5905.0,2740.0,1474.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644630,1386086,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644631,1386119,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644632,1386153,,,,,,,,,,...,,,,0,0,0,0,0,0,0
644633,1386186,,,,,,,,,,...,,,,0,0,0,0,0,0,0


In [18]:
# Count how many rows carry each raw value of the Locomotion column.
data_collection["Locomotion"].value_counts()


Locomotion
1    222737
0    147439
2    130996
4    118068
5     25395
Name: count, dtype: int64

# Nettoyage des données

In [19]:
def data_cleaning(data_collection, min_valid_fraction=0.9):
    """
    Performs data cleaning on the input DataFrame.

    Three steps are applied in order: rows with too many missing values are
    dropped, every column is converted to a numeric dtype, and the remaining
    gaps are filled by interpolation.

    Args:
    data_collection (pandas.DataFrame): Input DataFrame. It is not modified.
    min_valid_fraction (float): Minimum fraction of columns that must be
        non-NaN for a row to be kept. Defaults to 0.9.

    Returns:
    pandas.DataFrame: Cleaned DataFrame. The index labels of the surviving
    rows are kept as they were (the index is not reset).
    """

    # dropna works on ROWS by default: a row survives only when it has at
    # least `min_valid_values` non-NaN entries. No column is removed here.
    min_valid_values = int(len(data_collection.columns) * min_valid_fraction)
    cleaned = data_collection.dropna(thresh=min_valid_values)

    # Convert non-numeric data to NaN
    cleaned = cleaned.apply(pd.to_numeric, errors='coerce')

    # Fill NaN values by interpolating between neighbouring rows.
    # NOTE(review): with the default settings NaNs located before the first
    # valid value of a column are presumably left as NaN - confirm whether a
    # back-fill is wanted.
    return cleaned.interpolate()

In [20]:
# Run the cleaning step on the extracted data and bind the result to a new name.
cleaned_dataCollection = data_cleaning(data_collection)

# Last expression of the cell: display the cleaned frame.
cleaned_dataCollection

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,-95.0,1060.0,21.0,-257.0,997.0,-10.0,31.0,995.0,8.0,...,5898.0,2751.0,1480.0,0,0,0,0,0,0,0
1,33,-81.0,1057.0,10.0,-273.0,995.0,9.0,7.0,989.0,-9.0,...,5899.0,2748.0,1478.0,0,0,0,0,0,0,0
2,67,-92.0,1035.0,57.0,-294.0,985.0,0.0,26.0,998.0,-26.0,...,5900.0,2745.0,1476.0,0,0,0,0,0,0,0
3,100,-88.0,1032.0,92.0,-240.0,1014.0,32.0,7.0,995.0,-39.0,...,5902.0,2743.0,1475.0,0,0,0,0,0,0,0
4,133,-95.0,1027.0,87.0,-196.0,1002.0,38.0,38.0,1000.0,-96.0,...,5905.0,2740.0,1474.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643791,1358120,157.0,994.0,47.0,121.0,792.0,250.0,323.0,901.0,316.0,...,5176.0,2517.0,1824.0,0,0,0,0,0,0,0
643792,1358153,157.0,994.0,47.0,98.0,599.0,296.0,323.0,931.0,326.0,...,5171.0,2568.0,1841.0,0,0,0,0,0,0,0
643793,1358186,157.0,994.0,47.0,-67.0,757.0,105.0,335.0,940.0,326.0,...,5168.0,2620.0,1858.0,0,0,0,0,0,0,0
643794,1358220,157.0,994.0,47.0,-183.0,1035.0,280.0,355.0,941.0,312.0,...,5164.0,2672.0,1874.0,0,0,0,0,0,0,0


# Encodage des labels

In [21]:
def reset_label(data_collection, legend_path='OpportunityUCIDataset/dataset/label_legend.txt'):
    """
    Re-encodes the label columns of a DataFrame using the label legend file.

    For every track listed in the legend, the values of the column with the
    same name are replaced by compact codes:
    - 'Locomotion': 1 -> 1, 2 -> 2, 4 -> 3, 5 -> 4.
    - any other track: the legend's unique indices are numbered 0, 1, 2, ...
      in the order in which they appear in the legend.
    Values that are not listed for a track are left unchanged.

    Args:
    data_collection (pandas.DataFrame): Input DataFrame. It is not modified,
        so the function can safely be called again on the same input.
    legend_path (str): Path of the legend file. Defaults to
        'OpportunityUCIDataset/dataset/label_legend.txt'.

    Returns:
    pandas.DataFrame: Copy of the input with re-encoded label columns.

    Raises:
    KeyError: If a track of the legend has no matching column.
    """

    # The separator is more than one character long, so pandas treats it as a
    # regular expression, which only the python engine supports.
    labels = pd.read_csv(legend_path, sep='   -   ', header=0, engine='python')

    # Build one {old value: new code} mapping per track
    track_dict = {}
    for track in labels['Track name'].unique():
        if track == 'Locomotion':
            # Special case: close the gap at 3 and leave every other value as is.
            track_dict[track] = {1: 1, 2: 2, 4: 3, 5: 4}
        else:
            indices = labels.loc[labels['Track name'] == track, 'Unique index'].tolist()
            # dict.fromkeys drops repeated indices while keeping legend order.
            # NOTE(review): the first legend entry of these tracks is encoded
            # as 0. If 0 already marks unlabeled rows in the data, both become
            # indistinguishable - confirm that this is intended.
            track_dict[track] = {key: code for code, key in enumerate(dict.fromkeys(indices))}

    # Work on a copy: writing into the caller's frame would make a second
    # call re-map values that are already encoded (e.g. Locomotion 4 -> 3).
    encoded = data_collection.copy()

    for track, mapping in track_dict.items():
        # Masks are computed on the untouched input so that a freshly written
        # code can never be mistaken for an old value still to be replaced.
        original_values = data_collection[track]
        for old_value, new_code in mapping.items():
            encoded.loc[original_values == old_value, track] = new_code

    return encoded


In [22]:
# Re-encode the label columns of the cleaned frame.
df = reset_label(data_collection=cleaned_dataCollection)

# Last expression of the cell: display the result.
df

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,0,-95.0,1060.0,21.0,-257.0,997.0,-10.0,31.0,995.0,8.0,...,5898.0,2751.0,1480.0,0,0,0,0,0,0,0
1,33,-81.0,1057.0,10.0,-273.0,995.0,9.0,7.0,989.0,-9.0,...,5899.0,2748.0,1478.0,0,0,0,0,0,0,0
2,67,-92.0,1035.0,57.0,-294.0,985.0,0.0,26.0,998.0,-26.0,...,5900.0,2745.0,1476.0,0,0,0,0,0,0,0
3,100,-88.0,1032.0,92.0,-240.0,1014.0,32.0,7.0,995.0,-39.0,...,5902.0,2743.0,1475.0,0,0,0,0,0,0,0
4,133,-95.0,1027.0,87.0,-196.0,1002.0,38.0,38.0,1000.0,-96.0,...,5905.0,2740.0,1474.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643791,1358120,157.0,994.0,47.0,121.0,792.0,250.0,323.0,901.0,316.0,...,5176.0,2517.0,1824.0,0,0,0,0,0,0,0
643792,1358153,157.0,994.0,47.0,98.0,599.0,296.0,323.0,931.0,326.0,...,5171.0,2568.0,1841.0,0,0,0,0,0,0,0
643793,1358186,157.0,994.0,47.0,-67.0,757.0,105.0,335.0,940.0,326.0,...,5168.0,2620.0,1858.0,0,0,0,0,0,0,0
643794,1358220,157.0,994.0,47.0,-183.0,1035.0,280.0,355.0,941.0,312.0,...,5164.0,2672.0,1874.0,0,0,0,0,0,0,0


# Normalisation des données 

In [23]:
# Standardize the feature columns with a StandardScaler
def normalize_data(df, n_label_columns=7):
    """
    Normalize the feature columns of the DataFrame using StandardScaler.

    The last `n_label_columns` columns are treated as labels and are returned
    unchanged; every column before them is passed through
    StandardScaler.fit_transform.

    Args:
    df (pandas.DataFrame): Input DataFrame. It is not modified.
    n_label_columns (int): Number of trailing columns to leave untouched.
        Defaults to 7.

    Returns:
    pandas.DataFrame: Copy of the input with normalized feature columns.
    """

    # Work on a copy so that frames displayed by earlier cells stay valid.
    normalized = df.copy()

    # Explicit end position instead of a negative slice: `[:-0]` would select
    # nothing, and a count larger than the frame must not wrap around.
    n_features = max(len(normalized.columns) - n_label_columns, 0)
    feature_columns = normalized.columns[:n_features]

    if len(feature_columns) == 0:
        # Nothing to scale.
        return normalized

    scaler = StandardScaler()
    normalized[feature_columns] = scaler.fit_transform(normalized[feature_columns])
    return normalized

# Normalize the frame; `df` is rebound to the value returned by normalize_data.
df = normalize_data(df)

# Last expression of the cell: display the normalized frame.
df

Unnamed: 0,MILLISEC,Accelerometer RKN^ accX,Accelerometer RKN^ accY,Accelerometer RKN^ accZ,Accelerometer HIP accX,Accelerometer HIP accY,Accelerometer HIP accZ,Accelerometer LUA^ accX,Accelerometer LUA^ accY,Accelerometer LUA^ accZ,...,LOCATION TAG4 X,LOCATION TAG4 Y,LOCATION TAG4 Z,Locomotion,HL_Activity,LL_Left_Arm,LL_Left_Arm_Object,LL_Right_Arm,LL_Right_Arm_Object,ML_Both_Arms
0,-1.585284,-0.435310,0.413306,-0.780856,-0.292334,0.407904,-0.532002,-0.395708,0.681373,-0.924182,...,0.193924,0.668631,0.232962,0,0,0,0,0,0,0
1,-1.585185,-0.391732,0.408345,-0.804205,-0.351592,0.399306,-0.466147,-0.469487,0.659386,-0.974729,...,0.194510,0.665878,0.228794,0,0,0,0,0,0,0
2,-1.585083,-0.425972,0.371963,-0.704443,-0.429368,0.356318,-0.497341,-0.411079,0.692366,-1.025276,...,0.195096,0.663126,0.224626,0,0,0,0,0,0,0
3,-1.584985,-0.413521,0.367002,-0.630153,-0.229373,0.480984,-0.386428,-0.469487,0.681373,-1.063930,...,0.196269,0.661291,0.222543,0,0,0,0,0,0,0
4,-1.584886,-0.435310,0.358733,-0.640765,-0.066413,0.429398,-0.365632,-0.374190,0.699695,-1.233412,...,0.198027,0.658538,0.220459,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643791,2.481642,0.349083,0.304160,-0.725669,1.107637,-0.473354,0.369170,0.501927,0.336915,-0.008384,...,-0.229309,0.453943,0.949817,0,0,0,0,0,0,0
643792,2.481741,0.349083,0.304160,-0.725669,1.022453,-1.303025,0.528608,0.501927,0.446849,0.021349,...,-0.232240,0.500734,0.985243,0,0,0,0,0,0,0
643793,2.481840,0.349083,0.304160,-0.725669,0.411355,-0.623812,-0.133406,0.538817,0.479829,0.021349,...,-0.233999,0.548442,1.020669,0,0,0,0,0,0,0
643794,2.481941,0.349083,0.304160,-0.725669,-0.018266,0.571259,0.473152,0.600298,0.483493,-0.020278,...,-0.236344,0.596150,1.054011,0,0,0,0,0,0,0


# Sauvegarde du jeu de données

In [24]:
# Write the preprocessed frame to 'preprocessed_data.csv'; the row index is not written.
df.to_csv('preprocessed_data.csv', index=False)

In [25]:
# Count how many rows carry each Locomotion value in the final frame.
df["Locomotion"].value_counts()

Locomotion
1    215037
0    128749
2    125809
3    112786
4     23667
Name: count, dtype: int64