In [1]:
from google.colab import drive
import os

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Root folder of the project on the mounted drive.
PATH = '/content/drive/My Drive/' + "SII_comportement_vol_ST"
# Folder holding the helper modules of the project.
CODE_PATH = f"{PATH}/code/utilities"
# Folder holding the data, with its train and test sub-folders.
DATA_PATH = f"{PATH}/data/Dataset_V3"
DATA_TRAIN_PATH = f"{DATA_PATH}/Train_clean_datasets"
DATA_TEST_PATH = f"{DATA_PATH}/Test_clean_datasets"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Make CODE_PATH the working directory before the imports below run --
# presumably so that the project-local `set_path` module can be found;
# confirm that the module actually lives in CODE_PATH.
os.chdir(CODE_PATH)
import pandas as pd
import numpy as np
import re

# NOTE(review): this import rebinds DATA_PATH, which the first cell already
# defined. DATA_TRAIN_PATH and DATA_TEST_PATH were derived from that earlier
# value, so confirm that both definitions point to the same folder.
from set_path import DATA_PATH

In [16]:
def retrieve_scenario_id(path, files):
  '''
  Retrieve the scenario id of each time series listed in a CSV file.

  For example, if a time series is named
  "TimeSeries_Scenario_00063_ScenarioInstanceListTakeOffAndTurnV2_id_0003.h5"
  then its scenario id is 63 (the first run of digits found in the name).

  Inputs:
  - path: folder where the file is located.
  - files: name of the CSV file listing the time series file names. It is
    read with the default pandas settings, so its first line is treated as
    a header; the names are taken from the first column.

  Outputs:
  - scenario_id: list of scenario ids (int), one per row, in file order.

  Raises:
  - IndexError: if a name contains no digits.

  The file is opened through its joined path, so the current working
  directory of the process is left untouched.
  '''
  file_names = pd.read_csv(os.path.join(path, files))

  # The scenario id is the first number appearing in each name
  # (int() drops the leading zeros).
  scenario_id = []
  for name in file_names.iloc[:, 0]:
    numbers = re.findall(r'\d+', name)
    scenario_id.append(int(numbers[0]))

  return scenario_id

In [17]:
# Scenario ids of the train and test sets, read from the file list of each folder.
scenario_id_train = retrieve_scenario_id(path=DATA_TRAIN_PATH, files="file_names.csv")
scenario_id_test = retrieve_scenario_id(path=DATA_TEST_PATH, files="file_names.csv")

In [18]:
def retrieve_all_labels(path_labels, file_labels):
  '''
  Retrieve the information of all labels (train and test).

  Each non-blank line of the file is split on commas into:
  scenario id, particular scenario name, scenario name, error name,
  error time. Blank lines are ignored.

  Inputs:
  - path_labels: folder where the file is located.
  - file_labels: name of the file containing the labels' information.

  Outputs:
  - Labels: dataframe of labels information.
    * Index:
      scenario_id: scenario id (int).
    * Columns:
      scenario_name_parti: name of the particular scenario,
      scenario_name: manoeuvre performed,
      error_name: error name (the string 'None' if no error),
      error_time: error time in sec, as written in the file,
      is_error: presence of an error (1 if error_name is not 'None', else 0).

  The file is opened through its joined path, so the current working
  directory of the process is left untouched.
  '''
  with open(os.path.join(path_labels, file_labels)) as f:
    GroundTruth = f.readlines()

  # All the information contained in GroundTruth, keyed by scenario id.
  Labels = {}
  for line in GroundTruth:
    # A blank line carries no label; skipping it avoids an IndexError below.
    if not line.strip():
      continue
    # Fields of the line, stripped of surrounding spaces.
    label_i = [field.strip() for field in line.split(',')]
    # If the scenario has an error, error=1, otherwise error=0.
    error = int(label_i[3] != 'None')
    Labels[int(label_i[0])] = label_i[1:] + [error]

  # Transform the dict to a dataframe with one row per scenario.
  dfLabels = pd.DataFrame(Labels).T
  dfLabels = dfLabels.rename(columns={0: "scenario_name_parti",
                                      1: "scenario_name",
                                      2: "error_name",
                                      3: "error_time",
                                      4: "is_error"})
  dfLabels.index.names = ['scenario_id']
  return dfLabels

In [19]:
# Labels of every scenario (train and test together).
all_labels = retrieve_all_labels(path_labels=DATA_PATH, file_labels='GroundTruth.txt')

In [20]:
# Select the label rows of each split, in the order of its scenario ids, and
# turn the index back into a regular column.
train_labels = all_labels.loc[scenario_id_train].reset_index()
test_labels = all_labels.loc[scenario_id_test].reset_index()