In [1]:
import numpy as np
import scipy.stats as ss
import math
from scipy.fft import fft, fftfreq
from scipy import signal
import copy
import matplotlib.pyplot as plt
import json
import random

In [2]:
def formatSLE(WDEjson):
    '''
    Flatten one raw WDE stride record into the SLE stride layout.

    Returned dict (keys are inserted in exactly this order):
    {
        "target": value of WDEjson["stride_plength"],
        "Acc_X": WDEjson["sensors"]["acc"]["acc_x"],
        "Acc_Y": WDEjson["sensors"]["acc"]["acc_y"],
        "Acc_Z": WDEjson["sensors"]["acc"]["acc_z"],
        "Gyr_X": WDEjson["sensors"]["gyro"]["gyr_x"],
        "Gyr_Y": WDEjson["sensors"]["gyro"]["gyr_y"],
        "Gyr_Z": WDEjson["sensors"]["gyro"]["gyr_z"],
        "SensorTimestamp": WDEjson["sensors"]["timestamp"]
    }
    The sensor streams are referenced, not copied.
    Raises KeyError when one of the expected keys is missing.
    '''
    # (output key, sensor group, axis key inside the group)
    stream_map = (
        ("Acc_X", "acc", "acc_x"),
        ("Acc_Y", "acc", "acc_y"),
        ("Acc_Z", "acc", "acc_z"),
        ("Gyr_X", "gyro", "gyr_x"),
        ("Gyr_Y", "gyro", "gyr_y"),
        ("Gyr_Z", "gyro", "gyr_z"),
    )

    SLEjson = {"target": WDEjson["stride_plength"]}
    for out_key, group, axis in stream_map:
        SLEjson[out_key] = WDEjson["sensors"][group][axis]
    SLEjson["SensorTimestamp"] = WDEjson["sensors"]["timestamp"]
    return SLEjson

In [3]:
def importWDE(data_path: str="C:/Users/aliba/OneDrive/Desktop/UNIVERSITA/TESI/DATASET/WalkingDistanceEstimation-master/dataset/"):
    """
    Load the WDE recordings and group the accepted strides by carrying mode.

    Every file `data_path + name + ".txt"` is read line by line; each line is
    parsed as one JSON stride. A stride is kept when its "stride_plength" is in
    [1, 2] and it has between 50 and 300 timestamps; otherwise it is counted as
    an eliminated outlier. "calling" strides of recording
    PDR_Raw_2019-03-31-10-33-25 are dropped silently (neither kept nor counted).

    Returns:
        DATASET: dict mode -> list of strides in formatSLE layout.
        stop_lists: dict mode -> cumulative stride counts, starting with 0 and
            extended by one entry for every file that contributed strides of
            that mode.
    """
    modes = ('armhand', 'pocket', 'calling', 'swing', 'handheld')
    recordings = [
        "PDR_Raw_2019-03-20-09-10-12",
        "PDR_Raw_2019-03-20-09-21-02",
        "PDR_Raw_2019-03-20-09-29-55",
        "PDR_Raw_2019-03-21-08-32-39",
        "PDR_Raw_2019-03-21-09-07-51",
        "PDR_Raw_2019-03-21-11-57-56",
        "PDR_Raw_2019-03-24-11-12-21",
        "PDR_Raw_2019-03-28-11-50-11",
        "PDR_Raw_2019-03-29-07-37-22",
        "PDR_Raw_2019-03-29-08-30-54",
        "PDR_Raw_2019-03-30-11-29-16",
        "PDR_Raw_2019-03-31-01-23-59",
        "PDR_Raw_2019-03-31-10-04-54",
        "PDR_Raw_2019-03-31-10-33-25",
        "PDR_Raw_2019-03-31-12-03-05",
        "PDR_Raw_2019-03-31-12-29-51",
        "PDR_Raw_2019-04-01-10-45-07",
        "PDR_Raw_2019-04-02-08-44-50",
    ]

    # per-mode stride lists and per-mode cumulative split indices
    DATASET = {m: [] for m in modes}
    stop_lists = {m: [0] for m in modes}

    for filename in recordings:
        kept = dict.fromkeys(modes, 0)
        dropped = dict.fromkeys(modes, 0)

        with open(data_path + filename + ".txt", 'r') as handle:
            for raw_line in handle:
                stride = json.loads(raw_line)

                # outliers are never loaded
                length_ok = is_in_range(stride["stride_plength"], [1, 2])
                if not (length_ok and is_in_range(len(stride["sensors"]["timestamp"]), [50, 300])):
                    dropped[stride["mode"]] += 1
                    continue

                # the "calling" strides of this one recording are excluded
                if filename == "PDR_Raw_2019-03-31-10-33-25" and stride["mode"] == "calling":
                    continue

                DATASET[stride["mode"]].append(formatSLE(stride))
                kept[stride["mode"]] += 1

        print(filename, kept)
        for mode, n_kept in kept.items():
            if n_kept != 0:
                stop_lists[mode].append(stop_lists[mode][-1] + n_kept)
        print("Outliers eliminati\t", dropped, "\n")

    print("\nIn totale=>", end=" ")
    total = 0
    for mode, strides in DATASET.items():
        print(f"{mode}:{len(strides)}", end=", ")
        total += len(strides)
    print(f"-->{total} stride")

    return DATASET, stop_lists

In [4]:
def filtWDE(DATASET):
    """
    Low-pass filter every stride of every mode, in place.

    Each list DATASET[mode] is replaced by a new list holding the result of
    SLE_buttfilter for every stride. A "#" is printed when the first and when
    the last stride of a mode is processed. Always returns 0.
    """
    print("Filtering:", end="")
    for mode, stride_list in DATASET.items():
        last_position = len(stride_list) - 1
        filtered = []
        for position, stride in enumerate(stride_list):
            # progress mark at both ends of the list
            if position in (0, last_position):
                print("#", end="")
            filtered.append(SLE_buttfilter(stride))
        DATASET[mode] = filtered
    print("\nDone!\n")
    return 0

In [5]:
def f_ext_WDE(DATASET):
    """
    Extract the feature vector and the target of every stride, per mode.

    DATASET maps a mode to a list of strides. The result maps each of the five
    known modes to {'feature': [...], 'target': [...]}, where 'feature' holds
    feature_extraction(stride) and 'target' holds stride['target'].
    Progress marks ("#") are printed while a mode is being processed.
    """
    modes = ('armhand', 'pocket', 'calling', 'swing', 'handheld')
    Feature_DATASET = {m: {'feature': [], 'target': []} for m in modes}

    for mode, stride_list in DATASET.items():
        print(f"\nExtracting {mode}:", end="")
        # ten evenly spaced positions at which a progress mark is printed
        ticks = np.linspace(0, len(stride_list) - 1, num=10, endpoint=True, dtype=int).tolist()
        for done, stride in enumerate(stride_list, start=1):
            Feature_DATASET[mode]['feature'].append(feature_extraction(stride))
            Feature_DATASET[mode]['target'].append(stride['target'])
            if done in ticks:
                print("#", end="")
    return Feature_DATASET

In [6]:
def makeeqWDE(Feature_DATASET,n_train: int=500,n_test: int=50):
    """
    Build a balanced train/test split: same number of strides for every mode.

    For each mode the first `n_train` entries go to the training set and the
    last `n_test` entries go to the test set.

    Args:
        Feature_DATASET: dict mode -> {'feature': list, 'target': list}.
            Only the five known modes are accepted (others raise KeyError).
        n_train: number of leading entries used for training.
        n_test: number of trailing entries used for testing; 0 (or less)
            yields an empty test set.

    Returns:
        (DATASET_train, DATASET_test), both dict mode -> {'feature', 'target'}.

    Note: the two slices overlap when a mode has fewer than n_train + n_test
    entries.
    """
    modes = ('armhand', 'pocket', 'calling', 'swing', 'handheld')
    DATASET_train = {m: {'feature': [], 'target': []} for m in modes}
    DATASET_test = {m: {'feature': [], 'target': []} for m in modes}

    for mode, diz in Feature_DATASET.items():
        features = diz['feature']
        targets = diz['target']

        DATASET_train[mode]['feature'] = features[0:n_train]
        DATASET_train[mode]['target'] = targets[0:n_train]

        # An explicit start index is used instead of [-n_test:], because
        # lst[-0:] is the whole list and would turn n_test=0 into "everything".
        DATASET_test[mode]['feature'] = features[max(len(features) - n_test, 0):]
        DATASET_test[mode]['target'] = targets[max(len(targets) - n_test, 0):]
    return DATASET_train, DATASET_test

In [7]:
def trueeqWDE(Feature_DATASET,stop_lists,n_train: int=500,n_test: int=50):
    """
    Build a balanced train/test split per mode that keeps files separated.

    For every mode, one file is chosen as test source and `n_test` entries are
    taken from its end; training entries are then collected from the other
    files in round-robin chunks until `n_train` entries are gathered.

    Args:
        Feature_DATASET: dict mode -> {'feature': list, 'target': list}.
        stop_lists: dict mode -> list of cumulative stop indices; consecutive
            differences are used as per-file lengths (presumably the second
            value returned by importWDE - verify against caller).
        n_train: number of training entries wanted per mode.
        n_test: number of test entries wanted per mode.

    Returns:
        (DATASET_train, DATASET_test), both dict mode -> {'feature', 'target'}.
    """
    #let's make it balanced
    #train: the first 500 elements of every mode, test: the last 50 of every mode
    DATASET_train={'armhand':{'feature':[],'target':[]}, 'pocket':{'feature':[],'target':[]}, 'calling':{'feature':[],'target':[]}, 'swing':{'feature':[],'target':[]}, 'handheld':{'feature':[],'target':[]}}
    DATASET_test={'armhand':{'feature':[],'target':[]}, 'pocket':{'feature':[],'target':[]}, 'calling':{'feature':[],'target':[]}, 'swing':{'feature':[],'target':[]}, 'handheld':{'feature':[],'target':[]}}

    for mode, list_stop in stop_lists.items():
        
        #list with the length of every file, in order
        l_s=[list_stop[i+1]-list_stop[i] for i in range(len(list_stop)-1)]
        
        #take the smallest file and use it for the test set
        #index of shortest file
        # If the shortest file cannot supply n_test entries it is dropped from l_s
        # (only one file is dropped, even if several are shorter than n_test).
        # NOTE(review): after this `del`, positions in l_s no longer line up with
        # list_stop for every file at or after the removed position, yet both are
        # indexed with the same ind_ts / n_file below - confirm this is intended.
        if min(l_s)<n_test:
            del l_s[l_s.index(min(l_s))]

        ind_ts=l_s.index(min(l_s))

        #print("test",ind_ts,list_stop[ind_ts+1]-n_test,list_stop[ind_ts+1])
        # test set: the n_test entries that end at the stop index of file ind_ts
        DATASET_test[mode]['feature']=Feature_DATASET[mode]['feature'][list_stop[ind_ts+1]-n_test:list_stop[ind_ts+1]] 
        DATASET_test[mode]['target'] =Feature_DATASET[mode]['target'][list_stop[ind_ts+1]-n_test:list_stop[ind_ts+1]]
        
        #for the larger files, divide n_train by their number
        # NOTE(review): raises ZeroDivisionError when l_s holds a single file.
        split_ntrain=int(n_train/(len(l_s)-1))
        
        # n_file: file currently read; flag: offset inside a file at which reading starts
        n_file=0
        flag=0
        while len(DATASET_train[mode]['feature'])<n_train:
            
            count=flag
            # never take training entries from the test file
            # NOTE(review): if ind_ts is the last position of l_s, n_file becomes
            # len(l_s) here and l_s[n_file] below would raise IndexError - verify.
            if n_file==ind_ts:
                n_file+=1
                
            # take up to split_ntrain entries of this file, starting at offset flag,
            # stopping at the end of the file or once n_train entries are collected
            # NOTE(review): the end-of-file test is an equality (count!=l_s[n_file]);
            # when flag is already larger than the file length it does not stop the loop.
            while (count<split_ntrain+flag and count!=l_s[n_file] and len(DATASET_train[mode]['feature'])!=n_train):
                
                #print(mode,count,n_file,count+list_stop[n_file])
                DATASET_train[mode]['feature'].append(Feature_DATASET[mode]['feature'][count+list_stop[n_file]]) 
                DATASET_train[mode]['target'].append(Feature_DATASET[mode]['target'][count+list_stop[n_file]])
                
                count+=1
            
            # next file, wrapping around
            n_file+=1
            n_file=n_file%(len(l_s))
            # after a full round, restart from the largest file and read from offset split_ntrain
            # NOTE(review): flag is never increased beyond split_ntrain, so later rounds
            # re-read the same offsets; the outer loop does not end if no entry is added.
            if n_file==0:
                n_file=l_s.index(max(l_s))
                flag=split_ntrain

        
    return DATASET_train, DATASET_test

In [8]:
def full_regr_dataset(DATASET_train,DATASET_test):
    """
    Merge the per-mode splits into one mode-agnostic regression dataset.

    Features and targets of every mode are concatenated in the key order of
    DATASET_train; DATASET_test is read with the same keys.

    Returns:
        (Regr_Dataset_train, Regr_Dataset_test), each {"feature": list, "target": list}.
    """
    train_features, train_targets = [], []
    test_features, test_targets = [], []

    for mode in DATASET_train:
        train_features.extend(DATASET_train[mode]["feature"])
        train_targets.extend(DATASET_train[mode]["target"])

        test_features.extend(DATASET_test[mode]["feature"])
        test_targets.extend(DATASET_test[mode]["target"])

    Regr_Dataset_train = {"feature": train_features, "target": train_targets}
    Regr_Dataset_test = {"feature": test_features, "target": test_targets}
    return Regr_Dataset_train, Regr_Dataset_test

In [9]:
def single_regr_dataset(Feature_DATASET,label: str="handheld"):
    """
    Build a single-mode regression split that holds out the last subject.

    The last `ind` entries of Feature_DATASET[label] become the test set and
    everything before them the training set, where `ind` is the hard-coded
    number of strides of the last subject for that mode.

    Args:
        Feature_DATASET: dict mode -> {'feature': list, 'target': list}.
        label: one of "handheld", "pocket", "calling", "swing", "armhand".

    Returns:
        (Sreg_Dataset_train, Sreg_Dataset_test), each {"feature", "target"}.

    Raises:
        ValueError: if `label` is not one of the known modes (previously this
            surfaced as an UnboundLocalError on `ind`).
    """
    # Not strictly needed: one can simply take DATASET[mode] at the desired
    # stage of the pipeline and store it wherever it is wanted.
    # Number of elements of the LAST subject for each mode (e.g. the last
    # file has 664 handheld strides, which are isolated as the test set).
    last_subject_sizes = {
        "handheld": 664,
        "pocket": 180,
        "calling": 123,
        "swing": 232,
        "armhand": 70,
    }
    if label not in last_subject_sizes:
        raise ValueError(
            f"Unknown label {label!r}; expected one of {sorted(last_subject_sizes)}"
        )
    ind = last_subject_sizes[label]

    features = Feature_DATASET[label]['feature']
    targets = Feature_DATASET[label]['target']

    Sreg_Dataset_train = {"feature": features[0:-ind], "target": targets[0:-ind]}
    Sreg_Dataset_test = {"feature": features[-ind:], "target": targets[-ind:]}

    return Sreg_Dataset_train, Sreg_Dataset_test

In [10]:
def classification_dataset(DATASET_train,DATASET_test):
    """
    Turn the per-mode splits into a classification dataset.

    Feature vectors of all modes are concatenated (in the key order of
    DATASET_train) and the mode name itself is used as the class label of
    every feature vector.

    Returns:
        (feature_train, target_train, feature_test, target_test) as lists.
    """
    classification_feature_train, classification_target_train = [], []
    classification_feature_test, classification_target_test = [], []

    for mode in DATASET_train:
        train_features = DATASET_train[mode]['feature']
        classification_feature_train.extend(train_features)
        classification_target_train.extend([mode] * len(train_features))

        test_features = DATASET_test[mode]['feature']
        classification_feature_test.extend(test_features)
        classification_target_test.extend([mode] * len(test_features))

    return (
        classification_feature_train,
        classification_target_train,
        classification_feature_test,
        classification_target_test,
    )

In [11]:
def is_in_range(num,rng):
    """Return True when rng[0] <= num <= rng[1] (both bounds inclusive)."""
    lower, upper = rng[0], rng[1]
    return lower <= num <= upper
def del_outliers(Stride_list,mode_list: list=[] ,target_rng: list=[1,2],time_rng: list=[80,300]):
    """
    Drop the strides whose target or number of samples is out of range.

    A stride is kept when stride["target"] lies in `target_rng` and
    len(stride["Acc_X"]) lies in `time_rng` (bounds inclusive).

    Args:
        Stride_list: list of stride dicts.
        mode_list: optional list of labels parallel to Stride_list; when it is
            not empty the labels of the kept strides are returned as well.
        target_rng: [min, max] accepted target.
        time_rng: [min, max] accepted number of samples.

    Returns:
        (kept strides, kept labels); the label list is empty when no
        mode_list was given.
    """
    clearedS, clearedM = [], []
    track_modes = len(mode_list) != 0

    for position, diz in enumerate(Stride_list):
        # outliers are skipped, everything else is kept
        if not is_in_range(diz["target"], target_rng):
            continue
        if not is_in_range(len(diz["Acc_X"]), time_rng):
            continue
        clearedS.append(diz)
        if track_modes:
            clearedM.append(mode_list[position])

    return clearedS, clearedM

In [12]:
def random_list_range(lista,n_elem,randomize: bool= False):
    '''
    Split `lista` into (selected, leftovers).

    - n_elem == -1: everything is selected -> (lista, []).
    - n_elem < -1 or n_elem > len(lista): nothing can be selected -> ([], lista).
    - randomize False: the first n_elem elements are selected.
    - randomize True: n_elem distinct positions are drawn with random.randint;
      both returned lists keep the original relative order of the elements.
    '''
    size = len(lista)

    # invalid request: not enough elements, or a negative count other than -1
    if n_elem < -1 or size < n_elem:
        return [], lista

    if n_elem == -1:
        return lista, []

    # deterministic selection: head of the list
    if not randomize:
        return lista[:n_elem], lista[n_elem:]

    # random selection: draw positions until n_elem distinct ones are collected
    chosen_positions = set()
    while len(chosen_positions) != n_elem:
        chosen_positions.add(random.randint(0, size - 1))

    rand_list = []
    left_list = []
    for position, item in enumerate(lista):
        target_list = rand_list if position in chosen_positions else left_list
        target_list.append(item)

    return rand_list, left_list

In [13]:
def make_eqdataset(full_x,full_y,list_stop,n_elem,randomize: bool=False):
    """
    Balance a dataset by taking the same number of strides from every subject.

    Args:
        full_x: sequence of feature vectors of all subjects, concatenated.
        full_y: sequence of targets parallel to full_x.
        list_stop: end index (exclusive) of every subject inside full_x/full_y,
            in increasing order; the first subject starts at index 0.
        n_elem: number of strides taken per subject (-1 takes all of them).
            Subjects with fewer than n_elem strides are skipped.
        randomize: when True the strides are drawn at random instead of being
            taken from the head of the subject.

    Returns:
        (new_x, new_y): the balanced features and their matching targets.

    Features and targets are selected together as (x, y) pairs. They used to
    be sampled with two independent calls, which with randomize=True picked
    different random positions for x and y and broke their correspondence.
    """
    new_x=[]
    new_y=[]
    n_s=0

    start=0
    for stop in list_stop:
        subject_x=full_x[start:stop]
        subject_y=full_y[start:stop]
        start=stop

        # pair every feature vector with its target so one draw selects both
        # (zip stops at the shorter of the two slices)
        pairs=list(zip(subject_x,subject_y))
        chosen, _ = random_list_range(pairs,n_elem,randomize= randomize)

        if len(chosen)>0:
            new_x+=[pair[0] for pair in chosen]
            new_y+=[pair[1] for pair in chosen]
            n_s+=1

    print(f"####\nDataset equilibrato, scelti {n_elem} stride per ogni soggetto.\nRisultato: {len(new_x)} stride per {n_s} subject.\n#####")
    return new_x, new_y
def filenames(filename_list,n_elem,stop_list):
    """
    Return the file names that contributed at least n_elem elements.

    stop_list[i] is the cumulative element count after file i, so the size of
    file i is stop_list[i] minus the previous stop (0 for the first file).
    """
    filt_filename = []
    previous_stop = 0
    for position, stop in enumerate(stop_list):
        if stop - previous_stop >= n_elem:
            filt_filename.append(filename_list[position])
        previous_stop = stop

    return filt_filename

In [14]:
def return_plot_range(X,Y):
    """Return [lowest, highest] value found across both sequences X and Y."""
    lowest = min(min(X), min(Y))
    highest = max(max(X), max(Y))
    return [lowest, highest]

In [15]:
def ist_stride_lenght(vector_of_stride_lenght):
    """
    Print min, max, mean and standard deviation of the stride lengths and show
    their histogram with a bin width of 0.01. Always returns 0.
    """
    shortest = min(vector_of_stride_lenght)
    longest = max(vector_of_stride_lenght)
    print("Registered Stride lenght: Min: ", shortest, ", Max: ", longest)

    # bin width of 1cm
    bw = 0.01
    bin_edges = np.arange(shortest, longest + bw, bw)
    plt.hist(vector_of_stride_lenght, bins=bin_edges)
    plt.xlim([shortest, longest])

    media = np.mean(np.array(vector_of_stride_lenght, dtype=float))
    st_d = np.std(np.array(vector_of_stride_lenght, dtype=float))
    print(f"Media= {str(media)}; Deviazione standard= {str(st_d)}.")

    plt.xlabel("Lunghezza dello stride/ mt")
    plt.ylabel("Numero di sample/ n")
    plt.show()
    return 0

In [16]:
def printSLE(dati_json,n_sample: int = -1):
    """
    Plot every sensor stream of a recording against its relative timestamp.

    Each value of `dati_json` that is a list (except "flag", "SensorTimestamp",
    "stride_number", "stride_length" and "walking_distance") is drawn in its
    own figure; the value under "stride_length" is passed to
    ist_stride_lenght to draw the stride-length histogram.

    Args:
        dati_json: dict with a "SensorTimestamp" list and the sensor streams.
        n_sample: number of samples to plot after the first 50, which are
            always skipped. With a non-positive value it is used directly as
            the slice end, so the default -1 plots everything but the last
            sample. NOTE(review): confirm whether -1 was meant as "all".

    Returns:
        0, always.
    """
    if n_sample > 0 :
        # turn the sample count into the slice end (the slice starts at 50)
        n_sample+=50

    # timestamps relative to the first sample
    first_stamp=dati_json["SensorTimestamp"][0]
    time_stamp=[x-first_stamp for x in dati_json["SensorTimestamp"]]

    not_plotted=("flag","SensorTimestamp","stride_number","stride_length","walking_distance")
    for k,v in dati_json.items():
        if (type(v) is list) and (k not in not_plotted):
            # The figure is opened BEFORE plotting, so each stream gets its own
            # figure and no empty figure is left behind.
            plt.figure()
            window=v[50:n_sample]
            plt.plot(time_stamp[50:n_sample],window,color='r')
            plt.ylabel(k)
            # escaped backslash: same label text, without an invalid escape sequence
            plt.xlabel("Time \\ ms")
            plt.ylim([min(window),max(window)])
            plt.title("Grafico del "+ k)
            plt.show()
        elif (k == "stride_length"):
            plt.figure()
            ist_stride_lenght(v)
    return 0

In [17]:
def f_magnitude(dati_x,dati_y,dati_z):
    '''
    Compute the magnitude (Euclidean norm) of a 3-axis signal, sample by sample.

    Returns a list with one value per element of dati_x.
    '''
    return [
        math.sqrt((dati_x[i]*dati_x[i]) + (dati_y[i]*dati_y[i]) + (dati_z[i]*dati_z[i]))
        for i in range(len(dati_x))
    ]

In [18]:
def error_rate(y_true,y_pred):
    '''
    Mean relative error, in percent, between true and predicted values.

    Input= (N-1darray) true values, (N-1darray) predicted values
    Output= mean over all elements of |y_pred - y_true| / y_true, times 100
    '''
    n_items = len(y_true)
    relative_sum = 0
    for idx in range(n_items):
        relative_sum += abs(y_pred[idx] - y_true[idx]) / y_true[idx]

    return (relative_sum / n_items) * 100

In [19]:
def feature_extraction(data_ing):#compute every feature for each single stream and also for the sensor magnitudes, i.e. features of A_x,y,z,||A||,G_x,y,z,||G||
    '''
    Build the feature vector of one stride.

    data_ing= dict that represents one stride; it must hold the lists
    "Acc_X","Acc_Y","Acc_Z","Gyr_X","Gyr_Y","Gyr_Z". It is deep-copied and never modified.

    For every list stored in the stride (except "SensorTimestamp"), plus the two magnitude
    streams "M_Acc" and "M_Gyr" computed here, 11 features are appended, in this order:
    < f_mean, f_std, f_skew, f_kurt, f_iqr, f_Ma, f_zc, f_A1, f_F1, f_A2, f_F2 >
      - f_mean, f_std: mean and standard deviation (np.std)
      - f_skew, f_kurt: scipy.stats skew and kurtosis with default arguments
      - f_iqr: 75th minus 25th percentile
      - f_Ma: magnitude area, the sum of the absolute values
      - f_zc: number of sign changes of (sample - mean) between consecutive samples
      - f_A1, f_F1: largest FFT amplitude of the mean-removed stream and its frequency
      - f_A2, f_F2: second largest FFT amplitude and its frequency
    Frequencies are computed for a sampling rate of 100 Hz.

    Streams are visited in the key order of the stride dict, with M_Acc and M_Gyr added last
    (when they are not already present), followed by 4 cross/high-level features:
      - f_cc_AG: correlation coefficient between M_Acc and M_Gyr
      - f_wei: fourth root of (max - min) of Acc_Z
      - f_kim: cube root of the mean absolute value of M_Acc
      - f_sca: (sum of |M_Acc| - max(M_Acc)) / (max(M_Acc) - min(M_Acc))

    OUTPUT=> ( 1d list) feature_vector; for a stride in the formatSLE layout this is presumably
    [f_Ax,f_Ay,f_Az,f_Gx,f_Gy,f_Gz,f_Am,f_Gm,f_CC,f_wei,f_kim,f_scar] - verify against caller.

    '''
    
    
    data_json=copy.deepcopy(data_ing)#so that data_ing is not modified

    data_json["M_Acc"]=f_magnitude(data_json["Acc_X"],data_json["Acc_Y"],data_json["Acc_Z"])
    data_json["M_Gyr"]=f_magnitude(data_json["Gyr_X"],data_json["Gyr_Y"],data_json["Gyr_Z"])
    
    #Data_lable=["Acc_X","Acc_Y","Acc_Z","Gyr_X","Gyr_Y","Gyr_Z","M_Acc","M_Gyr"]
    
    feature_vector=[]
    for lable, stream in data_json.items():#for every stream of measurements
        # only plain lists are treated as streams; scalars such as "target" are skipped
        if (type(stream) is list) and (lable != "SensorTimestamp"):
            #STATISTICAL FEATURES-----------------------------------
            f_mean=sum(stream)/len(stream)
            f_std=np.std(np.array(stream))  
            f_skew=ss.skew(stream)
            f_kurt=ss.kurtosis(stream)

            #calculate interquartile range 
            q3, q1 = np.percentile(np.array(stream), [75 ,25])
            f_iqr = q3 - q1

            #magnitude area= sum of the absolute values of a signal
            stream_abs=[abs(x) for x in stream]
            f_Ma=sum(stream_abs)

            #TIME-DOMAIN FEATURES-------------------------------------

            limite=max(stream)*95/100#the threshold used to detect a peak is set to 95% of the maximum value
            f_peak=0
            #the zero crossings are counted inside the same for loop
            media_stream=np.mean(stream)
            f_zc=0
            for i in range(len(stream)):
                #peak
                # f_peak is computed but not added to the feature vector (see the commented append below)
                if stream[i]>=limite:
                    f_peak+=1
                #zero crossing
                # a crossing is counted when two consecutive samples lie on opposite sides of the mean
                if i!=0:
                    if ( ((stream[i]-media_stream)*(stream[i-1]-media_stream)) < 0):
                        f_zc+=1


            #FREQUENCY-DOMAIN FEATURES-------------------------------------
            #FFT of the stream without mean to find amplitude
            amplitude=np.abs(fft((stream-media_stream)))
            #find the frequencies of the time series
            sample_rate= 100#sampling frequency in Hz
            N=len(amplitude)#number of sample n

            frequences= fftfreq(N, d= (1 / sample_rate))
            #take only positives
            frequences=[x for x in frequences if x >=0]
            #take amplitude
            # keep the amplitudes that belong to the non-negative frequencies
            amplitude=amplitude[0:len(frequences)]
            f_A1=np.amax(amplitude)
            f_F1=frequences[int((np.where(amplitude==f_A1)[0][0]))]
            # zero the strongest bin so the next maximum is the second strongest component
            amplitude[int((np.where(amplitude==f_A1)[0][0]))]=0
            f_A2=np.amax(amplitude)
            f_F2=frequences[int((np.where(amplitude==f_A2)[0][0]))]





            #build the vector feature_vector=[mean,std,skew,kurt,iqr,Ma,zc,A1,F1,A2,F2 |for every Acc_x,Acc_y...]
            feature_vector.append(f_mean)
            feature_vector.append(f_std)
            feature_vector.append(f_skew)
            feature_vector.append(f_kurt)
            feature_vector.append(f_iqr)
            feature_vector.append(f_Ma)
            #feature_vector.append(f_peak) #feature not accurate
            feature_vector.append(f_zc)
            feature_vector.append(f_A1)
            feature_vector.append(f_F1)
            feature_vector.append(f_A2)
            feature_vector.append(f_F2)

    #CROSS-SENSOR FEATURES--------------------------------------
    #correlation between M_A and M_G
    f_cc_AG = np.corrcoef(np.array(data_json["M_Acc"]),np.array(data_json["M_Gyr"]))[0,1]


    
    #HIGH-LEVEL FEATURES, PDR--------------------------------
    #Weinberg
    a_max=max(data_json["Acc_Z"])
    a_min=min(data_json["Acc_Z"])
    f_wei=math.sqrt(math.sqrt(a_max-a_min))


    #Kim
    M_A_abs=[abs(x) for x in data_json["M_Acc"]]
    sum_abs=sum(M_A_abs)
    f_kim=(sum_abs/len(data_json["M_Acc"]))**(1/3)


    #Scarlett
    # NOTE(review): the denominator is zero when M_Acc is constant - confirm this cannot happen

    f_sca=(sum_abs-max(data_json["M_Acc"]))/(max(data_json["M_Acc"])-min(data_json["M_Acc"]))


    #append the cross-sensor and high-level features
    feature_vector.append(f_cc_AG)
    feature_vector.append(f_wei)
    feature_vector.append(f_kim)
    feature_vector.append(f_sca)
    
    return feature_vector

In [20]:
def remove_mean(data):
    '''
    Take a signal and return a new list with its mean subtracted from every sample.
    '''
    offset = sum(data) / len(data)
    centered = []
    for sample in data:
        centered.append(sample - offset)
    return centered
def SLE_remove_mean(data_json):
    """
    Remove the mean from the six sensor streams of a stride, in place.

    The entries "Acc_X","Acc_Y","Acc_Z","Gyr_X","Gyr_Y","Gyr_Z" of `data_json`
    are replaced by their mean-removed version. Always returns 0.
    """
    for axis in ("Acc_X", "Acc_Y", "Acc_Z", "Gyr_X", "Gyr_Y", "Gyr_Z"):
        data_json[axis] = remove_mean(data_json[axis])

    return 0
def SLE_buttfilter(data_json,norder: int=1):
    '''
    Return a mean-removed, low-pass filtered copy of a stride.

    Input = data_json: stride dict with the streams "Acc_X","Acc_Y","Acc_Z","Gyr_X","Gyr_Y","Gyr_Z"
            norder: order of the Butterworth filter
    Output= deep copy of the stride in which those six streams had their mean removed and were
            then filtered with a digital low-pass Butterworth filter (cutoff 3, fs=100), stored
            as plain lists. All other entries are copied unchanged; the input is not modified.

    The filter is applied to the mean-removed copy. It used to be applied to the original
    `data_json` streams, which silently discarded the mean removal done just before.
    '''
    Data_lable=["Acc_X","Acc_Y","Acc_Z","Gyr_X","Gyr_Y","Gyr_Z"]#mag_x_y_z were dropped

    #design filter
    sos_filter=signal.butter(norder,3,'low',analog=False,output='sos',fs=100)

    # work on a copy so the caller's stride is left untouched
    data_filtered=copy.deepcopy(data_json)
    SLE_remove_mean(data_filtered)
    for x in Data_lable:
        #Filt signal (the mean-removed one held by the copy)
        data_filtered[x]=signal.sosfilt(sos_filter,data_filtered[x]).tolist()

    return data_filtered

In [21]:
def dividi(dati_json,index_inizio,index_fine):
    '''
    Cut one stride out of a whole recording.

    Input= whole recording (dict of parallel lists), index of the sample that
           closes the previous stride, index of the sample that closes this one
    Output= dict with the samples index_inizio+1 .. index_fine (inclusive):
    {
        "target": dati_json["stride_length"][index_fine],
        "Acc_X": [...], "Acc_Y": [...], "Acc_Z": [...],
        "Gyr_X": [...], "Gyr_Y": [...], "Gyr_Z": [...],
        "SensorTimestamp": [...]
    }
    '''
    stream_keys = ("Acc_X", "Acc_Y", "Acc_Z", "Gyr_X", "Gyr_Y", "Gyr_Z", "SensorTimestamp")
    first = index_inizio + 1
    past_last = index_fine + 1

    # the target comes first, then one slice per stream
    new_dict = {"target": dati_json["stride_length"][index_fine]}
    new_dict.update((key, dati_json[key][first:past_last]) for key in stream_keys)

    return new_dict

In [22]:
def segmentation(dati_json):
    '''
    Split a whole recording into strides.

    Input= whole recording (dict); "stride_number" gives the stride counter of every sample
    Output= list of dicts (one per stride, see dividi), without the first stride

    A stride ends at every sample after which the stride counter increases;
    the last sample always closes a stride.
    '''
    counters = np.array([dati_json["stride_number"]], dtype=float)
    # positive difference = the counter grows after this sample; the trailing 1 closes the last stride
    limits = np.append(np.ediff1d(counters), 1.)

    dati_segmentati = []
    fine = -1
    for position, delta in enumerate(limits):
        if delta > 0:
            # the start of a stride is the end of the previous one (ground contact)
            inizio, fine = fine, position
            dati_segmentati.append(dividi(dati_json, inizio, fine))

    if len(dati_segmentati) < 1:
        print("Questi dati non possono essere segmentati")
        return {}

    # drop the first stride, which is often misaligned
    dati_segmentati.pop(0)

    # make sure every stream is a plain list
    for diz in dati_segmentati:
        for k, v in diz.items():
            if type(v) is np.ndarray:
                diz[k] = v.tolist()

    return dati_segmentati


def wdesplit(F_x_WDE,mode_list):
    '''
    Split the feature vectors by carrying mode.

    F_x_WDE is the list of feature vectors and mode_list the parallel list of
    mode labels. Returns a list of five lists, one per mode, in the order
    armhand, pocket, calling, swing, handheld, and prints how many vectors
    each mode received. A label outside these five raises ValueError.
    '''
    classi = ['armhand', 'pocket', 'calling', 'swing', 'handheld']
    split_dataset = [[] for _ in classi]
    for position, mode in enumerate(mode_list):
        split_dataset[classi.index(mode)].append(F_x_WDE[position])

    print(f"Per {len(split_dataset)} lable:")
    for cls, bucket in zip(classi, split_dataset):
        print(f"\t{cls}={len(bucket)}")

    return split_dataset



def eqWDE(split_dataset,n_elem: int=600,n_lft: int=85,randomize: bool=False):
    '''
    Balance a dataset that was split by mode (see wdesplit).

    n_elem feature vectors are taken from every mode; from what is left of
    every mode, n_lft feature vectors are kept as a second, balanced set.

    Returns:
        F_x_eqWDE: 2-d array with n_elem feature vectors per mode
        eq_mode_list: array with the matching mode labels
        left_eqWDE: 2-d array with n_lft leftover feature vectors per mode
        left_mode_list: array with the matching mode labels
    '''
    classi = ['armhand', 'pocket', 'calling', 'swing', 'handheld']
    # NOTE(review): the check uses 601 while the message speaks of 600, and it
    # only prints - execution continues. Confirm the intended limit.
    if n_elem > 601:
        print("Fatal Error: il massimo dataset equilibrato è possibile con max 600 sample")

    # first pass: the balanced selection and what remains of every mode
    selected = []
    remainders = []
    for mode_vectors in split_dataset:
        picked, rest = random_list_range(mode_vectors, n_elem, randomize=randomize)
        selected.append(picked)
        remainders.append(rest)

    # second pass: balance the leftovers too. Removing all but n_lft elements
    # leaves exactly n_lft of them (the LAST n_lft when randomize is False).
    left_per_mode = []
    for rest in remainders:
        _, kept = random_list_range(rest, len(rest) - n_lft, randomize=randomize)
        left_per_mode.append(kept)

    # flatten (mode, stride, feature) into (stride, feature)
    F_x_eqWDE = np.array(selected).reshape(-1, len(selected[0][0]))
    left_eqWDE = np.array(left_per_mode).reshape(-1, len(left_per_mode[0][0]))

    # labels: every mode name repeated once per selected / leftover stride
    eq_mode_list = np.array([cls for cls in classi for _ in range(n_elem)])
    left_mode_list = np.array([cls for cls in classi for _ in range(n_lft)])

    return F_x_eqWDE, eq_mode_list, left_eqWDE, left_mode_list



def genStride_list(dataset_path, filename_list, mode: str="handheld"):
    """
    Load the strides of the given mode from several recordings.

    Every file is read with WDEselect and the results are concatenated.

    Returns:
        Stride_list: all the selected strides.
        stop_list: cumulative number of strides after every file that
            contributed at least one stride.
        filt_filename: names of the files that contributed at least one stride.
        mode_list: mode label of every stride in Stride_list.
    """
    Stride_list, mode_list = [], []
    stop_list, filt_filename = [], []

    for filename in filename_list:
        print(f"--------Processing: {filename} --------")
        file_strides, file_modes = WDEselect(filename, dataset_path, mode)
        Stride_list.extend(file_strides)
        mode_list.extend(file_modes)
        # files without usable strides leave no trace in the bookkeeping lists
        if len(file_strides) != 0:
            stop_list.append(len(Stride_list))
            filt_filename.append(filename)

    print(f"\nTotale: {len(Stride_list)} stride")
    return Stride_list, stop_list, filt_filename, mode_list


def WDEselect(filename,dataset_path, mode: str="handheld"):
    '''
    Read one WDE recording and keep only the strides of the requested mode.

    The file `dataset_path + filename + ".txt"` holds one JSON stride per line.
    Strides whose "mode" equals `mode` are kept (every stride when mode is
    "all"), converted with formatSLE and then cleaned with del_outliers
    (target in [1, 2], 80 to 300 samples).

    Returns (Stride_list, mode_list): the kept strides and their mode labels.
    '''
    Stride_list = []
    mode_list = []
    no_handheld = 0  # strides discarded because of their mode

    # import all data from file
    with open(dataset_path + filename + ".txt", 'r') as f:
        for line in f:
            stride = json.loads(line)
            stride_mode = stride["mode"]
            if stride_mode != mode and mode != "all":
                no_handheld += 1
                continue
            Stride_list.append(formatSLE(stride))
            mode_list.append(stride_mode)

    orig_len = len(Stride_list)
    print(f"Sono rimasti {orig_len} stride in mode {mode}; Sono stati eliminati {no_handheld} stride.")
    if orig_len != 0:
        # eliminate outliers
        Stride_list, mode_list = del_outliers(Stride_list, mode_list=mode_list, target_rng=[1, 2], time_rng=[80, 300])
        print("I dati originari erano: ",orig_len,", adesso sono:",len(Stride_list),"; Abbiamo eliminato ",orig_len-len(Stride_list)," stride.\n")
    return Stride_list, mode_list