In [1]:
import sys
sys.path.append('..')
from src.read_databases import read_database4, read_database3, get_database4
import pandas as pd
import numpy as np
import h5py
import yaml

In [2]:
def include_metadata(dado, database_id, pathcwd, **kwargs):
    import os
    config = yaml.safe_load(open(pathcwd+'params.yaml'))
    path = config['data_path']['dado'+str(database_id)]
    subj_string = 's'+str(kwargs['subj'])
    dado['database_id'] = database_id
    dado['database_id'] = dado['database_id'].astype('category')
    dado['n_dim1'] = kwargs['h'] if 'h' in kwargs.keys() else np.nan
    dado['n_dim1'] = dado['n_dim1'].astype('category')
    dado['n_dim2'] = kwargs['w'] if 'w' in kwargs.keys() else np.nan
    dado['n_dim2'] = dado['n_dim2'].astype('category')
    dado['subject'] = kwargs['subj'] if 'subj' in kwargs.keys() else np.nan
    dado['subject'] = dado['subject'].astype('category')
    segment = kwargs['segment'] if 'segment' in kwargs.keys() else np.nan
    dado['segment'] = segment
    dado['segment'] = dado['segment'].astype('category')
    dado['trial'] = kwargs['trial'] if 'trial' in kwargs.keys() else np.nan
    dado['trial'] = dado['trial'].astype('category')
    task = kwargs['task'] if 'task' in kwargs.keys() else np.nan
    if task == 'p': dado['task'] = 'pronation'
    if task == 'e': dado['task'] = 'extension'
    if task == 's': dado['task'] = 'supination'
    if task == 'f': dado['task'] = 'flexion'
    dado['task'] = dado['task'].astype('category')
    dado['intensity'] = kwargs['intensity'] if 'intensity' in kwargs.keys() else np.nan
    dado['intensity'] = dado['intensity'].astype('category')
    dado['frequency_sample'] = kwargs['fs'] if 'fs' in kwargs.keys() else np.nan
    dado['frequency_sample'] = dado['frequency_sample'].astype('category')
    orig_ndim1 = config['database_params'][f'dado{database_id}'][f'nrows_{kwargs["segment"]}']
    dado['original_n_dim1'] = orig_ndim1
    dado['original_n_dim1'] = dado['original_n_dim1'].astype('category')
    if f'ncolumns_{kwargs["segment"]}' in config['database_params'][f'dado{database_id}'].keys():
        dado['original_n_dim2'] = config['database_params'][f'dado{database_id}'][f'ncolumns_{kwargs["segment"]}']
        
    else:
        nchannels_table = pd.read_csv(pathcwd+path+'nchannels.txt', sep='\s+')
        nchannels = nchannels_table.query('subject==@subj_string')[kwargs["segment"]].values[0]
        orig_ndim2 = nchannels/orig_ndim1
        dado['original_n_dim2'] = orig_ndim2
    dado['original_n_dim2'] = dado['original_n_dim2'].astype('category')
    if os.path.exists(pathcwd+path+'SubjectsDescription.txt'):
        meta_subjects = pd.read_csv(pathcwd+path+'SubjectsDescription.txt', sep='\t')        
        meta_subjects = meta_subjects.query('Subject==@subj_string')
        for column in list(meta_subjects.columns)[1:]:
            if column.find('circumference') < 0 and column.find('length') < 0 :
                dado[column] = meta_subjects[column].values[0]
                dado[column] = dado[column].astype('category')
            elif column.find('circumference') >= 0 and column.find(segment.capitalize()) >=0:
                dado['circumference (cm)'] = meta_subjects[column].values[0]
                dado['circumference (cm)'] = dado['circumference (cm)'].astype('category')
            elif column.find('length (cm)') >= 0 and column.find(segment.capitalize()) >=0:
                dado['length (cm)'] = meta_subjects[column].values[0]
                dado['length (cm)'] = dado['length (cm)'].astype('category')
    if os.path.exists(pathcwd+path+'ReferencePoints.txt'):
        header1 = pd.read_csv(pathcwd+path+'ReferencePoints.txt', sep='\t', index_col=0, nrows=0)        
        header1 = list(header1.columns)
        header = [h1 + '_' + h2 for h1 in header1[::2] for h2 in ['x', 'y']]        
        meta_reference = pd.read_csv(pathcwd+path+'ReferencePoints.txt', sep='\t', index_col=0, 
                                     skiprows=1, header=None, names=header) 
        meta_reference = meta_reference.loc[subj_string]
        if segment == 'biceps' or segment == 'triceps':
            dado['reference_x'] = meta_reference[segment.capitalize()+'_x']
            dado['reference_y'] = meta_reference[segment.capitalize()+'_y']
            dado['reference_x'] = dado['reference_x'].astype('category')
            dado['reference_y'] = dado['reference_y'].astype('category')
        if segment == 'forearm':
            for muscle in ['Brachio Radialis','Anconeus','Pronator Teres']:
                dado['reference_'+muscle+'_x'] = meta_reference[muscle+'_x']
                dado['reference_'+muscle+'_y'] = meta_reference[muscle+'_y']
                dado['reference_'+muscle+'_x'] = dado['reference_'+muscle+'_x'].astype('category')
                dado['reference_'+muscle+'_y'] = dado['reference_'+muscle+'_y'].astype('category')
    return dado

In [3]:
def transform_database(h, w, subj, type, task, intensity, trial,  database_id, pathcwd=''):
    if database_id == 4: 
        get_database = get_database4
        read_database = read_database4
    db_size, dado_orig = get_database(subj, type, task, intensity, trial, pathcwd='../')
    config = yaml.safe_load(open(pathcwd+'params.yaml'))
    path = config['data_path']['dado4']
    n_channels_file = pd.read_csv(pathcwd+path+'nchannels.txt', sep='\s+', index_col='subject')
    subj_string = 's'+str(subj)
    n_channels = n_channels_file.loc[subj_string, type]   
    dado_orig = dado_orig.reshape(n_channels,-1)
    dado_orig = dado_orig.T
    column_names = np.arange(0, h*w)
    column_names = list(column_names)
    column_names = [str(d) for d in column_names]
    dado = pd.DataFrame(np.zeros((db_size,h*w)), columns=column_names)
    for t in range(db_size):
        dado.iloc[t, :] = read_database(dado_orig, t, h, w, subj, type, task, intensity, trial, pathcwd='../').reshape(1,-1)
    dado = include_metadata(dado, database_id, '../', h=h, w=w, subj=subj, task=task, segment=type, intensity=intensity, fs=2048)
    return dado

In [4]:
h = 9
w = 14

for subject in [3,1,2,4,5,6,7,8,9,10,11,12]:
    for segment in ['forearm', 'biceps', 'triceps']:
        for task in ['s','e','f','p']:
            for intensity in [10,30, 50]:
                print(f'{subject=},{segment=},{task=},{intensity=}')
                dado_new = transform_database(h, w, subject, segment, task, intensity, None, database_id=4, pathcwd='../')
                if 'dado' in locals():
                    dado = pd.concat([dado, dado_new], join='outer')
                else:
                    dado = dado_new.copy()

dado['Dataset URL'] = 'https://www.nature.com/articles/s41597-020-00717-6#Sec7'
dado['Trial'] = 1

subject=3,segment='forearm',task='s',intensity=10


FileNotFoundError: [Errno 2] No such file or directory: 'params.yaml'

In [21]:
dado.columns = list(np.array(list(dado.columns)))
dado = [str(d) for d in dado.columns]

In [22]:
dado.to_parquet('dado4.parquet', engine='fastparquet', index=False) 

AttributeError: 'list' object has no attribute 'to_parquet'

In [9]:
dado.to_csv('dado4.csv', index=False)

OSError: [Errno 28] No space left on device

In [14]:
dado

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,reference_Brachio Radialis_x,reference_Brachio Radialis_y,reference_Anconeus_x,reference_Anconeus_y,reference_Pronator Teres_x,reference_Pronator Teres_y,reference_x,reference_y,Dataset URL,Trial
0,-0.184457,0.002652,0.026943,-0.060069,0.006547,0.064378,0.052650,0.006263,0.019386,0.046576,...,17,1,0,1,11,1,,,https://www.nature.com/articles/s41597-020-007...,1
1,0.019815,-0.033838,-0.066572,-0.013479,0.036679,-0.006372,-0.048370,0.009475,-0.052250,-0.021341,...,17,1,0,1,11,1,,,https://www.nature.com/articles/s41597-020-007...,1
2,-0.019153,-0.035226,0.029362,0.006561,0.016036,-0.001061,0.011895,-0.034078,-0.007353,-0.033141,...,17,1,0,1,11,1,,,https://www.nature.com/articles/s41597-020-007...,1
3,-0.026892,-0.016458,0.043647,0.046082,0.037340,0.000872,0.008312,-0.017090,0.029442,-0.011037,...,17,1,0,1,11,1,,,https://www.nature.com/articles/s41597-020-007...,1
4,-0.018681,0.013798,-0.014234,0.077566,0.052701,0.006147,0.003037,0.024436,0.026202,0.017061,...,17,1,0,1,11,1,,,https://www.nature.com/articles/s41597-020-007...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20475,0.000999,-0.001088,-0.014374,0.038713,-0.021772,-0.049294,0.039464,-0.006544,0.030964,0.021969,...,,,,,,,7,5,https://www.nature.com/articles/s41597-020-007...,1
20476,-0.010530,0.035573,-0.017738,0.005706,-0.051191,-0.051387,-0.009646,-0.007663,-0.001564,-0.001533,...,,,,,,,7,5,https://www.nature.com/articles/s41597-020-007...,1
20477,0.009956,0.026472,-0.025328,0.034209,0.018109,0.001879,0.011100,0.010004,0.022318,0.003120,...,,,,,,,7,5,https://www.nature.com/articles/s41597-020-007...,1
20478,0.023492,-0.006252,-0.048098,0.018010,-0.015309,-0.004200,0.027722,0.002163,-0.018483,-0.011741,...,,,,,,,7,5,https://www.nature.com/articles/s41597-020-007...,1


In [11]:
dado['segment']

0        forearm
1        forearm
2        forearm
3        forearm
4        forearm
          ...   
20475    triceps
20476    triceps
20477    triceps
20478    triceps
20479    triceps
Name: segment, Length: 1474560, dtype: object

In [105]:
dado.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40960 entries, 0 to 20479
Columns: 149 entries, 0 to reference_y
dtypes: category(19), float64(128), int64(1), object(1)
memory usage: 43.9 MB
