In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

## Diabetes

Diabetes files consist of four fields per record. Each field is separated by a tab and each record is separated by a newline.

File Names and format:\
(1) Date in MM-DD-YYYY format\
(2) Time in XX:YY format\
(3) Code\
(4) Value

The Code field is deciphered as follows:

33 = Regular insulin dose\
34 = NPH insulin dose\
35 = UltraLente insulin dose\
48 = Unspecified blood glucose measurement\
57 = Unspecified blood glucose measurement\
58 = Pre-breakfast blood glucose measurement\
59 = Post-breakfast blood glucose measurement\
60 = Pre-lunch blood glucose measurement\
61 = Post-lunch blood glucose measurement\
62 = Pre-supper blood glucose measurement\
63 = Post-supper blood glucose measurement\
64 = Pre-snack blood glucose measurement\
65 = Hypoglycemic symptoms\
66 = Typical meal ingestion\
67 = More-than-usual meal ingestion\
68 = Less-than-usual meal ingestion\
69 = Typical exercise activity\
70 = More-than-usual exercise activity\
71 = Less-than-usual exercise activity\
72 = Unspecified special event 

In [185]:
# Directory holding the diabetes record files; the loading cell appends each file name to it.
path = 'datasets/diabete/diabetes-data/'

**numpy data:** data_diabete

**clusters:** no labels, but it might be interesting to see how the algorithm clusters the data

In [194]:
# Read the record files data-01 ... data-70, one array per file.
# Only columns 1, 2 and 3 are loaded (time, code and value according to the
# field list above); column 0, the date, is left out.
data_diabete = [
    pd.read_csv(
        path + 'data-' + '{:02d}'.format(file_no),
        sep='\t',
        usecols=[1, 2, 3],
        engine='python',
        header=None,
    ).to_numpy()
    for file_no in range(1, 71)
]

data_diabete[0]

array([['9:09', 58, 100],
       ['9:09', 33, 9],
       ['9:09', 34, 13],
       ...,
       ['7:20', 58, 110],
       ['7:20', 33, 9],
       ['7:20', 34, 16]], dtype=object)

## Mobile robot

The data is stored in three text files: one file for experiences in which the Pioneer was moving in a straight line, one in which it was turning in place, and one in which it was raising or lowering its gripper.

The description variable is a string of symbols. The string breaks down as follows:

"u" or "o" - unobstructed or obstructed\
"x.xs" - activity lasted x.x seconds\
activity - the activity and speed, if applicable, i.e. move100 = move forward at 100mm/sec\
visual - objects in the visual array are listed in sequence. "cAHEAD" indicates an object visible to channel c directly AHEAD of the Pioneer.\
\[visual.X\] - visual descriptions followed by a '.' and one character indicate that something special happens with the visible object. .V means the object Vanishes from sight during the activity. .D indicates that the object is Discovered (becomes visible) during the activity. .P indicates that the object is pushed.

An example: "u-3.5s-retr-100-aRIGHT.D" An unobstructed retreat (move) at -100 mm/sec for 3.5 seconds with an object being discovered in channel A. 

There are 102 move experiences, 42 turn experiences, 16 gripper experiences.

In [1]:
# Directory containing the Pioneer experience files (MOVE.DATA, GRIPPER.DATA, TURN.DATA).
path = 'datasets/pioneers/'

In [4]:
# Load the MOVE experiences; the file has no header row, so columns get integer labels.
data_pd = pd.read_csv(
    path + 'MOVE.DATA',
    header=None,
    sep=',',
    engine='python',
)
data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,MOVE-TRIALT148,u-3.5s-retr-250-cAHEAD,1525.0,11.8,345.6,1276.0,1636.0,808.0,445.0,459.0,...,0.0,0.0,10000.0,140.0,0.0,0.0,0.0,10000.0,140.0,0.0
1,MOVE-TRIALT148,u-3.5s-retr-250-cAHEAD,1525.1,11.8,345.6,1276.0,1636.0,808.0,452.0,452.0,...,5.0,0.0,223.5,-102.0,188.0,0.0,0.0,10000.0,140.0,0.0
2,MOVE-TRIALT148,u-3.5s-retr-250-cAHEAD,1525.2,11.8,345.6,1276.0,1629.0,815.0,452.0,452.0,...,5.0,1.0,263.8,-86.0,169.0,0.0,0.0,10000.0,140.0,0.0
3,MOVE-TRIALT148,u-3.5s-retr-250-cAHEAD,1525.3,11.8,345.6,1276.0,1629.0,815.0,452.0,452.0,...,0.0,0.0,10000.0,140.0,0.0,0.0,0.0,10000.0,140.0,0.0
4,MOVE-TRIALT148,u-3.5s-retr-250-cAHEAD,1525.4,11.8,345.6,1276.0,1629.0,811.0,452.0,459.0,...,0.0,0.0,10000.0,140.0,0.0,0.0,0.0,10000.0,140.0,0.0


In [5]:
# Column 0 holds the experience name: one array is built per distinct name.
exp_names = np.unique(data_pd[0])

# Columns 0, 1 and 2 are removed from every experience before conversion to numpy.
data_pioneers0 = [
    data_pd.loc[data_pd[0] == exp_name].drop(columns=[0, 1, 2]).to_numpy()
    for exp_name in exp_names
]

# label 0: MOVE experience
labels_MOVE = np.zeros(len(data_pioneers0))
data_pioneers0[0]

array([[   12.7,   356.5,  1264. , ...,   966.4,    95. ,    71. ],
       [   12.7,   356.5,  1275. , ...,   966.4,    94. ,    71. ],
       [   12.7,   356.5,  1275. , ..., 10000. ,   140. ,     0. ],
       ...,
       [   12.7,   358.6,  1284. , ..., 10000. ,   140. ,     0. ],
       [   12.7,   358.6,  1285. , ..., 10000. ,   140. ,     0. ],
       [   12.7,   358.6,  1285. , ..., 10000. ,   140. ,     0. ]])

In [6]:
# Load the GRIPPER experiences; the file has no header row, so columns get integer labels.
data_pd = pd.read_csv(
    path + 'GRIPPER.DATA',
    header=None,
    sep=',',
    engine='python',
)
data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,GRIPPER-TRIALT176,gripper-UP[DOWN]--cAHEAD,693.3,12.4,5.3,1466.0,5201.0,204.0,203.0,206.0,...,6.0,20.0,3621.9,13.0,23.0,5.0,4.0,418.8,-97.0,125.0
1,GRIPPER-TRIALT176,gripper-UP[DOWN]--cAHEAD,693.4,12.4,5.3,1466.0,5201.0,196.0,212.0,206.0,...,6.0,19.0,3535.7,13.0,24.0,5.0,5.0,421.4,-97.0,124.0
2,GRIPPER-TRIALT176,gripper-UP[DOWN]--cAHEAD,693.5,12.4,5.3,1466.0,5201.0,196.0,212.0,206.0,...,6.0,20.0,3452.9,13.0,24.0,6.0,3.0,421.4,-97.0,125.0
3,GRIPPER-TRIALT176,gripper-UP[DOWN]--cAHEAD,693.6,12.4,5.3,1466.0,5201.0,196.0,196.0,206.0,...,6.0,21.0,3373.1,12.0,24.0,5.0,4.0,418.8,-97.0,125.0
4,GRIPPER-TRIALT176,gripper-UP[DOWN]--cAHEAD,693.7,12.4,5.3,1466.0,5201.0,203.0,196.0,206.0,...,6.0,20.0,3452.9,13.0,24.0,5.0,4.0,418.8,-97.0,125.0


In [7]:
# Column 0 holds the experience name: one array is built per distinct name.
exp_names = np.unique(data_pd[0])

# Columns 0, 1 and 2 are removed from every experience before conversion to numpy.
data_pioneers1 = [
    data_pd.loc[data_pd[0] == exp_name].drop(columns=[0, 1, 2]).to_numpy()
    for exp_name in exp_names
]

# label 1: GRIPPER experience
labels_GRIP = np.ones(len(data_pioneers1))
# MOVE entries come first, followed by the GRIPPER entries, in both labels and data.
labels = np.concatenate([labels_MOVE, labels_GRIP])
data_pioneers = [*data_pioneers0, *data_pioneers1]
data_pioneers1[0]

array([[   12.1,     0. ,  1477. , ..., 10000. ,   140. ,     0. ],
       [   12.1,     0. ,  1477. , ..., 10000. ,   140. ,     0. ],
       [   12.1,     0. ,  1477. , ..., 10000. ,   140. ,     0. ],
       ...,
       [   12.4,     0. ,  1477. , ..., 10000. ,   140. ,     0. ],
       [   12.4,     0. ,  1477. , ..., 10000. ,   140. ,     0. ],
       [   12.4,     0. ,  1477. , ..., 10000. ,   140. ,     0. ]])

In [8]:
# Load the TURN experiences; the file has no header row, so columns get integer labels.
data_pd = pd.read_csv(
    path + 'TURN.DATA',
    header=None,
    sep=',',
    engine='python',
)
data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,TURN-TRIALT168,o-3.0s-turn100-cRIGHT,1620.9,11.8,234.1,1685.0,5201.0,5201.0,2104.0,5201.0,...,7.0,0.0,153.2,-10.0,238.0,8.0,2.0,1165.7,-70.0,66.0
1,TURN-TRIALT168,o-3.0s-turn100-cRIGHT,1621.0,11.8,234.1,1685.0,5201.0,5201.0,2104.0,5201.0,...,0.0,0.0,10000.0,140.0,0.0,11.0,8.0,1165.7,-74.0,63.0
2,TURN-TRIALT168,o-3.0s-turn100-cRIGHT,1621.1,11.8,234.1,1685.0,5201.0,5201.0,2104.0,5201.0,...,0.0,0.0,10000.0,140.0,0.0,14.0,7.0,1409.7,-63.0,56.0
3,TURN-TRIALT168,o-3.0s-turn100-cRIGHT,1621.2,11.8,234.1,1685.0,5201.0,5201.0,2104.0,5201.0,...,0.0,0.0,10000.0,140.0,0.0,15.0,7.0,1302.9,-62.0,59.0
4,TURN-TRIALT168,o-3.0s-turn100-cRIGHT,1621.3,11.8,234.1,1685.0,5201.0,5201.0,2104.0,5201.0,...,0.0,0.0,10000.0,140.0,0.0,22.0,6.0,1390.9,-66.0,57.0


**numpy data:** data_pioneers

**clusters:** labels

In [9]:
# Column 0 holds the experience name: one array is built per distinct name.
exp_names = np.unique(data_pd[0])

# Columns 0, 1 and 2 are removed from every experience before conversion to numpy.
data_pioneers2 = [
    data_pd.loc[data_pd[0] == exp_name].drop(columns=[0, 1, 2]).to_numpy()
    for exp_name in exp_names
]

# label 2: TURN experience
labels_TURN = np.full(len(data_pioneers2), 2.0)

# Final labels and data, in MOVE, GRIPPER, TURN order.
# Both are rebuilt from the three per-file pieces instead of being appended to
# the running `labels` / `data_pioneers`, so re-running this cell does not
# add the TURN experiences a second time.
labels = np.concatenate((labels_MOVE, labels_GRIP, labels_TURN))
data_pioneers = data_pioneers0 + data_pioneers1 + data_pioneers2
data_pioneers2[0]

array([[   11.7,   357.9,  2031. , ..., 10000. ,   140. ,     0. ],
       [   12. ,   357.9,  2031. , ..., 10000. ,   140. ,     0. ],
       [   12. ,   357.9,  2024. , ..., 10000. ,   140. ,     0. ],
       ...,
       [   11.7,   192.3,   908. , ..., 10000. ,   140. ,     0. ],
       [   11.7,   195.8,   908. , ...,  4711.5,    30. ,    28. ],
       [   11.7,   199.3,   905. , ..., 10000. ,   140. ,     0. ]])

## Activity recognition


Dataset summary:\
#Activities: 33\
#Sensors: 9\
#Subjects: 17\
#Scenarios: 3 

The first two columns correspond to the timestamp, while the last column corresponds to the
activity label.

ACTIVITY SET:\
A1: Walking\
A2: Jogging\
A3: Running\
A4: Jump up\
A5: Jump front & back\
A6: Jump sideways\
A7: Jump leg/arms open/closed\
A8: Jump rope\
A9: Trunk twist (arms outstretched)\
A10: Trunk twist (elbows bent)\
A11: Waist bends forward\
A12: Waist rotation\
A13: Waist bends (reach foot with opposite hand)\
A14: Reach heels backwards\
A15: Lateral bend (10_ to the left + 10_ to the right)\
A16: Lateral bend with arm up (10_ to the left + 10_ to the right)\
A17: Repetitive forward stretching\
A18: Upper trunk and lower body opposite twist\
A19: Lateral elevation of arms\
A20: Frontal elevation of arms\
A21: Frontal hand claps\
A22: Frontal crossing of arms\
A23: Shoulders high-amplitude rotation\
A24: Shoulders low-amplitude rotation\
A25: Arms inner rotation\
A26: Knees (alternating) to the breast\
A27: Heels (alternating) to the backside\
A28: Knees bending (crouching)\
A29: Knees (alternating) bending forward\
A30: Rotation on the knees\
A31: Rowing\
A32: Elliptical bike\
A33: Cycling 

The rows of the log file correspond to consecutive samples of the measurements, sampled at 50 Hz. Each log file contains 120 columns.

The columns in between the 3rd and 119th column correspond to the sensor
measurements. There are 9 sensors in total with 13 modalities each.

In [145]:
# Directory of the activity-recognition logs, and the names of the entries found in it.
path = 'datasets/activity_recognition/realistic_sensor_displacement/'
files = os.listdir(path)

**numpy data:** data_activity

**clusters:** labels

In [149]:
# To be decided whether we use this dataset: the files are very large, so we could
# perhaps load only part of them.
# The loader below is disabled by wrapping it in a string literal, so this cell
# only evaluates (and displays) that string; nothing is read from disk.
# NOTE(review): the description above says the first two columns are timestamps,
# but the disabled code drops only column 0 (and the label column 119) - confirm
# before enabling it.
"""
labels = list()
data_activity = list()
for f in files:
    if '.log' in f:
        data_pd = pd.read_csv(path+f, sep='\t', engine='python', header=None)
        labels.append(data_pd[119][0])
        data_activity.append(data_pd.drop(columns=[0,119]).to_numpy())
        """


"\nlabels = list()\ndata_activity = list()\nfor f in files:\n    if '.log' in f:\n        data_pd = pd.read_csv(path+f, sep='\t', engine='python', header=None)\n        labels.append(data_pd[119][0])\n        data_activity.append(data_pd.drop(columns=[0,119]).to_numpy())\n        "

## Gas sensors

100 recordings of a sensor array under different conditions in a home setting: background, wine and banana presentations. The array includes 8 MOX gas sensors, and humidity and temperature sensors.

This dataset contains a set of time series from three different conditions: wine, banana and background activity. There are 36 inductions with wine, 33 with banana and 31 recordings of background activity. One possible application is to discriminate among background, wine and banana.

For each induction, we include one hour of background activity prior to and after the stimulus presentation. Time series were recorded at one sample per second, with minor variations at some data points due to issues in the wireless communication.

In [163]:
# Directory containing the gas-sensor files (HT_Sensor_metadata.dat, HT_Sensor_dataset.dat).
path = 'datasets/wine_banana/HT_Sensor_UCIsubmission/'

In [164]:
# Read the metadata table: the first line of the file is skipped and only
# columns 0 and 2 are loaded, named 'id' and 'class'.
metadata_pd = pd.read_csv(
    path + 'HT_Sensor_metadata.dat',
    sep='\t',
    engine='python',
    skiprows=[0],
    usecols=[0, 2],
    names=['id', 'class'],
)
metadata_pd.head()

Unnamed: 0,id,class
0,0,banana
1,1,wine
2,2,wine
3,3,banana
4,4,wine


**numpy data:** data_gas

**clusters:** labels

In [165]:
name = metadata_pd['class']

# Distinct class names in sorted order, each mapped to its position in that order.
name_unique = list(np.unique(name))
dict_name = {class_name: idx for idx, class_name in enumerate(name_unique)}

# One integer label per metadata row, in metadata order.
labels = np.array([dict_name.get(class_name) for class_name in name])

In [166]:
# Read the sensor recordings: fields are separated by two spaces, the first
# line of the file is skipped and columns get integer labels.
data_pd = pd.read_csv(
    path + 'HT_Sensor_dataset.dat',
    sep='  ',
    header=None,
    skiprows=[0],
    engine='python',
)

data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,-0.99975,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528
1,0,-0.999472,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299
2,0,-0.999194,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093
3,0,-0.998916,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905
4,0,-0.998627,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736


In [167]:
ids = metadata_pd['id']

# One array per metadata id, in metadata order: keep the rows whose column 0
# equals the id and whose column 1 is non-negative, then drop column 0.
data_gas = [
    data_pd.loc[(data_pd[0] == sample_id) & (data_pd[1] >= 0)]
    .drop(columns=[0])
    .to_numpy()
    for sample_id in ids
]

## Sign language

Samples from a single signer (a native Auslan signer) were collected over a period of nine weeks. In total, 27 samples per sign, and a total of 2565 signs were collected. The average length of each sign was approximately 57 frames. 

The file consists of 9 subdirectories tctodd1-9. Each directory consists of 3 samples of each sign, captured on a different day. In total there are 95 different signs, with 27 samples per sign. Signs were provided by a native signer volunteer.

Each file consists of a sequence of lines. Each line consists of 22 whitespace-separated numbers representing the 22 channels of information. The list of channels can be found in the domain description file. It also lists the classes.

The following data were recorded for each hand:

x position expressed relative to a zero point set slightly below the chin. Expressed in meters.\
y position expressed relative to a zero point set slightly below the chin. Expressed in meters.\
z position expressed relative to a zero point set slightly below the chin. Expressed in meters.\
roll expressed as a value between -0.5 and 0.5 with 0 being palm down. Positive means the palm is rolled clockwise from the perspective of the signer. To get degrees, multiply by 180.\
pitch expressed as a value between -0.5 and 0.5 with 0 being palm flat (horizontal). Positive means the palm is pointing up. To get degrees, multiply by 180.\
yaw expressed as a value between -1.0 and 1.0 with 0 being palm straight ahead from the perspective of the signer. Positive means clockwise from the perspective above the signer. To get degrees, multiply by 180.\
Thumb bend measure between 0 and 1. 0 means totally flat, 1 means totally bent. However, the finger bend measurements are not very exact.\
Forefinger bend measure between 0 and 1. 0 means totally flat, 1 means totally bent. However, the finger bend measurements are not very exact.\
Middle finger bend measure between 0 and 1. 0 means totally flat, 1 means totally bent. However, the finger bend measurements are not very exact.\
Ring finger bend measure between 0 and 1. 0 means totally flat, 1 means totally bent. However, the finger bend measurements are not very exact.\
Little finger bend measure between 0 and 1. 0 means totally flat, 1 means totally bent. However, the finger bend measurements are not very exact.





In [171]:
path = 'datasets/sign_language/tctodd/'

# List of lists: for every entry of `path` (taken in sorted order), the sorted
# paths of all files it contains.
directories = np.sort(os.listdir(path))
files = [
    [path + direc + '/' + file_name for file_name in np.sort(os.listdir(path + direc))]
    for direc in directories
]

In [172]:
nb_signs = 95
sample_per_sign_per_file = 3

# List of lists: for every sign, the paths of its samples gathered from each
# directory in turn (95 signs, 27 samples per sign with 9 directories).
# Assumes each directory's sorted file list stores the samples of a sign next
# to each other, so sign i occupies positions i*k ... i*k + k-1 with
# k = sample_per_sign_per_file. The offset now uses that constant instead of a
# literal 3, so the two cannot drift apart.
signs = [
    [
        directory_files[sign_idx * sample_per_sign_per_file + offset]
        for directory_files in files
        for offset in range(sample_per_sign_per_file)
    ]
    for sign_idx in range(nb_signs)
]

In [158]:
# Display the sample paths of the first sign; `signs` holds one such list per sign.
signs[0]

['datasets/sign_language/tctodd/tctodd1/God-1.tsd',
 'datasets/sign_language/tctodd/tctodd1/God-2.tsd',
 'datasets/sign_language/tctodd/tctodd1/God-3.tsd',
 'datasets/sign_language/tctodd/tctodd2/God-1.tsd',
 'datasets/sign_language/tctodd/tctodd2/God-2.tsd',
 'datasets/sign_language/tctodd/tctodd2/God-3.tsd',
 'datasets/sign_language/tctodd/tctodd3/God-1.tsd',
 'datasets/sign_language/tctodd/tctodd3/God-2.tsd',
 'datasets/sign_language/tctodd/tctodd3/God-3.tsd',
 'datasets/sign_language/tctodd/tctodd4/God-1.tsd',
 'datasets/sign_language/tctodd/tctodd4/God-2.tsd',
 'datasets/sign_language/tctodd/tctodd4/God-3.tsd',
 'datasets/sign_language/tctodd/tctodd5/God-1.tsd',
 'datasets/sign_language/tctodd/tctodd5/God-2.tsd',
 'datasets/sign_language/tctodd/tctodd5/God-3.tsd',
 'datasets/sign_language/tctodd/tctodd6/God-1.tsd',
 'datasets/sign_language/tctodd/tctodd6/God-2.tsd',
 'datasets/sign_language/tctodd/tctodd6/God-3.tsd',
 'datasets/sign_language/tctodd/tctodd7/God-1.tsd',
 'datasets/s

In [173]:
# Preview the first sample of the first sign; the file is tab-separated with no header row.
data_pd = pd.read_csv(
    signs[0][0],
    header=None,
    sep='\t',
    engine='python',
)
data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.064909,0.034318,-0.043964,0.626383,0.499976,0.506653,0.648993,1.0,0.94951,0.965124,...,0.073279,-0.024444,0.233232,0.560134,0.606738,0.685484,0.758431,1.0,0.754117,1.0
1,-0.033878,0.034764,-0.032445,0.647282,0.486303,0.480203,0.915143,1.0,0.915993,1.0,...,0.065376,-0.009397,0.194742,0.585891,0.63296,1.0,0.91647,1.0,0.97451,1.0
2,0.015014,0.030924,-0.012665,0.646525,0.478735,0.466018,0.993164,1.0,0.911142,1.0,...,0.040059,-0.006763,0.18384,0.576174,0.662819,1.0,0.999608,1.0,1.0,1.0
3,0.019613,0.030433,-0.000207,0.643083,0.48413,0.464675,1.0,1.0,0.940249,1.0,...,0.035996,-0.004575,0.192532,0.571828,0.666005,1.0,1.0,1.0,1.0,1.0
4,0.019479,0.057759,0.007249,0.646513,0.512036,0.459853,1.0,1.0,0.960536,1.0,...,0.099354,-0.020828,0.204154,0.586355,0.656374,1.0,0.961176,1.0,1.0,1.0


**numpy data:** data_signs

**clusters:** labels

In [174]:
# Flat list with one array per sample, ordered sign by sign.
# The first 27 paths of every sign are read (27 samples per sign).
data_signs = [
    pd.read_csv(sign_files[sample_idx], sep='\t', engine='python', header=None).to_numpy()
    for sign_files in signs
    for sample_idx in range(27)
]

In [161]:
# One name per sample, in the same sign-by-sign order as the sample arrays.
# The name is the file name of the sign's first sample without its last six
# characters (e.g. 'God-1.tsd' -> 'God'), repeated 27 times per sign.
signs_names = [
    signs[sign_idx][0].split('/')[-1][:-6]
    for sign_idx in range(nb_signs)
    for _ in range(27)
]

In [162]:
# Distinct sign names in sorted order, each mapped to its position in that order.
unique_names = np.unique(signs_names)

dict_name = {sign_name: idx for idx, sign_name in enumerate(unique_names)}

# One integer label per sample, aligned with `signs_names`.
labels = np.array([dict_name.get(sign_name) for sign_name in signs_names])