In [1]:
# import library for various processes with the OS
import os

## Load configuration

In [2]:
# import library for yaml handling
import yaml

In [3]:
# Resolve config.yml relative to the directory the notebook is launched from.
config_path = os.path.join(os.getcwd(), "config.yml")

# Read the file as UTF-8 explicitly: without an encoding, open() falls back to the
# platform's locale encoding (e.g. cp1252 on Windows), which can garble non-ASCII values.
# NOTE(review): FullLoader can construct more than plain data; if config.yml only holds
# scalars, lists and mappings, yaml.safe_load(file) would be the safer choice.
with open(config_path, encoding="utf-8") as file:
    config = yaml.load(file, Loader=yaml.FullLoader)

In [4]:
# import library for handling the MongoDB client
import pymongo
# import library for retrieving datetime
from datetime import datetime
from natsort import natsorted

### Create the database

In [5]:
# Open a client for the MongoDB deployment addressed by the "client" config entry
client = pymongo.MongoClient(host=config["client"])

In [6]:
# Database handle, named by the "db" config entry
db = client.get_database(config["db"])

### Instantiate the collection

In [7]:
# Collection handle, named by the "col" config entry
col = db.get_collection(config["col"])

## Create the data collection

Upload the gathered data to the MongoDB collection. The data directory structure should be as follows:

```
.
└── data/
    ├── class_A/
    │   ├── data_A_01.csv
    │   ├── data_A_02.csv
    │   └── ..
    ├── class_B/
    │   ├── data_B_01.csv
    │   ├── data_B_02.csv
    │   └── ..
    └── class ...
```

In [8]:
# import library for handling the csv data and transformations
import pandas as pd

### Format data folder to the above schema/convention

In [9]:
from utils import format_wrist_data

In [10]:
# Directory configured under "data_path"; it is the argument passed to format_wrist_data
data_path = config["data_path"]
data_path

'C:\\Users\\gkont\\Desktop\\AIoT-2023-Team-05\\ASL'

In [12]:
# Run the project helper from utils on the configured data directory.
# NOTE(review): judging by the section heading and the printed summary, it arranges the
# files per class and merges per-axis files into sessions — confirm against utils.
format_wrist_data(data_path)

Class 'CLASS_A' contains 100 files
50 total sessions after merging the axes
Class 'CLASS_B' contains 100 files
50 total sessions after merging the axes
Class 'CLASS_C' contains 100 files
50 total sessions after merging the axes
Class 'CLASS_D' contains 100 files
51 total sessions after merging the axes
Class 'CLASS_E' contains 100 files
50 total sessions after merging the axes
Class 'CLASS_F' contains 102 files
51 total sessions after merging the axes


### Load formatted data

In [13]:
# The "data" folder is looked up under the current working directory
coll_path = os.path.join(os.getcwd(), "data")
coll_path

'C:\\Users\\gkont\\Desktop\\AIoT-2023-Team-05\\data'

In [14]:
# Keep only the sub-directories of the data folder; each one is used as a class label
classes_folders_list = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(coll_path, entry)),
        os.listdir(coll_path),
    )
)
print(classes_folders_list)

['CLASS_A', 'CLASS_B', 'CLASS_C', 'CLASS_D', 'CLASS_E', 'CLASS_F']


### Empty database

In [15]:
# Wipe the collection: an empty filter matches every document
result = col.delete_many(filter={})

# Report how many documents were removed
print(f"Deleted {result.deleted_count} documents.")

Deleted 31 documents.


Each document in the MongoDB database has the following schema:

```json
{
  "data": {
    "acc_x": ["array", "of", "values"],
    "acc_y": ["array", "of", "values"],
    "acc_z": ["array", "of", "values"]
  },
  "label": "The label of the instance",
  "datetime": "MongoDB datetime object"
}
```

Accordingly, if you are using the gyroscope alone, or both the accelerometer and the gyroscope (as in our case), the sensor keys must be named and ordered as follows:

* for gyroscope: `gyr_x`, `gyr_y`, `gyr_z` for the three axes
* for accelerometer and gyroscope: `acc_x`, `acc_y`, `acc_z`, `gyr_x`, `gyr_y`, `gyr_z` for the six axes

**Note: Every document must follow the schema above; the later stages (data engineering, plotting, etc.) depend on it.**

## Upload the data to MongoDB

In [16]:
# CSV column names, in the order configured under "order"
cols = config["order"]
cols

['x-axis (g)',
 'y-axis (g)',
 'z-axis (g)',
 'x-axis (deg/s)',
 'y-axis (deg/s)',
 'z-axis (deg/s)']

In [18]:
# Upload every CSV sample found under the class folders as one MongoDB document each.
# Local import: timezone is only needed for the UTC timestamp written below.
from datetime import timezone

# Keys written under "data", paired positionally with the CSV columns listed in `cols`.
sensor_keys = ["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]
if len(cols) != len(sensor_keys):
    raise ValueError(
        f"config['order'] must list {len(sensor_keys)} columns, got {len(cols)}"
    )

# for each folder data class
for cls in classes_folders_list:
    # define semi-complete path
    folder_path = os.path.join(coll_path, cls)

    # Validate and list all the files/samples of the particular class
    files_in_folder = natsorted(
        [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    )

    # Collect the documents of this class so they go out in a single round-trip
    class_documents = []

    # for each file/sample
    for file in files_in_folder:
        file_path = os.path.join(folder_path, file)
        df_sample = pd.read_csv(file_path)

        # Fail with the offending file name instead of a bare column KeyError
        missing_columns = [column for column in cols if column not in df_sample.columns]
        if missing_columns:
            raise KeyError(f"{file_path} is missing columns: {missing_columns}")

        document = {
            # `column` (not `col`) so the collection handle `col` is never shadowed
            "data": {
                key: list(df_sample[column]) for key, column in zip(sensor_keys, cols)
            },
            "label": cls,
            # Timezone-aware UTC: a naive local datetime would be stored as if it were UTC
            "datetime": datetime.now(timezone.utc),
        }
        class_documents.append(document)

    # insert_many rejects an empty list, so skip class folders without files
    if class_documents:
        col.insert_many(class_documents)

In [20]:
# Sanity check: fetch one CLASS_A document and show its id
col.find_one(filter={"label": "CLASS_A"})["_id"]

ObjectId('647088792228156f65027a14')