Improve dnn (#74)
* Update pose_estimator and face_descriptor_extractor

* Fix heatmap_sigma type

* Improve face alignment, fix face_descriptor_extractor training.

* Improve audio analyzer (ref #70).

* Improve video analyzer (ref #68)

* Update HBBA lite.

* Update video analysis and audio analysis loggers.

* Add HBBA strategies log to the perception logger.

* Rename blur score to sharpness score.

* Add distillation_loss_alpha to train_pose_estimator.py (see the sketch after this list)

* Add distillation_loss_alpha to output path.

* Fix trainers for criterion next_epoch. Fix export_yolo_v4. Add GPU support to align_faces.py and test_pose_estimator_with_yolo_v4.py.

* Fix OpenFaceBackbone

* Add yolov7 module.

* Reformat code

* Add yolov7 module.

* Add missing eval in test_converted_yolo.

* Fix YOLO V7. Add the choice of the yolo model in video_analyzer.

* Add AudioSet support. Add MS-Celeb support.

* Add margin to the output path.

* Fix dnns

* Set the ODAS frame in the middle of the mics.

* Use the face_sharpness_score in person_identification. Fix test launch files.

* Update video_analyzer dnns export.

* Add a model choice for FaceDescriptorExtractor.

* Fix dnn_utils tests.

* Fix the realsense in capture_face.launch

* Add PSLA pooling. Add backends.

* Add mixup and ViT to backbone training.

* Add missing changes for the backbone training.

* Add passt_s_n models to the audio trainings.

* Use the old networks for the face description.

* Refactoring for descriptor_yolo_v7 and yolov7 trained on Objects365.

* Fix test_face_descriptor_extractor

* Fix descriptor Yolov7

* Fix descriptor yolov7

* Fix YOLOV7 trained on Objects365.

* Fix video analyzer and export.

* Fix issues.

* Fix dnn_utils tests.

* Fix dnn_utils tests.
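The `distillation_loss_alpha` bullets above add a blending knob to the pose-estimator training. As a rough sketch of what such a parameter conventionally controls — the convex mix below is an assumption, not code from this repository:

```python
import torch.nn.functional as F

def pose_distillation_loss(student_heatmaps, teacher_heatmaps, target_heatmaps, alpha):
    # Hypothetical convex blend: alpha weighs imitation of the teacher
    # against fitting the ground-truth heatmaps.
    task_loss = F.mse_loss(student_heatmaps, target_heatmaps)
    distillation_loss = F.mse_loss(student_heatmaps, teacher_heatmaps)
    return alpha * distillation_loss + (1.0 - alpha) * task_loss
```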
mamaheux committed Nov 15, 2023
1 parent 5571627 commit ae7d1f9
Showing 148 changed files with 5,766 additions and 1,477 deletions.
Binary file modified documentation/tf/odas.png
10 changes: 5 additions & 5 deletions documentation/tf/tf.drawio
@@ -1,6 +1,6 @@
- <mxfile host="app.diagrams.net" modified="2023-05-18T23:44:43.061Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0" etag="lR1TwZr0HA5rjEsmMCSQ" version="21.3.2" type="device">
+ <mxfile host="app.diagrams.net" modified="2023-09-07T15:29:35.502Z" agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0" etag="UDr_84b8QlR3HeeEyIeO" version="21.6.8" type="device">
<diagram name="Page-1" id="3laUdoB3Sh00mUV5PLZu">
<mxGraphModel dx="1562" dy="854" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<mxGraphModel dx="1185" dy="675" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
@@ -264,12 +264,12 @@
</mxCell>
<mxCell id="4EcZqvmrcj0V483apFQy-74" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#dae8fc;strokeColor=#0000FF;strokeWidth=3;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="1395.68" y="723" as="sourcePoint" />
<mxPoint x="1395.68" y="663" as="targetPoint" />
<mxPoint x="1391.68" y="936" as="sourcePoint" />
<mxPoint x="1391.68" y="876" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="4EcZqvmrcj0V483apFQy-75" value="" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fillColor=#000000;" parent="1" vertex="1">
<mxGeometry x="1391.02" y="718" width="10" height="10" as="geometry" />
<mxGeometry x="1387.02" y="931" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="4EcZqvmrcj0V483apFQy-76" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#f8cecc;strokeColor=#FF0000;strokeWidth=3;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
4 changes: 3 additions & 1 deletion ros/demos/control_panel/src/control_panel_node.cpp
@@ -7,6 +7,7 @@
#include <hbba_lite/core/DesireSet.h>
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>
#include <hbba_lite/core/HbbaLite.h>

#include <t_top_hbba_lite/Strategies.h>
@@ -53,7 +54,8 @@ int startNode(int argc, char* argv[])
}

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

QApplication application(argc, argv);
ControlPanel controlPanel(nodeHandle, desireSet, camera2dWideEnabled);
4 changes: 3 additions & 1 deletion ros/demos/home_logger/src/home_logger_node.cpp
@@ -23,6 +23,7 @@
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
#include <hbba_lite/core/HbbaLite.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>

#include <t_top_hbba_lite/Strategies.h>

@@ -93,7 +94,8 @@ void startNode(
strategies.emplace_back(createPlaySoundStrategy(filterPool, desireSet, nodeHandle));

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

VolumeManager volumeManager(nodeHandle);
SQLite::Database database(databasePath, SQLite::OPEN_READWRITE | SQLite::OPEN_CREATE);
4 changes: 3 additions & 1 deletion ros/demos/smart_speaker/src/smart_speaker_rss_node.cpp
@@ -21,6 +21,7 @@
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
#include <hbba_lite/core/HbbaLite.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>

#include <t_top_hbba_lite/Strategies.h>

@@ -59,7 +60,8 @@ void startNode(
strategies.emplace_back(createPlaySoundStrategy(filterPool, desireSet, nodeHandle));

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

StateManager stateManager;
type_index idleStateType(typeid(RssIdleState));
4 changes: 3 additions & 1 deletion ros/demos/smart_speaker/src/smart_speaker_smart_node.cpp
@@ -19,6 +19,7 @@
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
#include <hbba_lite/core/HbbaLite.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>

#include <t_top_hbba_lite/Strategies.h>

@@ -61,7 +62,8 @@ void startNode(
strategies.emplace_back(createPlaySoundStrategy(filterPool, desireSet, nodeHandle));

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

StateManager stateManager;
type_index askOtherTaskStateType(typeid(SmartAskOtherTaskState));
1 change: 1 addition & 0 deletions ros/perceptions/audio_analyzer/msg/AudioAnalysis.msg
@@ -1,5 +1,6 @@
std_msgs/Header header

+ int64 tracking_id
audio_utils/AudioFrame audio_frame

string[] audio_classes
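With `tracking_id` in the message, a consumer can group analyses by sound source. A minimal sketch — the topic name `audio_analysis` and the grouping are illustrative assumptions:

```python
from collections import defaultdict

import rospy
from audio_analyzer.msg import AudioAnalysis

audio_classes_by_tracking_id = defaultdict(list)

def audio_analysis_cb(msg):
    # tracking_id ties each analysis back to the tracked sound source it came from.
    audio_classes_by_tracking_id[msg.tracking_id].append(list(msg.audio_classes))

rospy.init_node('audio_analysis_listener')
rospy.Subscriber('audio_analysis', AudioAnalysis, audio_analysis_cb, queue_size=1)
rospy.spin()
```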
15 changes: 9 additions & 6 deletions ros/perceptions/audio_analyzer/scripts/audio_analyzer_node.py
@@ -86,7 +86,7 @@ def _audio_cb(self, msg):
self._audio_analysis_count += audio_frame.shape[0]

def _analyse(self):
- audio_buffer = self._get_audio_buffer()
+ audio_buffer, sst_id = self._get_audio_buffer_and_sst_id()
audio_descriptor_buffer = audio_buffer[-self._audio_descriptor_extractor.get_supported_duration():]
audio_descriptor, audio_class_probabilities = self._audio_descriptor_extractor(audio_descriptor_buffer)
audio_descriptor = audio_descriptor.tolist()
@@ -99,21 +99,22 @@ def _analyse(self):
voice_descriptor = []

audio_classes = self._get_audio_classes(audio_class_probabilities)
- self._publish_audio_analysis(audio_buffer, audio_classes, audio_descriptor, voice_descriptor)
+ self._publish_audio_analysis(sst_id, audio_buffer, audio_classes, audio_descriptor, voice_descriptor)

- def _get_audio_buffer(self):
+ def _get_audio_buffer_and_sst_id(self):
with self._audio_frames_lock:
+ sst_id = self._sst_id
audio_buffer = torch.cat(self._audio_frames, dim=0)
if audio_buffer.size()[0] < self._audio_buffer_duration:
- return torch.cat([torch.zeros(self._audio_buffer_duration - audio_buffer.size()[0]), audio_buffer], dim=0)
+ return torch.cat([torch.zeros(self._audio_buffer_duration - audio_buffer.size()[0]), audio_buffer], dim=0), sst_id
else:
- return audio_buffer[-self._audio_buffer_duration:]
+ return audio_buffer[-self._audio_buffer_duration:], sst_id

def _get_audio_classes(self, audio_class_probabilities):
return [self._class_names[i] for i in range(len(self._class_names))
if audio_class_probabilities[i].item() >= self._class_probability_threshold]

- def _publish_audio_analysis(self, audio_buffer, audio_classes, audio_descriptor, voice_descriptor):
+ def _publish_audio_analysis(self, sst_id, audio_buffer, audio_classes, audio_descriptor, voice_descriptor):
with self._audio_direction_lock:
frame_id, direction_x, direction_y, direction_z = self._audio_direction

@@ -122,6 +123,8 @@ def _publish_audio_analysis(self, audio_buffer, audio_classes, audio_descriptor,
msg.header.stamp = rospy.Time.now()
msg.header.frame_id = frame_id

+ msg.tracking_id = sst_id

msg.audio_frame.format = 'float'
msg.audio_frame.channel_count = SUPPORTED_CHANNEL_COUNT
msg.audio_frame.sampling_frequency = self._supported_sampling_frequency
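The renamed `_get_audio_buffer_and_sst_id` keeps the existing pad-or-trim behavior and additionally returns the SST id read under the same lock, so the id and the buffer stay consistent. Isolated from the node, the buffer handling amounts to this sketch:

```python
import torch

def pad_or_trim_audio_buffer(audio_buffer, buffer_duration):
    # Left-pad with zeros when the buffer is too short,
    # otherwise keep only the most recent buffer_duration samples.
    if audio_buffer.size(0) < buffer_duration:
        padding = torch.zeros(buffer_duration - audio_buffer.size(0))
        return torch.cat([padding, audio_buffer], dim=0)
    return audio_buffer[-buffer_duration:]
```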
2 changes: 2 additions & 0 deletions ros/perceptions/person_identification/README.md
@@ -13,6 +13,7 @@ roslaunch person_identification capture_face.launch name:=<person_name> neural_n
#### Parameters
- `name` (string): The person name.
- `mean_size` (int): How many descriptors to average.
+ - `face_sharpness_score_threshold` (double): The threshold above which a face is considered sharp enough.

#### Subscribed Topics
- `video_analysis` ([video_analyzer/VideoAnalysis](../video_analyzer/msg/VideoAnalysis.msg)): The video analysis containing the detected objects.
@@ -38,6 +39,7 @@ roslaunch person_identification capture_voice.launch name:=<person_name> neural_
This node performs person identification. The people must already be added to `people.json` with the previous nodes.

#### Parameters
+ - `face_sharpness_score_threshold` (double): The threshold above which a face is considered sharp enough.
- `face_descriptor_threshold` (double): The maximum distance between two face descriptors to be considered the same person.
- `voice_descriptor_threshold` (double): The maximum distance between two voice descriptors to be considered the same person.
- `face_voice_descriptor_threshold` (double): The maximum distance between two merged descriptors to be considered the same person.
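Each threshold above bounds a distance between a query descriptor and the reference descriptors stored in `people.json`. A sketch of the matching rule they imply — Euclidean distance is an assumption; the node's actual metric may differ:

```python
import numpy as np

def find_matching_person(descriptor, people, threshold):
    # people: iterable of (name, reference_descriptor) pairs loaded from people.json.
    best_name, best_distance = None, float('inf')
    for name, reference in people:
        distance = float(np.linalg.norm(np.asarray(descriptor) - np.asarray(reference)))
        if distance < best_distance:
            best_name, best_distance = name, distance
    return best_name if best_distance <= threshold else None
```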
26 changes: 18 additions & 8 deletions ros/perceptions/person_identification/launch/capture_face.launch
@@ -2,23 +2,32 @@
<arg name="name"/>
<arg name="neural_network_inference_type"/> <!-- Options: cpu, torch_gpu or trt_gpu -->

<include file="$(find realsense2_camera)/launch/rs_camera.launch">
<include file="$(find realsense2_camera)/launch/rs_aligned_depth.launch">
<arg name="fisheye_width" value="1280"/>
<arg name="fisheye_height" value="720"/>

<arg name="depth_width" value="1280"/>
<arg name="depth_height" value="720"/>

<arg name="infra_width" value="1280"/>
<arg name="infra_height" value="720"/>

<arg name="color_width" value="1280"/>
<arg name="color_height" value="720"/>
<arg name="align_depth" value="true"/>

<arg name="color_fps" value="30"/>
<arg name="infra_fps" value="30"/>
<arg name="depth_fps" value="30"/>

<arg name="filters" value="pointcloud"/>
<arg name="color_fps" value="15"/>
<arg name="infra_fps" value="15"/>
<arg name="depth_fps" value="15"/>

<arg name="enable_gyro" value="false"/>
<arg name="enable_accel" value="false"/>

<arg name="camera" value="camera_3d"/>
</include>

<node pkg="video_analyzer" type="video_analyzer_3d_node.py" name="video_analyzer_node">
<param name="use_descriptor_yolo_v4" value="false"/>
<param name="use_descriptor_yolo" value="false"/>
<param name="yolo_model" value="yolo_v7_tiny_coco"/>
<param name="confidence_threshold" value="0.70"/>
<param name="nms_threshold" value="0.5"/>
<param name="person_probability_threshold" value="0.75"/>
@@ -42,5 +51,6 @@
<node pkg="person_identification" type="capture_face_node.py" name="capture_face_node" output="screen">
<param name="name" value="$(arg name)"/>
<param name="mean_size" value="50"/>
<param name="face_sharpness_score_threshold" value="0.5"/>
</node>
</launch>
@@ -18,22 +18,23 @@ class CaptureFaceNode:
def __init__(self):
self._name = rospy.get_param('~name')
self._mean_size = rospy.get_param('~mean_size')
+ self._face_sharpness_score_threshold = rospy.get_param('~face_sharpness_score_threshold')

self._descriptors_lock = threading.Lock()
self._descriptors = []
self._video_analysis_sub = rospy.Subscriber('video_analysis', VideoAnalysis, self._video_analysis_cb, queue_size=1)

def _video_analysis_cb(self, msg):
- face_descriptor = None
+ face_object = None
for object in msg.objects:
- if len(object.face_descriptor) > 0 and face_descriptor is not None:
+ if len(object.face_descriptor) > 0 and face_object is not None:
rospy.logwarn('Only one face must be present in the image.')
elif len(object.face_descriptor) > 0:
- face_descriptor = object.face_descriptor
+ face_object = object

- if face_descriptor is not None:
+ if face_object is not None and face_object.face_sharpness_score >= self._face_sharpness_score_threshold:
with self._descriptors_lock:
- self._descriptors.append(face_descriptor)
+ self._descriptors.append(face_object.face_descriptor)

def run(self):
self.enable_video_analyzer()
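Only faces that pass the sharpness gate contribute descriptors, and the launch file's `mean_size` controls how many are averaged into the stored reference. A sketch of that reduction, assuming a plain arithmetic mean:

```python
import numpy as np

def compute_reference_descriptor(descriptors, mean_size):
    # Average the most recent mean_size descriptors into one reference descriptor.
    recent = np.asarray(descriptors[-mean_size:], dtype=np.float64)
    return recent.mean(axis=0)
```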
@@ -41,6 +41,7 @@ def __init__(self, descriptor, direction):

class PersonIdentificationNode:
def __init__(self):
+ self._face_sharpness_score_threshold = rospy.get_param('~face_sharpness_score_threshold')
self._face_descriptor_threshold = rospy.get_param('~face_descriptor_threshold')
self._voice_descriptor_threshold = rospy.get_param('~voice_descriptor_threshold')
self._face_voice_descriptor_threshold = rospy.get_param('~face_voice_descriptor_threshold')
Expand Down Expand Up @@ -83,7 +84,8 @@ def _video_analysis_cb(self, msg):
for object in msg.objects:
if len(object.face_descriptor) == 0 or len(object.person_pose_2d) == 0 or len(object.person_pose_3d) == 0 \
or len(object.person_pose_confidence) == 0 \
- or object.person_pose_confidence[PERSON_POSE_NOSE_INDEX] < self._nose_confidence_threshold:
+ or object.person_pose_confidence[PERSON_POSE_NOSE_INDEX] < self._nose_confidence_threshold \
+ or object.face_sharpness_score < self._face_sharpness_score_threshold:
continue

position_2d = object.person_pose_2d[PERSON_POSE_NOSE_INDEX]
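The added condition drops blurry faces before any matching is attempted. The commit renames "blur score" to "sharpness score", but the metric itself lives in the dnn utilities; a common stand-in — shown purely as an assumption — is the variance of the Laplacian:

```python
import cv2

def sharpness_score(face_image_rgb):
    # Hypothetical stand-in for face_sharpness_score: variance of the Laplacian.
    # Larger values indicate sharper images; the analyzer's real score
    # may be normalized differently.
    gray = cv2.cvtColor(face_image_rgb, cv2.COLOR_RGB2GRAY)
    return cv2.Laplacian(gray, cv2.CV_64F).var()
```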
8 changes: 6 additions & 2 deletions ros/perceptions/video_analyzer/README.md
@@ -7,7 +7,9 @@ This node detects objects. If the image contains a person, it estimates the pose
This node uses RGB images, so the 3D positions are not set.

#### Parameters
- - `use_descriptor_yolo_v4` (bool): Indicates to use the network extracting a object embedding or not.
+ - `use_descriptor_yolo` (bool): Indicates whether to use the network that extracts an object embedding.
+ - `yolo_model` (string): If `use_descriptor_yolo` is false, it indicates which model to use for YOLO (yolo_v4_coco, yolo_v4_tiny_coco, yolo_v7_coco, yolo_v7_tiny_coco or yolo_v7_objects365).
+ If `use_descriptor_yolo` is true, it indicates which model to use for Descriptor-YOLO (yolo_v4_tiny_coco, yolo_v7_coco or yolo_v7_objects365).
- `confidence_threshold` (double): The object confidence threshold.
- `nms_threshold` (double): The Non-Maximum Suppression threshold.
- `person_probability_threshold` (double): The person confidence threshold.
@@ -36,7 +38,9 @@ This node detects objects. If the image contains a person, it estimates the pose
This node uses RGB-D images, so the 3D positions are set.

#### Parameters
- - `use_descriptor_yolo_v4` (bool): Indicates to use the network extracting a object embedding or not.
+ - `use_descriptor_yolo` (bool): Indicates whether to use the network that extracts an object embedding.
+ - `yolo_model` (string): If `use_descriptor_yolo` is false, it indicates which model to use for YOLO (yolo_v4_coco, yolo_v4_tiny_coco, yolo_v7_coco, yolo_v7_tiny_coco or yolo_v7_objects365).
+ If `use_descriptor_yolo` is true, it indicates which model to use for Descriptor-YOLO (yolo_v4_tiny_coco, yolo_v7_coco or yolo_v7_objects365).
- `confidence_threshold` (double): The object confidence threshold.
- `nms_threshold` (double): The Non-Maximum Suppression threshold.
- `person_probability_threshold` (double): The person confidence threshold.
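A sketch of how a node might read and validate the two parameters documented above; the model lists come from the README, while the defaults and error handling are assumptions:

```python
import rospy

YOLO_MODELS = {'yolo_v4_coco', 'yolo_v4_tiny_coco', 'yolo_v7_coco',
               'yolo_v7_tiny_coco', 'yolo_v7_objects365'}
DESCRIPTOR_YOLO_MODELS = {'yolo_v4_tiny_coco', 'yolo_v7_coco', 'yolo_v7_objects365'}

def read_yolo_model_parameters():
    use_descriptor_yolo = rospy.get_param('~use_descriptor_yolo', False)
    yolo_model = rospy.get_param('~yolo_model', 'yolo_v7_tiny_coco')
    supported = DESCRIPTOR_YOLO_MODELS if use_descriptor_yolo else YOLO_MODELS
    if yolo_model not in supported:
        raise ValueError('Unsupported yolo_model for this configuration: ' + yolo_model)
    return use_descriptor_yolo, yolo_model
```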
3 changes: 3 additions & 0 deletions ros/perceptions/video_analyzer/msg/VideoAnalysisObject.msg
@@ -6,6 +6,7 @@ float32 height_2d # Normalized

string object_class
float32 object_confidence
+ float32 object_class_probability
float32[] object_descriptor
sensor_msgs/Image object_image

@@ -15,4 +16,6 @@ float32[] person_pose_confidence
sensor_msgs/Image person_pose_image

float32[] face_descriptor
+ int32 face_alignment_keypoint_count
+ float32 face_sharpness_score
sensor_msgs/Image face_image
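On the consumer side, the three new fields allow filtering for trustworthy faces before using the descriptor. A sketch — the keypoint and sharpness cutoffs are illustrative, not values from the repository:

```python
def select_reliable_faces(video_analysis_msg, min_keypoints=5, min_sharpness=0.5):
    # Keep objects that have a face descriptor, enough alignment keypoints
    # and a sufficiently sharp face crop.
    return [o for o in video_analysis_msg.objects
            if len(o.face_descriptor) > 0
            and o.face_alignment_keypoint_count >= min_keypoints
            and o.face_sharpness_score >= min_sharpness]
```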
@@ -52,6 +52,7 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width):
o.height_2d = object_analysis.height / image_height
o.object_class = object_analysis.object_class
o.object_confidence = object_analysis.object_confidence
+ o.object_class_probability = object_analysis.object_class_probability
if object_analysis.object_image is not None:
o.object_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.object_image, encoding='rgb8')
o.object_descriptor = object_analysis.object_descriptor
@@ -65,6 +66,8 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width):

if object_analysis.face_analysis is not None:
o.face_descriptor = object_analysis.face_analysis.descriptor
+ o.face_alignment_keypoint_count = object_analysis.face_analysis.alignment_keypoint_count
+ o.face_sharpness_score = object_analysis.face_analysis.sharpness_score
if object_analysis.face_analysis.face_image is not None:
o.face_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.face_analysis.face_image, encoding='rgb8')

@@ -73,6 +73,7 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width, de
o.height_2d = object_analysis.height / image_height
o.object_class = object_analysis.object_class
o.object_confidence = object_analysis.object_confidence
+ o.object_class_probability = object_analysis.object_class_probability
if object_analysis.object_image is not None:
o.object_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.object_image, encoding='rgb8')
o.object_descriptor = object_analysis.object_descriptor
@@ -89,6 +90,8 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width, de

if object_analysis.face_analysis is not None:
o.face_descriptor = object_analysis.face_analysis.descriptor
+ o.face_alignment_keypoint_count = object_analysis.face_analysis.alignment_keypoint_count
+ o.face_sharpness_score = object_analysis.face_analysis.sharpness_score
if object_analysis.face_analysis.face_image is not None:
o.face_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.face_analysis.face_image, encoding='rgb8')
