Improve dnn (#74)
* Update pose_estimator and face_descriptor_extractor

* Fix heatmap_sigma type

* Improve face alignment, fix face_descriptor_extractor training.

* Improve audio analyzer (ref #70).

* Improve video analyzer (ref #68)

* Update HBBA lite.

* Update video analysis and audio analysis loggers.

* Add HBBA strategies log to the perception logger.

* Rename blur score to sharpness score.

* Add distillation_loss_alpha to train_pose_estimator.py (see the sketch after this list)

* Add distillation_loss_alpha to output path.

* Fix trainers for criterion next_epoch. Fix export_yolo_v4. Add GPU support to align_faces.py and test_pose_estimator_with_yolo_v4.py.

* Fix OpenFaceBackbone

* Add yolov7 module.

* Reformat code

* Add yolov7 module.

* Add missing eval in test_converted_yolo.

* Fix YOLO V7. Add the choice of the yolo model in video_analyzer.

* Add AudioSet support. Add MS-Celeb support.

* Add margin to the output path.

* Fix dnns

* Set the ODAS frame in the middle of the mics.

* Use the face_sharpness_score in person_identification. Fix test launch files.

* Update video_analyzer dnns export.

* Add a model choice for FaceDescriptorExtractor.

* Fix dnn_utils tests.

* Fix the realsense in capture_face.launch

* Add PSLA pooling. Add backends.

* Add mixup and ViT to backbone training.

* Add missing changes for the backbone training.

* Add passt_s_n models to the audio trainings.

* Use the old networks for the face description.

* Refactoring for descriptor_yolo_v7 and yolov7 trained on Objects365.

* Fix test_face_descriptor_extractor

* Fix descriptor Yolov7

* Fix descriptor yolov7

* Fix YOLOV7 trained on Objects365.

* Fix video analyzer and export.

* Fix issues.

* Fix dnn_utils tests.

* Fix dnn_utils tests.
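The `distillation_loss_alpha` bullets above add a blending knob to the pose-estimator training. As a rough sketch of what such a parameter conventionally controls — the convex mix below is an assumption, not code from this repository:

```python
import torch.nn.functional as F

def pose_distillation_loss(student_heatmaps, teacher_heatmaps, target_heatmaps, alpha):
    # Hypothetical convex blend: alpha weighs imitation of the teacher
    # against fitting the ground-truth heatmaps.
    task_loss = F.mse_loss(student_heatmaps, target_heatmaps)
    distillation_loss = F.mse_loss(student_heatmaps, teacher_heatmaps)
    return alpha * distillation_loss + (1.0 - alpha) * task_loss
```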
mamaheux committed Nov 15, 2023
1 parent 5571627 commit ae7d1f9
Showing 148 changed files with 5,766 additions and 1,477 deletions.
Binary file modified documentation/tf/odas.png
10 changes: 5 additions & 5 deletions documentation/tf/tf.drawio
@@ -1,6 +1,6 @@
- <mxfile host="app.diagrams.net" modified="2023-05-18T23:44:43.061Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0" etag="lR1TwZr0HA5rjEsmMCSQ" version="21.3.2" type="device">
+ <mxfile host="app.diagrams.net" modified="2023-09-07T15:29:35.502Z" agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0" etag="UDr_84b8QlR3HeeEyIeO" version="21.6.8" type="device">
<diagram name="Page-1" id="3laUdoB3Sh00mUV5PLZu">
<mxGraphModel dx="1562" dy="854" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<mxGraphModel dx="1185" dy="675" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
@@ -264,12 +264,12 @@
</mxCell>
<mxCell id="4EcZqvmrcj0V483apFQy-74" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#dae8fc;strokeColor=#0000FF;strokeWidth=3;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="1395.68" y="723" as="sourcePoint" />
<mxPoint x="1395.68" y="663" as="targetPoint" />
<mxPoint x="1391.68" y="936" as="sourcePoint" />
<mxPoint x="1391.68" y="876" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="4EcZqvmrcj0V483apFQy-75" value="" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fillColor=#000000;" parent="1" vertex="1">
<mxGeometry x="1391.02" y="718" width="10" height="10" as="geometry" />
<mxGeometry x="1387.02" y="931" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="4EcZqvmrcj0V483apFQy-76" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#f8cecc;strokeColor=#FF0000;strokeWidth=3;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
4 changes: 3 additions & 1 deletion ros/demos/control_panel/src/control_panel_node.cpp
@@ -7,6 +7,7 @@
#include <hbba_lite/core/DesireSet.h>
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>
#include <hbba_lite/core/HbbaLite.h>

#include <t_top_hbba_lite/Strategies.h>
@@ -53,7 +54,8 @@ int startNode(int argc, char* argv[])
}

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

QApplication application(argc, argv);
ControlPanel controlPanel(nodeHandle, desireSet, camera2dWideEnabled);
4 changes: 3 additions & 1 deletion ros/demos/home_logger/src/home_logger_node.cpp
@@ -23,6 +23,7 @@
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
#include <hbba_lite/core/HbbaLite.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>

#include <t_top_hbba_lite/Strategies.h>

@@ -93,7 +94,8 @@ void startNode(
strategies.emplace_back(createPlaySoundStrategy(filterPool, desireSet, nodeHandle));

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

VolumeManager volumeManager(nodeHandle);
SQLite::Database database(databasePath, SQLite::OPEN_READWRITE | SQLite::OPEN_CREATE);
4 changes: 3 additions & 1 deletion ros/demos/smart_speaker/src/smart_speaker_rss_node.cpp
@@ -21,6 +21,7 @@
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
#include <hbba_lite/core/HbbaLite.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>

#include <t_top_hbba_lite/Strategies.h>

@@ -59,7 +60,8 @@ void startNode(
strategies.emplace_back(createPlaySoundStrategy(filterPool, desireSet, nodeHandle));

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

StateManager stateManager;
type_index idleStateType(typeid(RssIdleState));
4 changes: 3 additions & 1 deletion ros/demos/smart_speaker/src/smart_speaker_smart_node.cpp
@@ -19,6 +19,7 @@
#include <hbba_lite/core/RosFilterPool.h>
#include <hbba_lite/core/GecodeSolver.h>
#include <hbba_lite/core/HbbaLite.h>
+ #include <hbba_lite/core/RosStrategyStateLogger.h>

#include <t_top_hbba_lite/Strategies.h>

@@ -61,7 +62,8 @@ void startNode(
strategies.emplace_back(createPlaySoundStrategy(filterPool, desireSet, nodeHandle));

auto solver = make_unique<GecodeSolver>();
- HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver));
+ auto strategyStateLogger = make_unique<RosStrategyStateLogger>(nodeHandle);
+ HbbaLite hbba(desireSet, move(strategies), {{"motor", 1}, {"sound", 1}, {"led", 1}}, move(solver), move(strategyStateLogger));

StateManager stateManager;
type_index askOtherTaskStateType(typeid(SmartAskOtherTaskState));
1 change: 1 addition & 0 deletions ros/perceptions/audio_analyzer/msg/AudioAnalysis.msg
@@ -1,5 +1,6 @@
std_msgs/Header header

+ int64 tracking_id
audio_utils/AudioFrame audio_frame

string[] audio_classes
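With `tracking_id` in the message, a consumer can group analyses by sound source. A minimal sketch — the topic name `audio_analysis` and the grouping are illustrative assumptions:

```python
from collections import defaultdict

import rospy
from audio_analyzer.msg import AudioAnalysis

audio_classes_by_tracking_id = defaultdict(list)

def audio_analysis_cb(msg):
    # tracking_id ties each analysis back to the tracked sound source it came from.
    audio_classes_by_tracking_id[msg.tracking_id].append(list(msg.audio_classes))

rospy.init_node('audio_analysis_listener')
rospy.Subscriber('audio_analysis', AudioAnalysis, audio_analysis_cb, queue_size=1)
rospy.spin()
```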
15 changes: 9 additions & 6 deletions ros/perceptions/audio_analyzer/scripts/audio_analyzer_node.py
@@ -86,7 +86,7 @@ def _audio_cb(self, msg):
self._audio_analysis_count += audio_frame.shape[0]

def _analyse(self):
- audio_buffer = self._get_audio_buffer()
+ audio_buffer, sst_id = self._get_audio_buffer_and_sst_id()
audio_descriptor_buffer = audio_buffer[-self._audio_descriptor_extractor.get_supported_duration():]
audio_descriptor, audio_class_probabilities = self._audio_descriptor_extractor(audio_descriptor_buffer)
audio_descriptor = audio_descriptor.tolist()
@@ -99,21 +99,22 @@ def _analyse(self):
voice_descriptor = []

audio_classes = self._get_audio_classes(audio_class_probabilities)
- self._publish_audio_analysis(audio_buffer, audio_classes, audio_descriptor, voice_descriptor)
+ self._publish_audio_analysis(sst_id, audio_buffer, audio_classes, audio_descriptor, voice_descriptor)

- def _get_audio_buffer(self):
+ def _get_audio_buffer_and_sst_id(self):
with self._audio_frames_lock:
+ sst_id = self._sst_id
audio_buffer = torch.cat(self._audio_frames, dim=0)
if audio_buffer.size()[0] < self._audio_buffer_duration:
- return torch.cat([torch.zeros(self._audio_buffer_duration - audio_buffer.size()[0]), audio_buffer], dim=0)
+ return torch.cat([torch.zeros(self._audio_buffer_duration - audio_buffer.size()[0]), audio_buffer], dim=0), sst_id
else:
- return audio_buffer[-self._audio_buffer_duration:]
+ return audio_buffer[-self._audio_buffer_duration:], sst_id

def _get_audio_classes(self, audio_class_probabilities):
return [self._class_names[i] for i in range(len(self._class_names))
if audio_class_probabilities[i].item() >= self._class_probability_threshold]

- def _publish_audio_analysis(self, audio_buffer, audio_classes, audio_descriptor, voice_descriptor):
+ def _publish_audio_analysis(self, sst_id, audio_buffer, audio_classes, audio_descriptor, voice_descriptor):
with self._audio_direction_lock:
frame_id, direction_x, direction_y, direction_z = self._audio_direction

@@ -122,6 +123,8 @@ def _publish_audio_analysis(self, audio_buffer, audio_classes, audio_descriptor,
msg.header.stamp = rospy.Time.now()
msg.header.frame_id = frame_id

+ msg.tracking_id = sst_id

msg.audio_frame.format = 'float'
msg.audio_frame.channel_count = SUPPORTED_CHANNEL_COUNT
msg.audio_frame.sampling_frequency = self._supported_sampling_frequency
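The renamed `_get_audio_buffer_and_sst_id` keeps the existing pad-or-trim behavior and additionally returns the SST id read under the same lock, so the id and the buffer stay consistent. Isolated from the node, the buffer handling amounts to this sketch:

```python
import torch

def pad_or_trim_audio_buffer(audio_buffer, buffer_duration):
    # Left-pad with zeros when the buffer is too short,
    # otherwise keep only the most recent buffer_duration samples.
    if audio_buffer.size(0) < buffer_duration:
        padding = torch.zeros(buffer_duration - audio_buffer.size(0))
        return torch.cat([padding, audio_buffer], dim=0)
    return audio_buffer[-buffer_duration:]
```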
2 changes: 2 additions & 0 deletions ros/perceptions/person_identification/README.md
@@ -13,6 +13,7 @@ roslaunch person_identification capture_face.launch name:=<person_name> neural_n
#### Parameters
- `name` (string): The person name.
- `mean_size` (int): How many descriptors to average.
+ - `face_sharpness_score_threshold` (double): The threshold above which a face is considered sharp enough.

#### Subscribed Topics
- `video_analysis` ([video_analyzer/VideoAnalysis](../video_analyzer/msg/VideoAnalysis.msg)): The video analysis containing the detected objects.
@@ -38,6 +39,7 @@ roslaunch person_identification capture_voice.launch name:=<person_name> neural_
This node performs person identification. The people must already be added to `people.json` with the previous nodes.

#### Parameters
+ - `face_sharpness_score_threshold` (double): The threshold above which a face is considered sharp enough.
- `face_descriptor_threshold` (double): The maximum distance between two face descriptors to be considered the same person.
- `voice_descriptor_threshold` (double): The maximum distance between two voice descriptors to be considered the same person.
- `face_voice_descriptor_threshold` (double): The maximum distance between two merged descriptors to be considered the same person.
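Each threshold above bounds a distance between a query descriptor and the reference descriptors stored in `people.json`. A sketch of the matching rule they imply — Euclidean distance is an assumption; the node's actual metric may differ:

```python
import numpy as np

def find_matching_person(descriptor, people, threshold):
    # people: iterable of (name, reference_descriptor) pairs loaded from people.json.
    best_name, best_distance = None, float('inf')
    for name, reference in people:
        distance = float(np.linalg.norm(np.asarray(descriptor) - np.asarray(reference)))
        if distance < best_distance:
            best_name, best_distance = name, distance
    return best_name if best_distance <= threshold else None
```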
26 changes: 18 additions & 8 deletions ros/perceptions/person_identification/launch/capture_face.launch
@@ -2,23 +2,32 @@
<arg name="name"/>
<arg name="neural_network_inference_type"/> <!-- Options: cpu, torch_gpu or trt_gpu -->

<include file="$(find realsense2_camera)/launch/rs_camera.launch">
<include file="$(find realsense2_camera)/launch/rs_aligned_depth.launch">
<arg name="fisheye_width" value="1280"/>
<arg name="fisheye_height" value="720"/>

<arg name="depth_width" value="1280"/>
<arg name="depth_height" value="720"/>

<arg name="infra_width" value="1280"/>
<arg name="infra_height" value="720"/>

<arg name="color_width" value="1280"/>
<arg name="color_height" value="720"/>
<arg name="align_depth" value="true"/>

<arg name="color_fps" value="30"/>
<arg name="infra_fps" value="30"/>
<arg name="depth_fps" value="30"/>

<arg name="filters" value="pointcloud"/>
<arg name="color_fps" value="15"/>
<arg name="infra_fps" value="15"/>
<arg name="depth_fps" value="15"/>

<arg name="enable_gyro" value="false"/>
<arg name="enable_accel" value="false"/>

<arg name="camera" value="camera_3d"/>
</include>

<node pkg="video_analyzer" type="video_analyzer_3d_node.py" name="video_analyzer_node">
<param name="use_descriptor_yolo_v4" value="false"/>
<param name="use_descriptor_yolo" value="false"/>
<param name="yolo_model" value="yolo_v7_tiny_coco"/>
<param name="confidence_threshold" value="0.70"/>
<param name="nms_threshold" value="0.5"/>
<param name="person_probability_threshold" value="0.75"/>
@@ -42,5 +51,6 @@
<node pkg="person_identification" type="capture_face_node.py" name="capture_face_node" output="screen">
<param name="name" value="$(arg name)"/>
<param name="mean_size" value="50"/>
<param name="face_sharpness_score_threshold" value="0.5"/>
</node>
</launch>
@@ -18,22 +18,23 @@ class CaptureFaceNode:
def __init__(self):
self._name = rospy.get_param('~name')
self._mean_size = rospy.get_param('~mean_size')
+ self._face_sharpness_score_threshold = rospy.get_param('~face_sharpness_score_threshold')

self._descriptors_lock = threading.Lock()
self._descriptors = []
self._video_analysis_sub = rospy.Subscriber('video_analysis', VideoAnalysis, self._video_analysis_cb, queue_size=1)

def _video_analysis_cb(self, msg):
- face_descriptor = None
+ face_object = None
for object in msg.objects:
- if len(object.face_descriptor) > 0 and face_descriptor is not None:
+ if len(object.face_descriptor) > 0 and face_object is not None:
rospy.logwarn('Only one face must be present in the image.')
elif len(object.face_descriptor) > 0:
- face_descriptor = object.face_descriptor
+ face_object = object

- if face_descriptor is not None:
+ if face_object is not None and face_object.face_sharpness_score >= self._face_sharpness_score_threshold:
with self._descriptors_lock:
- self._descriptors.append(face_descriptor)
+ self._descriptors.append(face_object.face_descriptor)

def run(self):
self.enable_video_analyzer()
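Only faces that pass the sharpness gate contribute descriptors, and the launch file's `mean_size` controls how many are averaged into the stored reference. A sketch of that reduction, assuming a plain arithmetic mean:

```python
import numpy as np

def compute_reference_descriptor(descriptors, mean_size):
    # Average the most recent mean_size descriptors into one reference descriptor.
    recent = np.asarray(descriptors[-mean_size:], dtype=np.float64)
    return recent.mean(axis=0)
```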
@@ -41,6 +41,7 @@ def __init__(self, descriptor, direction):

class PersonIdentificationNode:
def __init__(self):
+ self._face_sharpness_score_threshold = rospy.get_param('~face_sharpness_score_threshold')
self._face_descriptor_threshold = rospy.get_param('~face_descriptor_threshold')
self._voice_descriptor_threshold = rospy.get_param('~voice_descriptor_threshold')
self._face_voice_descriptor_threshold = rospy.get_param('~face_voice_descriptor_threshold')
Expand Down Expand Up @@ -83,7 +84,8 @@ def _video_analysis_cb(self, msg):
for object in msg.objects:
if len(object.face_descriptor) == 0 or len(object.person_pose_2d) == 0 or len(object.person_pose_3d) == 0 \
or len(object.person_pose_confidence) == 0 \
- or object.person_pose_confidence[PERSON_POSE_NOSE_INDEX] < self._nose_confidence_threshold:
+ or object.person_pose_confidence[PERSON_POSE_NOSE_INDEX] < self._nose_confidence_threshold \
+ or object.face_sharpness_score < self._face_sharpness_score_threshold:
continue

position_2d = object.person_pose_2d[PERSON_POSE_NOSE_INDEX]
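The added condition drops blurry faces before any matching is attempted. The commit renames "blur score" to "sharpness score", but the metric itself lives in the dnn utilities; a common stand-in — shown purely as an assumption — is the variance of the Laplacian:

```python
import cv2

def sharpness_score(face_image_rgb):
    # Hypothetical stand-in for face_sharpness_score: variance of the Laplacian.
    # Larger values indicate sharper images; the analyzer's real score
    # may be normalized differently.
    gray = cv2.cvtColor(face_image_rgb, cv2.COLOR_RGB2GRAY)
    return cv2.Laplacian(gray, cv2.CV_64F).var()
```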
8 changes: 6 additions & 2 deletions ros/perceptions/video_analyzer/README.md
@@ -7,7 +7,9 @@ This node detects objects. If the image contains a person, it estimates the pose
This node uses RGB images, so the 3D positions are not set.

#### Parameters
- - `use_descriptor_yolo_v4` (bool): Indicates to use the network extracting a object embedding or not.
+ - `use_descriptor_yolo` (bool): Indicates whether to use the network that extracts an object embedding.
+ - `yolo_model` (string): If `use_descriptor_yolo` is false, it indicates which model to use for YOLO (yolo_v4_coco, yolo_v4_tiny_coco, yolo_v7_coco, yolo_v7_tiny_coco or yolo_v7_objects365).
+ If `use_descriptor_yolo` is true, it indicates which model to use for Descriptor-YOLO (yolo_v4_tiny_coco, yolo_v7_coco or yolo_v7_objects365).
- `confidence_threshold` (double): The object confidence threshold.
- `nms_threshold` (double): The Non-Maximum Suppression threshold.
- `person_probability_threshold` (double): The person confidence threshold.
@@ -36,7 +38,9 @@ This node detects objects. If the image contains a person, it estimates the pose
This node uses RGB-D images, so the 3D positions are set.

#### Parameters
- - `use_descriptor_yolo_v4` (bool): Indicates to use the network extracting a object embedding or not.
+ - `use_descriptor_yolo` (bool): Indicates whether to use the network that extracts an object embedding.
+ - `yolo_model` (string): If `use_descriptor_yolo` is false, it indicates which model to use for YOLO (yolo_v4_coco, yolo_v4_tiny_coco, yolo_v7_coco, yolo_v7_tiny_coco or yolo_v7_objects365).
+ If `use_descriptor_yolo` is true, it indicates which model to use for Descriptor-YOLO (yolo_v4_tiny_coco, yolo_v7_coco or yolo_v7_objects365).
- `confidence_threshold` (double): The object confidence threshold.
- `nms_threshold` (double): The Non-Maximum Suppression threshold.
- `person_probability_threshold` (double): The person confidence threshold.
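A sketch of how a node might read and validate the two parameters documented above; the model lists come from the README, while the defaults and error handling are assumptions:

```python
import rospy

YOLO_MODELS = {'yolo_v4_coco', 'yolo_v4_tiny_coco', 'yolo_v7_coco',
               'yolo_v7_tiny_coco', 'yolo_v7_objects365'}
DESCRIPTOR_YOLO_MODELS = {'yolo_v4_tiny_coco', 'yolo_v7_coco', 'yolo_v7_objects365'}

def read_yolo_model_parameters():
    use_descriptor_yolo = rospy.get_param('~use_descriptor_yolo', False)
    yolo_model = rospy.get_param('~yolo_model', 'yolo_v7_tiny_coco')
    supported = DESCRIPTOR_YOLO_MODELS if use_descriptor_yolo else YOLO_MODELS
    if yolo_model not in supported:
        raise ValueError('Unsupported yolo_model for this configuration: ' + yolo_model)
    return use_descriptor_yolo, yolo_model
```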
3 changes: 3 additions & 0 deletions ros/perceptions/video_analyzer/msg/VideoAnalysisObject.msg
@@ -6,6 +6,7 @@ float32 height_2d # Normalized

string object_class
float32 object_confidence
+ float32 object_class_probability
float32[] object_descriptor
sensor_msgs/Image object_image

@@ -15,4 +16,6 @@ float32[] person_pose_confidence
sensor_msgs/Image person_pose_image

float32[] face_descriptor
+ int32 face_alignment_keypoint_count
+ float32 face_sharpness_score
sensor_msgs/Image face_image
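On the consumer side, the three new fields allow filtering for trustworthy faces before using the descriptor. A sketch — the keypoint and sharpness cutoffs are illustrative, not values from the repository:

```python
def select_reliable_faces(video_analysis_msg, min_keypoints=5, min_sharpness=0.5):
    # Keep objects that have a face descriptor, enough alignment keypoints
    # and a sufficiently sharp face crop.
    return [o for o in video_analysis_msg.objects
            if len(o.face_descriptor) > 0
            and o.face_alignment_keypoint_count >= min_keypoints
            and o.face_sharpness_score >= min_sharpness]
```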
@@ -52,6 +52,7 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width):
o.height_2d = object_analysis.height / image_height
o.object_class = object_analysis.object_class
o.object_confidence = object_analysis.object_confidence
+ o.object_class_probability = object_analysis.object_class_probability
if object_analysis.object_image is not None:
o.object_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.object_image, encoding='rgb8')
o.object_descriptor = object_analysis.object_descriptor
@@ -65,6 +66,8 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width):

if object_analysis.face_analysis is not None:
o.face_descriptor = object_analysis.face_analysis.descriptor
+ o.face_alignment_keypoint_count = object_analysis.face_analysis.alignment_keypoint_count
+ o.face_sharpness_score = object_analysis.face_analysis.sharpness_score
if object_analysis.face_analysis.face_image is not None:
o.face_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.face_analysis.face_image, encoding='rgb8')

@@ -73,6 +73,7 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width, de
o.height_2d = object_analysis.height / image_height
o.object_class = object_analysis.object_class
o.object_confidence = object_analysis.object_confidence
+ o.object_class_probability = object_analysis.object_class_probability
if object_analysis.object_image is not None:
o.object_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.object_image, encoding='rgb8')
o.object_descriptor = object_analysis.object_descriptor
@@ -89,6 +90,8 @@ def _object_analysis_to_msg(self, object_analysis, image_height, image_width, de

if object_analysis.face_analysis is not None:
o.face_descriptor = object_analysis.face_analysis.descriptor
+ o.face_alignment_keypoint_count = object_analysis.face_analysis.alignment_keypoint_count
+ o.face_sharpness_score = object_analysis.face_analysis.sharpness_score
if object_analysis.face_analysis.face_image is not None:
o.face_image = self._cv_bridge.cv2_to_imgmsg(object_analysis.face_analysis.face_image, encoding='rgb8')
