Skip to content

Commit

Permalink
agent: Fix an issue reporting OOM events by mistake
Browse files Browse the repository at this point in the history
The agent registers an event fd in `memory.oom_control`. An OOM event is
forwarded to containerd when the event is emitted, regardless of the
content in that file.

I observed events being emitted while the file content showed that no OOM
kill had happened, as shown below. An `oom_kill` value of 0 means that no
OOM kill has occurred. Therefore, the content must be checked before
forwarding, to avoid reporting OOM events by mistake.

```
oom_kill_disable 0
under_oom 0
oom_kill 0
```

Fixes: #8715

Signed-off-by: Xuewei Niu <niuxuewei.nxw@antgroup.com>
  • Loading branch information
justxuewei committed Jan 4, 2024
1 parent 0f53217 commit 7e1aeab
Showing 1 changed file with 38 additions and 10 deletions.
48 changes: 38 additions & 10 deletions src/agent/rustjail/src/cgroups/notifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
// SPDX-License-Identifier: Apache-2.0
//

use anyhow::{anyhow, Context, Result};
use eventfd::{eventfd, EfdFlags};
use nix::sys::eventfd;
use std::fs::{self, File};
use std::os::unix::io::{AsRawFd, FromRawFd};
use std::path::Path;

use crate::pipestream::PipeStream;
use anyhow::{anyhow, Context, Result};
use eventfd::{eventfd, EfdFlags};
use futures::StreamExt as _;
use inotify::{Inotify, WatchMask};
use nix::sys::eventfd;
use tokio::io::AsyncReadExt;
use tokio::sync::mpsc::{channel, Receiver};

use crate::pipestream::PipeStream;

// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
Expand Down Expand Up @@ -165,22 +166,49 @@ async fn register_memory_event(

tokio::spawn(async move {
loop {
let sender = sender.clone();
let mut buf = [0u8; 8];
match eventfd_stream.read(&mut buf).await {
Err(err) => {
warn!(sl(), "failed to read from eventfd: {:?}", err);
return;
}
Ok(_) => {
let content = fs::read_to_string(path.clone());
// Read content from "memory.oom_control"
let content = match fs::read_to_string(path.clone()) {
Ok(oom_control) => oom_control,
Err(err) => {
warn!(sl(), "Ignoring an oom event as it is failed to read oom_control at {:?}, err: {:?}", path, err);
continue;
}
};
info!(
sl(),
"cgroup event for container: {}, path: {:?}, content: {:?}",
&containere_id,
&path,
content
"receive a oom_control event, container: {}, content: {}, path: {:?}",
containere_id,
content,
path
);

// Find the row containing "oom_kill ". The trailing space in the pattern is
// required: without it, the `oom_kill_disable` row would also match.
if let Some(row) = content.split('\n').find(|row| row.contains("oom_kill ")) {
// Parse the row
match scan_fmt!(row, "oom_kill {d}", u32) {
Ok(times) => {
if times == 0 {
// No oom kill happened
continue;
}
}
Err(_) => {
error!(sl(), "Ingoring an oom event due to invalid format of the oom_kill row: {}", row);
continue;
}
};
}

// If no row contains "oom_kill ", send an OOM event directly, as the
// previous implementation did.
}
}

Expand Down

0 comments on commit 7e1aeab

Please sign in to comment.