Skip to content

Commit

Permalink
Add CABA tree to task_struct
Browse files Browse the repository at this point in the history
In Linux, after a parent (father) process dies, its child processes are
moved (reparented) to a reaper process. Roughly speaking:

1) If the father has another thread that is still alive, that thread
becomes the reaper.

2) Else, if the father has an ancestor (with no pidns level change in
between) which has PR_SET_CHILD_SUBREAPER set, this ancestor becomes the
reaper.

3) Else the father's pidns init becomes the reaper for the father's
children.

The problem with this for CRIU is that when CRIU comes to dump processes
it does not know the order in which processes and their resources were
created. And processes can have resources which a) can only be inherited
when we clone processes, b) can only be created by specific processes
and c) are shared between several processes (an example of such a
resource is the process session). For such resources, CRIU restore would
need to re-invent an order of process creation which at the same time
creates the desired process tree topology and allows all resources to be
inherited correctly.

When process reparenting involves child subreapers, processes can get
mixed up in the process tree so drastically that it is not obvious how
to restore everything correctly.

So this is what we came up with to help CRIU to overcome this problem:

CABA = Closest Alive Born Ancestor
CABD = Closest Alive Born Descendant

We want to put processes in one more tree - the CABA tree. This tree
does not affect reparenting or process creation in any way; it only
provides new information to CRIU, so that we can understand where a
reparented child was reparented from. Even though the original father is
already dead, and probably the father's father too, we still have
information about the process which is still alive and was originally
the parent of the sequence of (already dead) processes which led to us -
the CABA.

CC: Eric Biederman <ebiederm@xmission.com>
CC: Kees Cook <keescook@chromium.org>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
CC: Ingo Molnar <mingo@redhat.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Juri Lelli <juri.lelli@redhat.com>
CC: Vincent Guittot <vincent.guittot@linaro.org>
CC: Dietmar Eggemann <dietmar.eggemann@arm.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Segall <bsegall@google.com>
CC: Mel Gorman <mgorman@suse.de>
CC: Daniel Bristot de Oliveira <bristot@redhat.com>
CC: Valentin Schneider <vschneid@redhat.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: linux-ia64@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux-mm@kvack.org
CC: linux-fsdevel@vger.kernel.org
CC: kernel@openvz.org

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>

--
v2: fix unused variables reported-by: kernel test robot <lkp@intel.com>
v3: - on fork set caba to current, so that caba is always the process
      which initiated our creation, even for the CLONE_PARENT case
    - move caba update to a later stage (to __unhash_process), so that
      zombies can be somebody's caba until fully released
  • Loading branch information
Snorch authored and intel-lab-lkp committed Sep 8, 2022
1 parent 144eeb2 commit 17a897a
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 0 deletions.
3 changes: 3 additions & 0 deletions arch/ia64/kernel/mca.c
Original file line number Diff line number Diff line change
Expand Up @@ -1793,6 +1793,9 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
p->parent = p->real_parent = p->group_leader = p;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
p->caba = p;
INIT_LIST_HEAD(&p->cabds);
INIT_LIST_HEAD(&p->cabd);
strncpy(p->comm, type, sizeof(p->comm)-1);
}

Expand Down
1 change: 1 addition & 0 deletions fs/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,7 @@ static int de_thread(struct task_struct *tsk)

list_replace_rcu(&leader->tasks, &tsk->tasks);
list_replace_init(&leader->sibling, &tsk->sibling);
list_replace_init(&leader->cabd, &tsk->cabd);

tsk->group_leader = tsk;
leader->group_leader = tsk;
Expand Down
20 changes: 20 additions & 0 deletions fs/proc/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,28 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
const struct cred *cred;
pid_t ppid, tpid = 0, tgid, ngid;
unsigned int max_fds = 0;
#ifdef CONFIG_PID_NS
struct task_struct *caba;
struct pid *caba_pid;
int caba_level = 0;
pid_t caba_pids[MAX_PID_NS_LEVEL] = {};
#endif

rcu_read_lock();
ppid = pid_alive(p) ?
task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;

#ifdef CONFIG_PID_NS
caba = rcu_dereference(p->caba);
caba_pid = get_task_pid(caba, PIDTYPE_PID);
if (caba_pid) {
caba_level = caba_pid->level;
for (g = ns->level; g <= caba_level; g++)
caba_pids[g] = task_pid_nr_ns(caba, caba_pid->numbers[g].ns);
put_pid(caba_pid);
}
#endif

tracer = ptrace_parent(p);
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
Expand Down Expand Up @@ -217,6 +234,9 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_puts(m, "\nNSsid:");
for (g = ns->level; g <= pid->level; g++)
seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
seq_puts(m, "\nNScaba:");
for (g = ns->level; g <= caba_level; g++)
seq_put_decimal_ull(m, "\t", caba_pids[g]);
#endif
seq_putc(m, '\n');
}
Expand Down
7 changes: 7 additions & 0 deletions include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -975,6 +975,13 @@ struct task_struct {
struct list_head sibling;
struct task_struct *group_leader;

/* Closest Alive Born Ancestor process: */
struct task_struct __rcu *caba;

/* Closest Alive Born Descendants list: */
struct list_head cabds;
struct list_head cabd;

/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
Expand Down
3 changes: 3 additions & 0 deletions init/init_task.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ struct task_struct init_task
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
.caba = &init_task,
.cabds = LIST_HEAD_INIT(init_task.cabds),
.cabd = LIST_HEAD_INIT(init_task.cabd),
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
Expand Down
21 changes: 21 additions & 0 deletions kernel/exit.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,29 @@
#include <asm/unistd.h>
#include <asm/mmu_context.h>

/*
 * forget_caba - hand over the CABD list of a task that is being unhashed.
 * @caba: the task whose closest-alive-born-descendants need a new CABA.
 *
 * Every process linked on @caba->cabds gets the ->caba pointer of each of
 * its threads redirected to a successor, and the whole list is then
 * appended to the successor's own ->cabds list.
 *
 * The successor is @caba's own ->caba when @caba is a thread group leader,
 * and @caba's group leader otherwise.
 *
 * NOTE(review): presumably serialized by whatever lock protects the rest of
 * __unhash_process() - confirm against the caller.
 */
static void forget_caba(struct task_struct *caba)
{
	struct task_struct *heir, *desc, *thread;

	/* No task names us as its CABA: nothing to hand over. */
	if (list_empty(&caba->cabds))
		return;

	if (thread_group_leader(caba))
		heir = caba->caba;
	else
		heir = caba->group_leader;

	/* Redirect every thread of every descendant process to the heir. */
	list_for_each_entry(desc, &caba->cabds, cabd) {
		for_each_thread(desc, thread)
			RCU_INIT_POINTER(thread->caba, heir);
	}

	/* Move the descendants over; this leaves caba->cabds empty. */
	list_splice_tail_init(&caba->cabds, &heir->cabds);
}

static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
forget_caba(p);
detach_pid(p, PIDTYPE_PID);
if (group_dead) {
detach_pid(p, PIDTYPE_TGID);
Expand All @@ -82,6 +102,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)

list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
list_del_init(&p->cabd);
__this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_group);
Expand Down
4 changes: 4 additions & 0 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -2139,6 +2139,8 @@ static __latent_entropy struct task_struct *copy_process(
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
INIT_LIST_HEAD(&p->cabds);
INIT_LIST_HEAD(&p->cabd);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
Expand Down Expand Up @@ -2402,6 +2404,7 @@ static __latent_entropy struct task_struct *copy_process(
p->parent_exec_id = current->self_exec_id;
p->exit_signal = args->exit_signal;
}
p->caba = current;

klp_copy_process(p);

Expand Down Expand Up @@ -2455,6 +2458,7 @@ static __latent_entropy struct task_struct *copy_process(
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail(&p->cabd, &p->caba->cabds);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
Expand Down

0 comments on commit 17a897a

Please sign in to comment.