Skip to content

Commit

Permalink
proc: Add a new isolated /proc/pid/mempolicy type.
Browse files Browse the repository at this point in the history
/proc/pid/mempolicy can be used to check and adjust the userspace task's
mempolicy dynamically. In many cases, the application and the control plane
are two separate systems. When the application is created, it doesn't know
how to use memory, and it doesn't care. The control plane will decide the
memory usage policy based on different reasons. In that case, we can
dynamically adjust the mempolicy using the /proc/pid/mempolicy interface.

Format of input:
----------------
<mode>[=<flags>][:<nodelist>]

Example
-------
set mempolicy:
 $ echo "interleave=static:0-3" > /proc/27036/mempolicy
 $ cat /proc/27036/mempolicy
 interleave=static:0-3
remove mempolicy:
 $ echo "default" > /proc/27036/mempolicy

The following 6 mempolicy mode types are supported:
"default" "prefer"  "bind" "interleave" "local" "prefer (many)"

The supported mode flags are:
"static" "relative"

nodelist         For example: 0-3 or 0,1,2,3

Signed-off-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
  • Loading branch information
Zhongkun He authored and intel-lab-lkp committed Sep 26, 2022
1 parent 0cffc98 commit ac5c3a8
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 7 deletions.
40 changes: 40 additions & 0 deletions Documentation/filesystems/proc.rst
Expand Up @@ -47,6 +47,8 @@ fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
3.10 /proc/<pid>/timerslack_ns - Task timerslack value
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
3.12 /proc/<pid>/arch_status - Task architecture specific information
3.13 /proc/<pid>/mempolicy & /proc/<pid>/task/<tid>/mempolicy - Adjust
the mempolicy
4 Configuring procfs
4.1 Mount options
Expand Down Expand Up @@ -2145,6 +2147,44 @@ AVX512_elapsed_ms
the task is unlikely an AVX512 user, but depends on the workload and the
scheduling scenario, it also could be a false negative mentioned above.

3.13 /proc/<pid>/mempolicy & /proc/<pid>/task/<tid>/mempolicy - Adjust the mempolicy
------------------------------------------------------------------------------------
When CONFIG_NUMA is enabled, these files can be used to check and adjust the current
mempolicy. Please note that only the mempolicy of userspace tasks can be adjusted:
writes that target a kernel thread or the global init task fail with EPERM.

Format of input:
----------------
<mode>[=<flags>][:<nodelist>]

Example
-------
set mempolicy:
$ echo "interleave=static:0-3" > /proc/27036/mempolicy
$ cat /proc/27036/mempolicy
interleave=static:0-3

remove mempolicy:
$ echo "default" > /proc/27036/mempolicy

The following 6 mempolicy mode types are supported:
"default" Default is converted to the NULL memory policy, any existing non-default policy
will simply be removed when "default" is specified.
"prefer" The allocation should be attempted from the single node specified in the policy.
"bind" Memory must come from the set of nodes specified by the policy.
"interleave" Page allocations are interleaved across the nodes specified in the policy.
"local" The memory is allocated on the node of the CPU that triggered the allocation.
"prefer (many)" The allocation should be preferably satisfied from the nodemask specified in the policy.

The supported mode flags are:

"static" A nonempty nodemask specifies physical node IDs.
"relative" A nonempty nodemask specifies node IDs that are relative
to the set of node IDs allowed by the thread's current cpuset.

nodelist For example: 0-3 or 0,1,2,3

Please see: Documentation/admin-guide/mm/numa_memory_policy.rst for descriptions of memory policy.

Chapter 4: Configuring procfs
=============================

Expand Down
2 changes: 2 additions & 0 deletions fs/proc/base.c
Expand Up @@ -3268,6 +3268,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
REG("mempolicy", S_IRUGO|S_IWUSR, proc_mempolicy_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
Expand Down Expand Up @@ -3617,6 +3618,7 @@ static const struct pid_entry tid_base_stuff[] = {
#endif
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
REG("mempolicy", S_IRUGO|S_IWUSR, proc_mempolicy_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
Expand Down
1 change: 1 addition & 0 deletions fs/proc/internal.h
Expand Up @@ -305,6 +305,7 @@ extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_mempolicy_operations;

extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
Expand Down
129 changes: 129 additions & 0 deletions fs/proc/task_mmu.c
Expand Up @@ -2025,4 +2025,133 @@ const struct file_operations proc_pid_numa_maps_operations = {
.release = proc_map_release,
};

#define MPOLBUFLEN 64
/*
 * Display a task's memory policy via /proc/<pid>/mempolicy.
 *
 * The policy is sampled under task_lock() and pinned with mpol_get() so it
 * can be formatted after the lock has been dropped.
 *
 * Returns -ESRCH if the task cannot be looked up, 0 (nothing to read) if the
 * task has no policy or its mode is MPOL_DEFAULT, otherwise the result of
 * simple_read_from_buffer() for the formatted policy followed by a newline.
 */
static ssize_t mempolicy_read(struct file *file, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[MPOLBUFLEN];
	struct mempolicy *mpol;
	ssize_t ret = 0;
	size_t len;

	if (!task)
		return -ESRCH;

	task_lock(task);
	mpol = task->mempolicy;
	mpol_get(mpol);
	task_unlock(task);

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		goto out;

	memset(buffer, 0, sizeof(buffer));
	/*
	 * Hand mpol_to_str() one byte less than the buffer holds: the last
	 * byte stays zero from the memset above, so the string is always
	 * terminated and there is always room for the trailing newline.
	 */
	mpol_to_str(buffer, sizeof(buffer) - 1, mpol);

	/* Measure once; after the newline is stored the length is known. */
	len = strlen(buffer);
	buffer[len++] = '\n';
	ret = simple_read_from_buffer(buf, count, ppos, buffer, len);

out:
	mpol_put(mpol);
	put_task_struct(task);
	return ret;
}

/*
 * Install @mpol as @task's memory policy, restricted to the nodes the task
 * is allowed to use.
 *
 * @task: task whose ->mempolicy is replaced.
 * @mpol: new policy, or NULL to drop the current policy.
 *
 * The candidate node set is task->mems_allowed ANDed with
 * mpol->w.user_nodemask (empty when @mpol is NULL).
 *
 * Returns 0 on success. Returns -EINVAL when that node set is empty and
 * @mpol is neither NULL nor an MPOL_LOCAL policy; task->mempolicy is then
 * left unchanged.
 *
 * Reference handling: on success the previous policy is released with
 * mpol_put(); on failure @mpol itself is released, so the caller must not
 * use @mpol after this function returns.
 */
static int update_task_mpol(struct task_struct *task, struct mempolicy *mpol)
{
nodemask_t tsk_allowed;
struct mempolicy *old = NULL;
int err = 0;

/* task->mempolicy is read and replaced with the task lock held and local IRQs off. */
task_lock(task);
local_irq_disable();
old = task->mempolicy;

if (mpol)
nodes_and(tsk_allowed, task->mems_allowed, mpol->w.user_nodemask);
else
nodes_clear(tsk_allowed);

if (!nodes_empty(tsk_allowed)) {
/* At least one requested node is allowed: install, then rebind to that set. */
task->mempolicy = mpol;
mpol_rebind_task(task, &tsk_allowed);
} else if (!mpol || mpol->mode == MPOL_LOCAL) {
/*
 * "default" (mpol == NULL) simply clears the old policy;
 * local policies are not subject to any remapping.
 */
task->mempolicy = mpol;
} else {
/* No requested node is allowed for this task: reject the policy. */
err = -EINVAL;
}

/* A newly installed interleave policy resets the task's interleave cursor. */
if (!err && mpol && mpol->mode == MPOL_INTERLEAVE)
task->il_prev = MAX_NUMNODES-1;

local_irq_enable();
task_unlock(task);

/*
 * If successful, release the old policy;
 * otherwise keep the old one and release mpol.
 */
if (err)
mpol_put(mpol);
else
mpol_put(old);

return err;
}

/*
 * Modify a task's memory policy via /proc/<pid>/mempolicy.
 *
 * The user buffer is copied into a local buffer, stripped of surrounding
 * whitespace, parsed with mpol_parse_str() and applied with
 * update_task_mpol().
 *
 * Returns @count on success, or:
 *   -ESRCH   the task cannot be looked up;
 *   -EPERM   the task is a kernel thread or the global init task;
 *   -EINVAL  the input does not fit in MPOLBUFLEN - 1 bytes, cannot be
 *            parsed, or is rejected by update_task_mpol();
 *   -EFAULT  the user buffer cannot be copied.
 */
static ssize_t mempolicy_write(struct file *file, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	char buffer[MPOLBUFLEN];
	struct mempolicy *mpol = NULL;
	struct task_struct *task;
	int err;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;

	/* We can only change the mempolicy of userspace tasks. */
	if ((task->flags & PF_KTHREAD) || is_global_init(task)) {
		err = -EPERM;
		goto out;
	}

	/*
	 * Reject input that does not fit instead of truncating it: a
	 * truncated policy string could still parse and would install a
	 * policy different from the one the writer asked for.
	 */
	if (count > sizeof(buffer) - 1) {
		err = -EINVAL;
		goto out;
	}

	/* Zero-fill first so the copied text is always NUL-terminated. */
	memset(buffer, 0, sizeof(buffer));
	if (copy_from_user(buffer, buf, count)) {
		err = -EFAULT;
		goto out;
	}

	/* Any non-zero result from the parser is reported as -EINVAL. */
	if (mpol_parse_str(strstrip(buffer), &mpol)) {
		err = -EINVAL;
		goto out;
	}

	/* update_task_mpol() releases mpol itself when it fails. */
	err = update_task_mpol(task, mpol);
out:
	put_task_struct(task);
	if (err < 0)
		return err;
	return count;
}

/*
 * File operations for the mempolicy proc entry: reads go through
 * mempolicy_read(), writes through mempolicy_write(), and seeking uses
 * default_llseek().
 */
const struct file_operations proc_mempolicy_operations = {
	.llseek	= default_llseek,
	.read	= mempolicy_read,
	.write	= mempolicy_write,
};

#endif /* CONFIG_NUMA */
5 changes: 0 additions & 5 deletions include/linux/mempolicy.h
Expand Up @@ -165,10 +165,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags);


#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
Expand Down Expand Up @@ -271,12 +268,10 @@ static inline void check_highest_zone(int k)
{
}

#ifdef CONFIG_TMPFS
/*
 * Stub variant of mpol_parse_str(): parses nothing, leaves *mpol untouched
 * and always reports failure by returning 1.
 * NOTE(review): presumably the fallback used when NUMA support is compiled
 * out - confirm against the enclosing #ifdef, which is not visible here.
 */
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
return 1; /* error */
}
#endif

static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
unsigned long address)
Expand Down
2 changes: 0 additions & 2 deletions mm/mempolicy.c
Expand Up @@ -2968,7 +2968,6 @@ static const char * const policy_modes[] =
};


#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
Expand Down Expand Up @@ -3101,7 +3100,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
*mpol = new;
return err;
}
#endif /* CONFIG_TMPFS */

/**
* mpol_to_str - format a mempolicy structure for printing
Expand Down

0 comments on commit ac5c3a8

Please sign in to comment.