From c36c940cd4aab393c09d457e2414423ac92bd339 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Dec 2016 10:24:05 -0800 Subject: [PATCH 001/247] x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels commit 1c52d859cb2d417e7216d3e56bb7fea88444cec9 upstream. We support various non-Intel CPUs that don't have the CPUID instruction, so the M486 test was wrong. For now, fix it with a big hammer: handle missing CPUID on all 32-bit CPUs. Reported-by: One Thousand Gnomes Signed-off-by: Andy Lutomirski Cc: Juergen Gross Cc: Peter Zijlstra Cc: Brian Gerst Cc: Matthew Whitehead Cc: Borislav Petkov Cc: Henrique de Moraes Holschuh Cc: Andrew Cooper Cc: Boris Ostrovsky Cc: xen-devel Link: http://lkml.kernel.org/r/685bd083a7c036f7769510b6846315b17d6ba71f.1481307769.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: "Zhang, Ning A" Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e40b19ca486eb..353f038ec6458 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -596,7 +596,7 @@ static inline void sync_core(void) { int tmp; -#ifdef CONFIG_M486 +#ifdef CONFIG_X86_32 /* * Do a CPUID if available, otherwise do a jump. The jump * can conveniently enough be the jump around CPUID. From fb39345e73141e4ae4529168965c37024cb3acb3 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 22 Jan 2018 15:44:51 -0500 Subject: [PATCH 002/247] orangefs: use list_for_each_entry_safe in purge_waiting_ops commit 0afc0decf247f65b7aba666a76a0a68adf4bc435 upstream. set_op_state_purged can delete the op. Signed-off-by: Martin Brandenburg Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/orangefs/waitqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index abcfa3fa99929..f61b00887481f 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -28,10 +28,10 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s */ void purge_waiting_ops(void) { - struct orangefs_kernel_op_s *op; + struct orangefs_kernel_op_s *op, *tmp; spin_lock(&orangefs_request_list_lock); - list_for_each_entry(op, &orangefs_request_list, list) { + list_for_each_entry_safe(op, tmp, &orangefs_request_list, list) { gossip_debug(GOSSIP_WAIT_DEBUG, "pvfs2-client-core: purging op tag %llu %s\n", llu(op->tag), From 5c26ee198fcacda157918a9b211ba1f111e1ed9b Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 22 Jan 2018 15:44:52 -0500 Subject: [PATCH 003/247] orangefs: initialize op on loop restart in orangefs_devreq_read commit a0ec1ded22e6a6bc41981fae22406835b006a66e upstream. In orangefs_devreq_read, there is a loop which picks an op off the list of pending ops. If the loop fails to find an op, there is nothing to read, and it returns EAGAIN. If the op has been given up on, the loop is restarted via a goto. The bug is that the variable which the found op is written to is not reinitialized, so if there are no more eligible ops on the list, the code runs again on the already handled op. This is triggered by interrupting a process while the op is being copied to the client-core. It's a fairly small window, but it's there. Signed-off-by: Martin Brandenburg Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/orangefs/devorangefs-req.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index fe2cbeb90772a..939aa066e1ca3 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -161,7 +161,7 @@ static ssize_t orangefs_devreq_read(struct file *file, struct orangefs_kernel_op_s *op, *temp; __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION; static __s32 magic = ORANGEFS_DEVREQ_MAGIC; - struct orangefs_kernel_op_s *cur_op = NULL; + struct orangefs_kernel_op_s *cur_op; unsigned long ret; /* We do not support blocking IO. */ @@ -181,6 +181,7 @@ static ssize_t orangefs_devreq_read(struct file *file, } restart: + cur_op = NULL; /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); list_for_each_entry_safe(op, temp, &orangefs_request_list, list) { From ce601a07bc504b4748f8e7a34896684f79514e51 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:49 -0700 Subject: [PATCH 004/247] usbip: prevent vhci_hcd driver from leaking a socket pointer address commit 2f2d0088eb93db5c649d2a5e34a3800a8a935fc5 upstream. When a client has a USB device attached over IP, the vhci_hcd driver is locally leaking a socket pointer address via the /sys/devices/platform/vhci_hcd/status file (world-readable) and in debug output when "usbip --debug port" is run. Fix it to not leak. The socket pointer address is not used at the moment and it was made visible as a convenient way to find IP address from socket pointer address by looking up /proc/net/{tcp,tcp6}. As this opens a security hole, the fix replaces socket pointer address with sockfd. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_common.h | 1 + drivers/usb/usbip/vhci_sysfs.c | 25 +++++++++++++++---------- tools/usb/usbip/libsrc/vhci_driver.c | 8 ++++---- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index 9f490375ac923..f0b955f8504eb 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -271,6 +271,7 @@ struct usbip_device { /* lock for status */ spinlock_t lock; + int sockfd; struct socket *tcp_socket; struct task_struct *tcp_rx; diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index b96e5b1892696..c287ccc78fde8 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -49,13 +49,17 @@ static ssize_t status_show_vhci(int pdev_nr, char *out) /* * output example: - * port sta spd dev socket local_busid - * 0000 004 000 00000000 c5a7bb80 1-2.3 - * 0001 004 000 00000000 d8cee980 2-3.4 + * port sta spd dev sockfd local_busid + * 0000 004 000 00000000 000003 1-2.3 + * 0001 004 000 00000000 000004 2-3.4 * - * IP address can be retrieved from a socket pointer address by looking - * up /proc/net/{tcp,tcp6}. Also, a userland program may remember a - * port number and its peer IP address. + * Output includes socket fd instead of socket pointer address to + * avoid leaking kernel memory address in: + * /sys/devices/platform/vhci_hcd.0/status and in debug output. + * The socket pointer address is not used at the moment and it was + * made visible as a convenient way to find IP address from socket + * pointer address by looking up /proc/net/{tcp,tcp6}. As this opens + * a security hole, the change is made to use sockfd instead. */ for (i = 0; i < VHCI_HC_PORTS; i++) { struct vhci_device *vdev = &vhci->vdev[i]; @@ -68,13 +72,13 @@ static ssize_t status_show_vhci(int pdev_nr, char *out) if (vdev->ud.status == VDEV_ST_USED) { out += sprintf(out, "%03u %08x ", vdev->speed, vdev->devid); - out += sprintf(out, "%16p %s", - vdev->ud.tcp_socket, + out += sprintf(out, "%06u %s", + vdev->ud.sockfd, dev_name(&vdev->udev->dev)); } else { out += sprintf(out, "000 00000000 "); - out += sprintf(out, "0000000000000000 0-0"); + out += sprintf(out, "000000 0-0"); } out += sprintf(out, "\n"); @@ -125,7 +129,7 @@ static ssize_t status_show(struct device *dev, int pdev_nr; out += sprintf(out, - "port sta spd dev socket local_busid\n"); + "port sta spd dev sockfd local_busid\n"); pdev_nr = status_name_to_id(attr->attr.name); if (pdev_nr < 0) @@ -324,6 +328,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->devid = devid; vdev->speed = speed; + vdev->ud.sockfd = sockfd; vdev->ud.tcp_socket = socket; vdev->ud.status = VDEV_ST_NOTASSIGNED; diff --git a/tools/usb/usbip/libsrc/vhci_driver.c b/tools/usb/usbip/libsrc/vhci_driver.c index ad9204773533f..1274f326242ca 100644 --- a/tools/usb/usbip/libsrc/vhci_driver.c +++ b/tools/usb/usbip/libsrc/vhci_driver.c @@ -55,12 +55,12 @@ static int parse_status(const char *value) while (*c != '\0') { int port, status, speed, devid; - unsigned long socket; + int sockfd; char lbusid[SYSFS_BUS_ID_SIZE]; - ret = sscanf(c, "%d %d %d %x %lx %31s\n", + ret = sscanf(c, "%d %d %d %x %u %31s\n", &port, &status, &speed, - &devid, &socket, lbusid); + &devid, &sockfd, lbusid); if (ret < 5) { dbg("sscanf failed: %d", ret); @@ -69,7 +69,7 @@ static int parse_status(const char *value) dbg("port %d status %d speed %d devid %x", port, status, speed, devid); - dbg("socket %lx lbusid %s", socket, lbusid); + dbg("sockfd %u lbusid %s", sockfd, lbusid); /* if a device is connected, look at it */ From 853c39f239eb89cfd91ee03257c624aaac0575c6 Mon Sep 17 00:00:00 2001 From: Jonathan Dieter Date: Mon, 27 Feb 2017 10:31:04 +0200 Subject: [PATCH 005/247] usbip: Fix implicit fallthrough warning commit cfd6ed4537a9e938fa76facecd4b9cd65b6d1563 upstream. GCC 7 now warns when switch statements fall through implicitly, and with -Werror enabled in configure.ac, that makes these tools unbuildable. We fix this by notifying the compiler that this particular case statement is meant to fall through. Reviewed-by: Peter Senna Tschudin Signed-off-by: Jonathan Dieter Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/src/usbip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/usb/usbip/src/usbip.c b/tools/usb/usbip/src/usbip.c index d7599d9435298..73d8eee8130b3 100644 --- a/tools/usb/usbip/src/usbip.c +++ b/tools/usb/usbip/src/usbip.c @@ -176,6 +176,8 @@ int main(int argc, char *argv[]) break; case '?': printf("usbip: invalid option\n"); + /* Terminate after printing error */ + /* FALLTHRU */ default: usbip_usage(); goto out; From 69e78e7214e39c084fcafdfb83b4d13d21b0008b Mon Sep 17 00:00:00 2001 From: Jonathan Dieter Date: Mon, 27 Feb 2017 10:31:03 +0200 Subject: [PATCH 006/247] usbip: Fix potential format overflow in userspace tools commit e5dfa3f902b9a642ae8c6997d57d7c41e384a90b upstream. The usbip userspace tools call sprintf()/snprintf() and don't check for the return value which can lead the paths to overflow, truncating the final file in the path. More urgently, GCC 7 now warns that these aren't checked with -Wformat-overflow, and with -Werror enabled in configure.ac, that makes these tools unbuildable. This patch fixes these problems by replacing sprintf() with snprintf() in one place and adding checks for the return value of snprintf(). Reviewed-by: Peter Senna Tschudin Signed-off-by: Jonathan Dieter Acked-by: Shuah Khan Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/libsrc/usbip_common.c | 9 ++++++- tools/usb/usbip/libsrc/usbip_host_common.c | 28 ++++++++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/tools/usb/usbip/libsrc/usbip_common.c b/tools/usb/usbip/libsrc/usbip_common.c index ac73710473dea..1517a232ab187 100644 --- a/tools/usb/usbip/libsrc/usbip_common.c +++ b/tools/usb/usbip/libsrc/usbip_common.c @@ -215,9 +215,16 @@ int read_usb_interface(struct usbip_usb_device *udev, int i, struct usbip_usb_interface *uinf) { char busid[SYSFS_BUS_ID_SIZE]; + int size; struct udev_device *sif; - sprintf(busid, "%s:%d.%d", udev->busid, udev->bConfigurationValue, i); + size = snprintf(busid, sizeof(busid), "%s:%d.%d", + udev->busid, udev->bConfigurationValue, i); + if (size < 0 || (unsigned int)size >= sizeof(busid)) { + err("busid length %i >= %lu or < 0", size, + (long unsigned)sizeof(busid)); + return -1; + } sif = udev_device_new_from_subsystem_sysname(udev_context, "usb", busid); if (!sif) { diff --git a/tools/usb/usbip/libsrc/usbip_host_common.c b/tools/usb/usbip/libsrc/usbip_host_common.c index 9d415228883de..6ff7b601f8545 100644 --- a/tools/usb/usbip/libsrc/usbip_host_common.c +++ b/tools/usb/usbip/libsrc/usbip_host_common.c @@ -40,13 +40,20 @@ struct udev *udev_context; static int32_t read_attr_usbip_status(struct usbip_usb_device *udev) { char status_attr_path[SYSFS_PATH_MAX]; + int size; int fd; int length; char status; int value = 0; - snprintf(status_attr_path, SYSFS_PATH_MAX, "%s/usbip_status", - udev->path); + size = snprintf(status_attr_path, sizeof(status_attr_path), + "%s/usbip_status", udev->path); + if (size < 0 || (unsigned int)size >= sizeof(status_attr_path)) { + err("usbip_status path length %i >= %lu or < 0", size, + (long unsigned)sizeof(status_attr_path)); + return -1; + } + fd = open(status_attr_path, O_RDONLY); if (fd < 0) { @@ -218,6 +225,7 @@ int usbip_export_device(struct usbip_exported_device *edev, int sockfd) { char attr_name[] = "usbip_sockfd"; char sockfd_attr_path[SYSFS_PATH_MAX]; + int size; char sockfd_buff[30]; int ret; @@ -237,10 +245,20 @@ int usbip_export_device(struct usbip_exported_device *edev, int sockfd) } /* only the first interface is true */ - snprintf(sockfd_attr_path, sizeof(sockfd_attr_path), "%s/%s", - edev->udev.path, attr_name); + size = snprintf(sockfd_attr_path, sizeof(sockfd_attr_path), "%s/%s", + edev->udev.path, attr_name); + if (size < 0 || (unsigned int)size >= sizeof(sockfd_attr_path)) { + err("exported device path length %i >= %lu or < 0", size, + (long unsigned)sizeof(sockfd_attr_path)); + return -1; + } - snprintf(sockfd_buff, sizeof(sockfd_buff), "%d\n", sockfd); + size = snprintf(sockfd_buff, sizeof(sockfd_buff), "%d\n", sockfd); + if (size < 0 || (unsigned int)size >= sizeof(sockfd_buff)) { + err("socket length %i >= %lu or < 0", size, + (long unsigned)sizeof(sockfd_buff)); + return -1; + } ret = write_sysfs_attribute(sockfd_attr_path, sockfd_buff, strlen(sockfd_buff)); From 40bf2c0c1c9ec9c3a17afac43fcd18b39759defd Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: [PATCH 007/247] can: af_can: can_rcv(): replace WARN_ONCE by pr_warn_once commit 8cb68751c115d176ec851ca56ecfbb411568c9e8 upstream. If an invalid CAN frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+4386709c0c1284dca827@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: Greg Kroah-Hartman --- net/can/af_can.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 5488e4a6ccd06..6ff7df79e006a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -722,13 +722,12 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!net_eq(dev_net(dev), &init_net))) goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN, - "PF_CAN: dropped non conform CAN skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); goto drop; + } can_receive(skb, dev); return NET_RX_SUCCESS; From 41e4aa17bc02430db764b748e990bdc392347f0d Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: [PATCH 008/247] can: af_can: canfd_rcv(): replace WARN_ONCE by pr_warn_once commit d4689846881d160a4d12a514e991a740bcb5d65a upstream. If an invalid CANFD frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+e3b775f40babeff6e68b@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: Greg Kroah-Hartman --- net/can/af_can.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 6ff7df79e006a..ac1552d8b4ad1 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -745,13 +745,12 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!net_eq(dev_net(dev), &init_net))) goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN, - "PF_CAN: dropped non conform CAN FD skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); goto drop; + } can_receive(skb, dev); return NET_RX_SUCCESS; From 45ee9d5e97a4c6814a166c4ddf5c7c3764084270 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 4 Jan 2018 18:24:33 +0000 Subject: [PATCH 009/247] KVM: arm/arm64: Check pagesize when allocating a hugepage at Stage 2 commit c507babf10ead4d5c8cca704539b170752a8ac84 upstream. KVM only supports PMD hugepages at stage 2 but doesn't actually check that the provided hugepage memory pagesize is PMD_SIZE before populating stage 2 entries. In cases where the backing hugepage size is smaller than PMD_SIZE (such as when using contiguous hugepages), KVM can end up creating stage 2 mappings that extend beyond the supplied memory. Fix this by checking for the pagesize of userspace vma before creating PMD hugepage at stage 2. Fixes: 66b3923a1a0f77a ("arm64: hugetlb: add support for PTE contiguous bit") Signed-off-by: Punit Agrawal Cc: Marc Zyngier Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 2206e0e00934c..2a35c1963f6d8 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1284,7 +1284,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (is_vm_hugetlb_page(vma) && !logging_active) { + if (vma_kernel_pagesize(vma) && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { From 318e17d09cbc40593ce42194ece3ecd1a464cc5e Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Tue, 25 Apr 2017 16:44:03 -0500 Subject: [PATCH 010/247] Prevent timer value 0 for MWAITX commit 88d879d29f9cc0de2d930b584285638cdada6625 upstream. Newer hardware has uncovered a bug in the software implementation of using MWAITX for the delay function. A value of 0 for the timer is meant to indicate that a timeout will not be used to exit MWAITX. On newer hardware this can result in MWAITX never returning, resulting in NMI soft lockup messages being printed. On older hardware, some of the other conditions under which MWAITX can exit masked this issue. The AMD APM does not currently document this and will be updated. Please refer to http://marc.info/?l=kvm&m=148950623231140 for information regarding NMI soft lockup messages on an AMD Ryzen 1800X. This has been root-caused as a 0 passed to MWAITX causing it to wait indefinitely. This change has the added benefit of avoiding the unnecessary setup of MONITORX/MWAITX when the delay value is zero. Signed-off-by: Janakarajan Natarajan Link: http://lkml.kernel.org/r/1493156643-29366-1-git-send-email-Janakarajan.Natarajan@amd.com Signed-off-by: Thomas Gleixner Signed-off-by: Davidlohr Bueso Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/delay.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 073d1f1a620bd..9758524ee99f7 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops) { u64 start, end, delay, loops = __loops; + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (loops == 0) + return; + start = rdtsc_ordered(); for (;;) { From f5aaa5a2836d86e3b6559200422c153a4dfb6d66 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:28 +0100 Subject: [PATCH 011/247] drivers: base: cacheinfo: fix x86 with CONFIG_OF enabled commit fac51482577d5e05bbb0efa8d602a3c2111098bf upstream. With CONFIG_OF enabled on x86, we get the following error on boot: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " and the cacheinfo fails to get populated in the corresponding sysfs entries. This is because cache_setup_of_node looks for of_node for setting up the shared cpu_map without checking that it's already populated in the architecture specific callback. In order to indicate that the shared cpu_map is already populated, this patch introduces a boolean `cpu_map_populated` in struct cpu_cacheinfo that can be used by the generic code to skip cache_shared_cpu_map_setup. This patch also sets that boolean for x86. Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 ++ drivers/base/cacheinfo.c | 3 +++ include/linux/cacheinfo.h | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e427..be63371565021 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index e9fd32e916689..ecde8957835ad 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -106,6 +106,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) unsigned int index; int ret; + if (this_cpu_ci->cpu_map_populated) + return 0; + ret = cache_setup_of_node(cpu); if (ret) return ret; diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2189935075b48..a951fd10aaaad 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -71,6 +71,7 @@ struct cpu_cacheinfo { struct cacheinfo *info_list; unsigned int num_levels; unsigned int num_leaves; + bool cpu_map_populated; }; /* From 1d8c402e0c46ce630746e081c904068af455b1e5 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:29 +0100 Subject: [PATCH 012/247] drivers: base: cacheinfo: fix boot error message when acpi is enabled commit 55877ef45fbd7f975d078426866b7d1a2435dcc3 upstream. ARM64 enables both CONFIG_OF and CONFIG_ACPI and the firmware can pass both ACPI tables and the device tree. Based on the kernel parameter, one of the two will be chosen. If acpi is enabled, then device tree is not unflattened. Currently ARM64 platforms report: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " which is incorrect when booting with ACPI. Also latest ACPI v6.1 has no support for cache properties/hierarchy. This patch adds check for unflattened device tree and also returns as "not supported" if ACPI is runtime enabled. It also removes the reference to DT from the error message as the cache hierarchy can be detected from the firmware(OF/DT/ACPI) Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- drivers/base/cacheinfo.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index ecde8957835ad..70e13cf06ed09 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -16,6 +16,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +#include #include #include #include @@ -104,12 +105,16 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf, *sib_leaf; unsigned int index; - int ret; + int ret = 0; if (this_cpu_ci->cpu_map_populated) return 0; - ret = cache_setup_of_node(cpu); + if (of_have_populated_dt()) + ret = cache_setup_of_node(cpu); + else if (!acpi_disabled) + /* No cache property/hierarchy support yet in ACPI */ + ret = -ENOTSUPP; if (ret) return ret; @@ -206,8 +211,7 @@ static int detect_cache_attributes(unsigned int cpu) */ ret = cache_shared_cpu_map_setup(cpu); if (ret) { - pr_warn("Unable to detect cache hierarchy from DT for CPU %d\n", - cpu); + pr_warn("Unable to detect cache hierarchy for CPU %d\n", cpu); goto free_ci; } return 0; From c57664bd12997742da5e12556e328e1ec0b5b654 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:49:51 -0700 Subject: [PATCH 013/247] mm/mmap.c: do not blow on PROT_NONE MAP_FIXED holes in the stack commit 561b5e0709e4a248c67d024d4d94b6e31e3edf2f upstream. Commit 1be7107fbe18 ("mm: larger stack guard gap, between vmas") has introduced a regression in some rust and Java environments which are trying to implement their own stack guard page. They are punching a new MAP_FIXED mapping inside the existing stack Vma. This will confuse expand_{downwards,upwards} into thinking that the stack expansion would in fact get us too close to an existing non-stack vma which is a correct behavior wrt safety. It is a real regression on the other hand. Let's work around the problem by considering PROT_NONE mapping as a part of the stack. This is a gros hack but overflowing to such a mapping would trap anyway an we only can hope that usespace knows what it is doing and handle it propely. Fixes: 1be7107fbe18 ("mm: larger stack guard gap, between vmas") Link: http://lkml.kernel.org/r/20170705182849.GA18027@dhcp22.suse.cz Signed-off-by: Michal Hocko Debugged-by: Vlastimil Babka Cc: Ben Hutchings Cc: Willy Tarreau Cc: Oleg Nesterov Cc: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5b48adb4aa560..45ac5b973459f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2240,7 +2240,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr) { + if (next && next->vm_start < gap_addr && + (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2324,7 +2325,8 @@ int expand_downwards(struct vm_area_struct *vma, if (gap_addr > address) return -ENOMEM; prev = vma->vm_prev; - if (prev && prev->vm_end > gap_addr) { + if (prev && prev->vm_end > gap_addr && + (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(prev->vm_flags & VM_GROWSDOWN)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ From bc0e2174b092638f1146d792a8be71ab90a7e56d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 12 May 2017 15:46:26 -0700 Subject: [PATCH 014/247] hwpoison, memcg: forcibly uncharge LRU pages commit 18365225f0440d09708ad9daade2ec11275c3df9 upstream. Laurent Dufour has noticed that hwpoinsoned pages are kept charged. In his particular case he has hit a bad_page("page still charged to cgroup") when onlining a hwpoison page. While this looks like something that shouldn't happen in the first place because onlining hwpages and returning them to the page allocator makes only little sense it shows a real problem. hwpoison pages do not get freed usually so we do not uncharge them (at least not since commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API")). Each charge pins memcg (since e8ea14cc6ead ("mm: memcontrol: take a css reference for each charged page")) as well and so the mem_cgroup and the associated state will never go away. Fix this leak by forcibly uncharging a LRU hwpoisoned page in delete_from_lru_cache(). We also have to tweak uncharge_list because it cannot rely on zero ref count for these pages. [akpm@linux-foundation.org: coding-style fixes] Fixes: 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") Link: http://lkml.kernel.org/r/20170502185507.GB19165@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Laurent Dufour Tested-by: Laurent Dufour Reviewed-by: Balbir Singh Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 2 +- mm/memory-failure.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2a800c4a39bdb..50088150fc173 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5531,7 +5531,7 @@ static void uncharge_list(struct list_head *page_list) next = page->lru.next; VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); if (!page->mem_cgroup) continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ce7d416edab7f..5aa71a82ca733 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -535,6 +535,13 @@ static int delete_from_lru_cache(struct page *p) */ ClearPageActive(p); ClearPageUnevictable(p); + + /* + * Poisoned page might never drop its ref count to 0 so we have + * to uncharge it manually from its memcg. + */ + mem_cgroup_uncharge(p); + /* * drop the page count elevated by isolate_lru_page() */ From 714c19ef57a5271c14ab6d1b5878f5662a0b3034 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 10 Jul 2017 15:49:44 -0700 Subject: [PATCH 015/247] cma: fix calculation of aligned offset commit e048cb32f69038aa1c8f11e5c1b331be4181659d upstream. The align_offset parameter is used by bitmap_find_next_zero_area_off() to represent the offset of map's base from the previous alignment boundary; the function ensures that the returned index, plus the align_offset, honors the specified align_mask. The logic introduced by commit b5be83e308f7 ("mm: cma: align to physical address, not CMA region position") has the cma driver calculate the offset to the *next* alignment boundary. In most cases, the base alignment is greater than that specified when making allocations, resulting in a zero offset whether we align up or down. In the example given with the commit, the base alignment (8MB) was half the requested alignment (16MB) so the math also happened to work since the offset is 8MB in both directions. However, when requesting allocations with an alignment greater than twice that of the base, the returned index would not be correctly aligned. Also, the align_order arguments of cma_bitmap_aligned_mask() and cma_bitmap_aligned_offset() should not be negative so the argument type was made unsigned. Fixes: b5be83e308f7 ("mm: cma: align to physical address, not CMA region position") Link: http://lkml.kernel.org/r/20170628170742.2895-1-opendmb@gmail.com Signed-off-by: Angus Clark Signed-off-by: Doug Berger Acked-by: Gregory Fong Cc: Doug Berger Cc: Angus Clark Cc: Laura Abbott Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Cc: Lucas Stach Cc: Catalin Marinas Cc: Shiraz Hashim Cc: Jaewon Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Vlastimil Babka Signed-off-by: Greg Kroah-Hartman --- mm/cma.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index c960459eda7e6..397687fc51f97 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -54,7 +54,7 @@ unsigned long cma_get_size(const struct cma *cma) } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, - int align_order) + unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -62,17 +62,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, } /* - * Find a PFN aligned to the specified order and return an offset represented in - * order_per_bits. + * Find the offset of the base PFN from the specified align_order. + * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, - int align_order) + unsigned int align_order) { - if (align_order <= cma->order_per_bit) - return 0; - - return (ALIGN(cma->base_pfn, (1UL << align_order)) - - cma->base_pfn) >> cma->order_per_bit; + return (cma->base_pfn & ((1UL << align_order) - 1)) + >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, From 685cce58f1c2e4fb5dbc891edf3e1bbca217ed3e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Nov 2017 17:38:30 -0800 Subject: [PATCH 016/247] mm, page_alloc: fix potential false positive in __zone_watermark_ok commit b050e3769c6b4013bb937e879fc43bf1847ee819 upstream. Since commit 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations"), __zone_watermark_ok() check for high-order allocations will shortcut per-migratetype free list checks for ALLOC_HARDER allocations, and return true as long as there's free page of any migratetype. The intention is that ALLOC_HARDER can allocate from MIGRATE_HIGHATOMIC free lists, while normal allocations can't. However, as a side effect, the watermark check will then also return true when there are pages only on the MIGRATE_ISOLATE list, or (prior to CMA conversion to ZONE_MOVABLE) on the MIGRATE_CMA list. Since the allocation cannot actually obtain isolated pages, and might not be able to obtain CMA pages, this can result in a false positive. The condition should be rare and perhaps the outcome is not a fatal one. Still, it's better if the watermark check is correct. There also shouldn't be a performance tradeoff here. Link: http://lkml.kernel.org/r/20171102125001.23708-1-vbabka@suse.cz Fixes: 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations") Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Rik van Riel Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fbc38888252b1..546713b3f7620 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2821,9 +2821,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -2835,6 +2832,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } From 542cde0e3cc27bd4d6cbfa596d9278d3c97bc193 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 14 Dec 2016 15:06:07 -0800 Subject: [PATCH 017/247] ipc: msg, make msgrcv work with LONG_MIN commit 999898355e08ae3b92dfd0a08db706e0c6703d30 upstream. When LONG_MIN is passed to msgrcv, one would expect to recieve any message. But convert_mode does *msgtyp = -*msgtyp and -LONG_MIN is undefined. In particular, with my gcc -LONG_MIN produces -LONG_MIN again. So handle this case properly by assigning LONG_MAX to *msgtyp if LONG_MIN was specified as msgtyp to msgrcv. This code: long msg[] = { 100, 200 }; int m = msgget(IPC_PRIVATE, IPC_CREAT | 0644); msgsnd(m, &msg, sizeof(msg), 0); msgrcv(m, &msg, sizeof(msg), LONG_MIN, 0); produces currently nothing: msgget(IPC_PRIVATE, IPC_CREAT|0644) = 65538 msgsnd(65538, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, 0) = 0 msgrcv(65538, ... Except a UBSAN warning: UBSAN: Undefined behaviour in ipc/msg.c:745:13 negation of -9223372036854775808 cannot be represented in type 'long int': With the patch, I see what I expect: msgget(IPC_PRIVATE, IPC_CREAT|0644) = 0 msgsnd(0, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, 0) = 0 msgrcv(0, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, -9223372036854775808, 0) = 16 Link: http://lkml.kernel.org/r/20161024082633.10148-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/msg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ipc/msg.c b/ipc/msg.c index e12307d0c920c..ff10d43b51848 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -763,7 +763,10 @@ static inline int convert_mode(long *msgtyp, int msgflg) if (*msgtyp == 0) return SEARCH_ANY; if (*msgtyp < 0) { - *msgtyp = -*msgtyp; + if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ + *msgtyp = LONG_MAX; + else + *msgtyp = -*msgtyp; return SEARCH_LESSEQUAL; } if (msgflg & MSG_EXCEPT) From 3a53accd9c397f836858defa475720a65b5dd662 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Dec 2016 02:27:31 +0100 Subject: [PATCH 018/247] ACPI / scan: Prefer devices without _HID/_CID for _ADR matching commit c2a6bbaf0c5f90463a7011a295bbdb7e33c80b51 upstream. The way acpi_find_child_device() works currently is that, if there are two (or more) devices with the same _ADR value in the same namespace scope (which is not specifically allowed by the spec and the OS behavior in that case is not defined), the first one of them found to be present (with the help of _STA) will be returned. This covers the majority of cases, but is not sufficient if some of the devices in question have a _HID (or _CID) returning some valid ACPI/PNP device IDs (which is disallowed by the spec) and the ASL writers' expectation appears to be that the OS will match devices without a valid ACPI/PNP device ID against a given bus address first. To cover this special case as well, modify find_child_checks() to prefer devices without ACPI/PNP device IDs over devices that have them. Suggested-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki Tested-by: Hans de Goede Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/glue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 73c9c7fa90014..f06317d6fc38e 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -99,13 +99,13 @@ static int find_child_checks(struct acpi_device *adev, bool check_children) return -ENODEV; /* - * If the device has a _HID (or _CID) returning a valid ACPI/PNP - * device ID, it is better to make it look less attractive here, so that - * the other device with the same _ADR value (that may not have a valid - * device ID) can be matched going forward. [This means a second spec - * violation in a row, so whatever we do here is best effort anyway.] + * If the device has a _HID returning a valid ACPI/PNP device ID, it is + * better to make it look less attractive here, so that the other device + * with the same _ADR value (that may not have a valid device ID) can be + * matched going forward. [This means a second spec violation in a row, + * so whatever we do here is best effort anyway.] */ - return sta_present && list_empty(&adev->pnp.ids) ? + return sta_present && !adev->pnp.type.platform_id ? FIND_CHILD_MAX_SCORE : FIND_CHILD_MIN_SCORE; } From 2915f16bdce204621695e7a0dfcd5f73b120cccb Mon Sep 17 00:00:00 2001 From: Seunghun Han Date: Wed, 26 Apr 2017 16:18:08 +0800 Subject: [PATCH 019/247] ACPICA: Namespace: fix operand cache leak commit 3b2d69114fefa474fca542e51119036dceb4aa6f upstream. ACPICA commit a23325b2e583556eae88ed3f764e457786bf4df6 I found some ACPI operand cache leaks in ACPI early abort cases. Boot log of ACPI operand cache leak is as follows: >[ 0.174332] ACPI: Added _OSI(Module Device) >[ 0.175504] ACPI: Added _OSI(Processor Device) >[ 0.176010] ACPI: Added _OSI(3.0 _SCP Extensions) >[ 0.177032] ACPI: Added _OSI(Processor Aggregator Device) >[ 0.178284] ACPI: SCI (IRQ16705) allocation failed >[ 0.179352] ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20160930/evevent-131) >[ 0.180008] ACPI: Unable to start the ACPI Interpreter >[ 0.181125] ACPI Error: Could not remove SCI handler (20160930/evmisc-281) >[ 0.184068] kmem_cache_destroy Acpi-Operand: Slab cache still has objects >[ 0.185358] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.10.0-rc3 #2 >[ 0.186820] Hardware name: innotek gmb_h virtual_box/virtual_box, BIOS virtual_box 12/01/2006 >[ 0.188000] Call Trace: >[ 0.188000] ? dump_stack+0x5c/0x7d >[ 0.188000] ? kmem_cache_destroy+0x224/0x230 >[ 0.188000] ? acpi_sleep_proc_init+0x22/0x22 >[ 0.188000] ? acpi_os_delete_cache+0xa/0xd >[ 0.188000] ? acpi_ut_delete_caches+0x3f/0x7b >[ 0.188000] ? acpi_terminate+0x5/0xf >[ 0.188000] ? acpi_init+0x288/0x32e >[ 0.188000] ? __class_create+0x4c/0x80 >[ 0.188000] ? video_setup+0x7a/0x7a >[ 0.188000] ? do_one_initcall+0x4e/0x1b0 >[ 0.188000] ? kernel_init_freeable+0x194/0x21a >[ 0.188000] ? rest_init+0x80/0x80 >[ 0.188000] ? kernel_init+0xa/0x100 >[ 0.188000] ? ret_from_fork+0x25/0x30 When early abort is occurred due to invalid ACPI information, Linux kernel terminates ACPI by calling acpi_terminate() function. The function calls acpi_ns_terminate() function to delete namespace data and ACPI operand cache (acpi_gbl_module_code_list). But the deletion code in acpi_ns_terminate() function is wrapped in ACPI_EXEC_APP definition, therefore the code is only executed when the definition exists. If the define doesn't exist, ACPI operand cache (acpi_gbl_module_code_list) is leaked, and stack dump is shown in kernel log. This causes a security threat because the old kernel (<= 4.9) shows memory locations of kernel functions in stack dump, therefore kernel ASLR can be neutralized. To fix ACPI operand leak for enhancing security, I made a patch which removes the ACPI_EXEC_APP define in acpi_ns_terminate() function for executing the deletion code unconditionally. Link: https://github.com/acpica/acpica/commit/a23325b2 Signed-off-by: Seunghun Han Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Acked-by: Lee, Chun-Yi Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpica/nsutils.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c index 691814dfed313..943702dd9517e 100644 --- a/drivers/acpi/acpica/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -594,25 +594,20 @@ struct acpi_namespace_node *acpi_ns_validate_handle(acpi_handle handle) void acpi_ns_terminate(void) { acpi_status status; + union acpi_operand_object *prev; + union acpi_operand_object *next; ACPI_FUNCTION_TRACE(ns_terminate); -#ifdef ACPI_EXEC_APP - { - union acpi_operand_object *prev; - union acpi_operand_object *next; + /* Delete any module-level code blocks */ - /* Delete any module-level code blocks */ - - next = acpi_gbl_module_code_list; - while (next) { - prev = next; - next = next->method.mutex; - prev->method.mutex = NULL; /* Clear the Mutex (cheated) field */ - acpi_ut_remove_reference(prev); - } + next = acpi_gbl_module_code_list; + while (next) { + prev = next; + next = next->method.mutex; + prev->method.mutex = NULL; /* Clear the Mutex (cheated) field */ + acpi_ut_remove_reference(prev); } -#endif /* * Free the entire namespace -- all nodes and all objects From 2c3184ea80322347287bc7e57f782d77f478e73c Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 3 Dec 2017 12:12:45 -0800 Subject: [PATCH 020/247] netfilter: nfnetlink_cthelper: Add missing permission checks commit 4b380c42f7d00a395feede754f0bc2292eebe6e5 upstream. The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, nfnl_cthelper_list is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: $ nfct helper list nfct v1.4.4: netlink error: Operation not permitted $ vpnns -- nfct helper list { .name = ftp, .queuenum = 0, .l3protonum = 2, .l4protonum = 6, .priv_data_len = 24, .status = enabled, }; Add capable() checks in nfnetlink_cthelper, as this is cleaner than trying to generalize the solution. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 28d065394c09c..3f499126727c8 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,9 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -595,6 +599,9 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -661,6 +668,9 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); From 898eeca02a55e354c42a7aa5cdfebf16c3742f44 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 5 Dec 2017 15:42:41 -0800 Subject: [PATCH 021/247] netfilter: xt_osf: Add missing permission checks commit 916a27901de01446bcf57ecca4783f6cff493309 upstream. The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, xt_osf_fingers is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: vpnns -- nfnl_osf -f /tmp/pf.os vpnns -- nfnl_osf -f /tmp/pf.os -d These non-root operations successfully modify the systemwide OS fingerprint list. Add new capable() checks so that they can't. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/xt_osf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 2455b69b58104..b589a62e68a29 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -69,6 +70,9 @@ static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -113,6 +117,9 @@ static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; From b7d25282b75ea7da56ca7b1a11f7a9c4970fc5e0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 22 Jun 2017 16:47:34 -0400 Subject: [PATCH 022/247] reiserfs: fix race in prealloc discard commit 08db141b5313ac2f64b844fb5725b8d81744b417 upstream. The main loop in __discard_prealloc is protected by the reiserfs write lock which is dropped across schedules like the BKL it replaced. The problem is that it checks the value, calls a routine that schedules, and then adjusts the state. As a result, two threads that are calling reiserfs_prealloc_discard at the same time can race when one calls reiserfs_free_prealloc_block, the lock is dropped, and the other calls reiserfs_free_prealloc_block with the same block number. In the right circumstances, it can cause the prealloc count to go negative. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/bitmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index dc198bc64c61d..73705d4bb0696 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -513,9 +513,17 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, "inode has negative prealloc blocks count."); #endif while (ei->i_prealloc_count > 0) { - reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); - ei->i_prealloc_block++; + b_blocknr_t block_to_free; + + /* + * reiserfs_free_prealloc_block can drop the write lock, + * which could allow another caller to free the same block. + * We can protect against it by modifying the prealloc + * state before calling it. + */ + block_to_free = ei->i_prealloc_block++; ei->i_prealloc_count--; + reiserfs_free_prealloc_block(th, inode, block_to_free); dirty = 1; } if (dirty) From 0ccfbd4d6f02ff010dcc59e439f19864ad3a4efa Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 22 Jun 2017 16:35:04 -0400 Subject: [PATCH 023/247] reiserfs: don't preallocate blocks for extended attributes commit 54930dfeb46e978b447af0fb8ab4e181c1bf9d7a upstream. Most extended attributes will fit in a single block. More importantly, we drop the reference to the inode while holding the transaction open so the preallocated blocks aren't released. As a result, the inode may be evicted before it's removed from the transaction's prealloc list which can cause memory corruption. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 73705d4bb0696..edc8ef78b63fc 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1136,7 +1136,7 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) hint->prealloc_size = 0; if (!hint->formatted_node && hint->preallocate) { - if (S_ISREG(hint->inode->i_mode) + if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode) && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options. preallocmin * hint->inode->i_sb->s_blocksize) From 7b50205cf8b9c7bd10572c97f7ca9fecfd9aec7b Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 13 Jun 2017 13:35:51 +0200 Subject: [PATCH 024/247] fs/fcntl: f_setown, avoid undefined behaviour commit fc3dc67471461c0efcb1ed22fb7595121d65fad9 upstream. fcntl(0, F_SETOWN, 0x80000000) triggers: UBSAN: Undefined behaviour in fs/fcntl.c:118:7 negation of -2147483648 cannot be represented in type 'int': CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1 ... Call Trace: ... [] ? f_setown+0x1d8/0x200 [] ? SyS_fcntl+0x999/0xf30 [] ? entry_SYSCALL_64_fastpath+0x23/0xc1 Fix that by checking the arg parameter properly (against INT_MAX) before "who = -who". And return immediatelly with -EINVAL in case it is wrong. Note that according to POSIX we can return EINVAL: http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html [EINVAL] The cmd argument is F_SETOWN and the value of the argument is not valid as a process or process group identifier. [v2] returns an error, v1 used to fail silently [v3] implement proper check for the bad value INT_MIN Signed-off-by: Jiri Slaby Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- fs/fcntl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 1493ceb0477dd..ec03cf620fd7a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -114,6 +114,10 @@ void f_setown(struct file *filp, unsigned long arg, int force) int who = arg; type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return; + type = PIDTYPE_PGID; who = -who; } From c41bb027ed63db9e7a6b2c4c3c5e2e4df1ccb8cf Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 9 Oct 2017 13:33:19 +0200 Subject: [PATCH 025/247] scsi: libiscsi: fix shifting of DID_REQUEUE host byte commit eef9ffdf9cd39b2986367bc8395e2772bc1284ba upstream. The SCSI host byte should be shifted left by 16 in order to have scsi_decide_disposition() do the right thing (.i.e. requeue the command). Signed-off-by: Johannes Thumshirn Fixes: 661134ad3765 ("[SCSI] libiscsi, bnx2i: make bound ep check common") Cc: Lee Duncan Cc: Hannes Reinecke Cc: Bart Van Assche Cc: Chris Leech Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libiscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index a530f08592cde..4abd3fce5ab6c 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1727,7 +1727,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { reason = FAILURE_SESSION_IN_RECOVERY; - sc->result = DID_REQUEUE; + sc->result = DID_REQUEUE << 16; goto fault; } From e62b0c661f65e985327f4bc542688630265b0311 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Jan 2018 15:28:17 +0100 Subject: [PATCH 026/247] Revert "module: Add retpoline tag to VERMAGIC" commit 5132ede0fe8092b043dae09a7cc32b8ae7272baa upstream. This reverts commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12. Turns out distros do not want to make retpoline as part of their "ABI", so this patch should not have been merged. Sorry Andi, this was my fault, I suggested it when your original patch was the "correct" way of doing this instead. Reported-by: Jiri Kosina Fixes: 6cfb521ac0d5 ("module: Add retpoline tag to VERMAGIC") Acked-by: Andi Kleen Cc: Thomas Gleixner Cc: David Woodhouse Cc: rusty@rustcorp.com.au Cc: arjan.van.de.ven@intel.com Cc: jeyu@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/vermagic.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a3d04934aa965..6f8fbcf10dfb8 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -24,16 +24,10 @@ #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif -#ifdef RETPOLINE -#define MODULE_VERMAGIC_RETPOLINE "retpoline " -#else -#define MODULE_VERMAGIC_RETPOLINE "" -#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC \ - MODULE_VERMAGIC_RETPOLINE + MODULE_ARCH_VERMAGIC From 19a7db1e2ef38865a704ea4dfd178b02a8026ada Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:51:51 -0700 Subject: [PATCH 027/247] mm: fix 100% CPU kswapd busyloop on unreclaimable nodes commit c73322d098e4b6f5f0f0fa1330bf57e218775539 upstream. Patch series "mm: kswapd spinning on unreclaimable nodes - fixes and cleanups". Jia reported a scenario in which the kswapd of a node indefinitely spins at 100% CPU usage. We have seen similar cases at Facebook. The kernel's current method of judging its ability to reclaim a node (or whether to back off and sleep) is based on the amount of scanned pages in proportion to the amount of reclaimable pages. In Jia's and our scenarios, there are no reclaimable pages in the node, however, and the condition for backing off is never met. Kswapd busyloops in an attempt to restore the watermarks while having nothing to work with. This series reworks the definition of an unreclaimable node based not on scanning but on whether kswapd is able to actually reclaim pages in MAX_RECLAIM_RETRIES (16) consecutive runs. This is the same criteria the page allocator uses for giving up on direct reclaim and invoking the OOM killer. If it cannot free any pages, kswapd will go to sleep and leave further attempts to direct reclaim invocations, which will either make progress and re-enable kswapd, or invoke the OOM killer. Patch #1 fixes the immediate problem Jia reported, the remainder are smaller fixlets, cleanups, and overall phasing out of the old method. Patch #6 is the odd one out. It's a nice cleanup to get_scan_count(), and directly related to #5, but in itself not relevant to the series. If the whole series is too ambitious for 4.11, I would consider the first three patches fixes, the rest cleanups. This patch (of 9): Jia He reports a problem with kswapd spinning at 100% CPU when requesting more hugepages than memory available in the system: $ echo 4000 >/proc/sys/vm/nr_hugepages top - 13:42:59 up 3:37, 1 user, load average: 1.09, 1.03, 1.01 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.0 us, 12.5 sy, 0.0 ni, 85.5 id, 2.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem: 31371520 total, 30915136 used, 456384 free, 320 buffers KiB Swap: 6284224 total, 115712 used, 6168512 free. 48192 cached Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 76 root 20 0 0 0 0 R 100.0 0.000 217:17.29 kswapd3 At that time, there are no reclaimable pages left in the node, but as kswapd fails to restore the high watermarks it refuses to go to sleep. Kswapd needs to back away from nodes that fail to balance. Up until commit 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") kswapd had such a mechanism. It considered zones whose theoretically reclaimable pages it had reclaimed six times over as unreclaimable and backed away from them. This guard was erroneously removed as the patch changed the definition of a balanced node. However, simply restoring this code wouldn't help in the case reported here: there *are* no reclaimable pages that could be scanned until the threshold is met. Kswapd would stay awake anyway. Introduce a new and much simpler way of backing off. If kswapd runs through MAX_RECLAIM_RETRIES (16) cycles without reclaiming a single page, make it back off from the node. This is the same number of shots direct reclaim takes before declaring OOM. Kswapd will go to sleep on that node until a direct reclaimer manages to reclaim some pages, thus proving the node reclaimable again. [hannes@cmpxchg.org: check kswapd failure against the cumulative nr_reclaimed count] Link: http://lkml.kernel.org/r/20170306162410.GB2090@cmpxchg.org [shakeelb@google.com: fix condition for throttle_direct_reclaim] Link: http://lkml.kernel.org/r/20170314183228.20152-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20170228214007.5621-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Shakeel Butt Reported-by: Jia He Tested-by: Jia He Acked-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Dmitry Shmidt Signed-off-by: Greg Kroah-Hartman --- include/linux/mmzone.h | 2 ++ mm/internal.h | 6 ++++++ mm/page_alloc.c | 9 ++------ mm/vmscan.c | 47 ++++++++++++++++++++++++++++-------------- mm/vmstat.c | 2 +- 5 files changed, 43 insertions(+), 23 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 490f5a83f9476..e3d7754f25f00 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -633,6 +633,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/mm/internal.h b/mm/internal.h index 34a5459e5989e..3e2d016947479 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,6 +73,12 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + /* * in mm/vmscan.c: */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 546713b3f7620..94018ea5f9352 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3421,12 +3421,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return false; } -/* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. @@ -4385,7 +4379,8 @@ void show_free_areas(unsigned int filter) K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 30a88b945a44f..f118dc23f6624 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2606,6 +2606,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2680,10 +2689,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2820,7 +2825,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2828,6 +2833,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone) || @@ -2908,7 +2916,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2930,14 +2938,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -3116,7 +3124,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3130,6 +3138,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; @@ -3216,9 +3228,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3297,7 +3309,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3308,10 +3320,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3511,6 +3527,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; + /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; @@ -3781,9 +3801,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 6a088df04b29d..3863b5d6d598f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1421,7 +1421,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); From 42f0aba58e00aedc395460e621896532d009c64f Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Fri, 19 Jan 2018 09:43:39 -0800 Subject: [PATCH 028/247] Input: trackpoint - force 3 buttons if 0 button is reported commit f5d07b9e98022d50720e38aa936fc11c67868ece upstream. Lenovo introduced trackpoint compatible sticks with minimum PS/2 commands. They supposed to reply with 0x02, 0x03, or 0x04 in response to the "Read Extended ID" command, so we would know not to try certain extended commands. Unfortunately even some trackpoints reporting the original IBM version (0x01 firmware 0x0e) now respond with incorrect data to the "Get Extended Buttons" command: thinkpad_acpi: ThinkPad BIOS R0DET87W (1.87 ), EC unknown thinkpad_acpi: Lenovo ThinkPad E470, model 20H1004SGE psmouse serio2: trackpoint: IBM TrackPoint firmware: 0x0e, buttons: 0/0 Since there are no trackpoints without buttons, let's assume the trackpoint has 3 buttons when we get 0 response to the extended buttons query. Signed-off-by: Aaron Ma Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196253 Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/trackpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 7e2dc5e566320..0b49f29bf0dab 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -383,6 +383,9 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) if (trackpoint_read(&psmouse->ps2dev, TP_EXT_BTN, &button_info)) { psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); button_info = 0x33; + } else if (!button_info) { + psmouse_warn(psmouse, "got 0 in extended button data, assuming 3 buttons\n"); + button_info = 0x33; } psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); From d680db722516dbf2d564d2118127270b88b1b663 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 25 Jan 2018 19:39:44 -0500 Subject: [PATCH 029/247] orangefs: fix deadlock; do not write i_size in read_iter commit 6793f1c450b1533a5e9c2493490de771d38b24f9 upstream. After do_readv_writev, the inode cache is invalidated anyway, so i_size will never be read. It will be fetched from the server which will also know about updates from other machines. Fixes deadlock on 32-bit SMP. See https://marc.info/?l=linux-fsdevel&m=151268557427760&w=2 Signed-off-by: Martin Brandenburg Cc: Al Viro Cc: Mike Marshall Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/orangefs/file.c | 7 ++----- fs/orangefs/orangefs-kernel.h | 11 ----------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 02cc6139ec907..5b2cbe567365c 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -446,7 +446,7 @@ ssize_t orangefs_inode_read(struct inode *inode, static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - loff_t pos = *(&iocb->ki_pos); + loff_t pos = iocb->ki_pos; ssize_t rc = 0; BUG_ON(iocb->private); @@ -485,9 +485,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite } } - if (file->f_pos > i_size_read(file->f_mapping->host)) - orangefs_i_size_write(file->f_mapping->host, file->f_pos); - rc = generic_write_checks(iocb, iter); if (rc <= 0) { @@ -501,7 +498,7 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite * pos to the end of the file, so we will wait till now to set * pos... */ - pos = *(&iocb->ki_pos); + pos = iocb->ki_pos; rc = do_readv_writev(ORANGEFS_IO_WRITE, file, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 45dd8f27b2ac3..f28381a7cd12c 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -570,17 +570,6 @@ do { \ sys_attr.mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \ } while (0) -static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size) -{ -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_lock(inode); -#endif - i_size_write(inode, i_size); -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) - inode_unlock(inode); -#endif -} - static inline void orangefs_set_timeout(struct dentry *dentry) { unsigned long time = jiffies + orangefs_dcache_timeout_msecs*HZ/1000; From 1be7d46e775c2fbec57bbce7190b66fbb7c58d1b Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sun, 20 Aug 2017 13:26:04 +0200 Subject: [PATCH 030/247] um: link vmlinux with -no-pie commit 883354afbc109c57f925ccc19840055193da0cc0 upstream. Debian's gcc defaults to pie. The global Makefile already defines the -fno-pie option. Link UML dynamic kernel image also with -no-pie to fix the build. Signed-off-by: Thomas Meyer Signed-off-by: Richard Weinberger Cc: Bernie Innocenti Signed-off-by: Greg Kroah-Hartman --- arch/um/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/Makefile b/arch/um/Makefile index 0ca46ededfc73..9c150ccb35d22 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -117,7 +117,7 @@ archheaders: archprepare: include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib +LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie) CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ $(call cc-option, -fno-stack-protector,) \ From 9a0be5afbfbb1d14efdc98a6615fc52082243bd1 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 26 Jan 2018 16:23:02 +0000 Subject: [PATCH 031/247] vsyscall: Fix permissions for emulate mode with KAISER/PTI The backport of KAISER to 4.4 turned vsyscall emulate mode into native mode. Add a vsyscall_pgprot variable to hold the correct page protections, like Borislav and Hugh did for 3.2 and 3.18. Cc: Borislav Petkov Cc: Hugh Dickins Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vsyscall/vsyscall_64.c | 7 ++++--- arch/x86/include/asm/vsyscall.h | 1 + arch/x86/mm/kaiser.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 6bb7e92c6d505..0174290b28573 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -46,6 +46,7 @@ static enum { EMULATE, NATIVE, NONE } vsyscall_mode = #else EMULATE; #endif +unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL; static int __init vsyscall_setup(char *str) { @@ -336,11 +337,11 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + if (vsyscall_mode != NATIVE) + vsyscall_pgprot = __PAGE_KERNEL_VVAR; if (vsyscall_mode != NONE) __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + __pgprot(vsyscall_pgprot)); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 4865e10dbb550..9ee85066f4072 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,6 +13,7 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); +extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index a8ade08a9bf5e..ec678aafa3f86 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -344,7 +344,7 @@ void __init kaiser_init(void) if (vsyscall_enabled()) kaiser_add_user_map_early((void *)VSYSCALL_ADDR, PAGE_SIZE, - __PAGE_KERNEL_VSYSCALL); + vsyscall_pgprot); for_each_possible_cpu(cpu) { void *percpu_vaddr = __per_cpu_user_mapped_start + From 5f6c581bcb3cd5a248acbbf8a91930d13d6474f1 Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 8 Mar 2017 19:03:44 +0100 Subject: [PATCH 032/247] eventpoll.h: add missing epoll event masks commit 7e040726850a106587485c21bdacc0bfc8a0cbed upstream. [resend due to me forgetting to cc: linux-api the first time around I posted these back on Feb 23] From: Greg Kroah-Hartman For some reason these values are not in the uapi header file, so any libc has to define it themselves. To prevent them from needing to do this, just have the kernel provide the correct values. Reported-by: Elliott Hughes Signed-off-by: Greg Hackmann Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/eventpoll.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 1c3154913a391..bc96b14dfb2c5 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,19 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Epoll event masks */ +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 + /* Set exclusive wakeup mode for the target file descriptor */ #define EPOLLEXCLUSIVE (1 << 28) From 5bb5ae9718f64c3fe09f2210a7a5628055529425 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 Subject: [PATCH 033/247] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state [ Upstream commit dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 ] ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/ccids/ccid2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 5e3a7302f7747..7753681195c15 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) From 8b0d3e81cdecb92b12c9d7924749a6f62f855790 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:06:42 +0000 Subject: [PATCH 034/247] ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL [ Upstream commit e9191ffb65d8e159680ce0ad2224e1acbde6985c ] Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl setting") removed the initialisation of ipv6_pinfo::autoflowlabel and added a second flag to indicate whether this field or the net namespace default should be used. The getsockopt() handling for this case was not updated, so it currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is not explicitly enabled. Fix it to return the effective value, whether that has been set at the socket or net namespace level. Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...") Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ipv6.h | 1 + net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 615ce0abba9cf..e64210c98c2b5 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -290,6 +290,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 388584b8ff31c..6d000c0001fa2 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -156,7 +156,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { if (!np->autoflowlabel_set) return ip6_default_np_autolabel(net); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 6e3871c7f8f76..bcea985dd76b1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1316,7 +1316,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; default: From fb50d8c9169ee0e90cd7a733d65cb86876d795c7 Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Wed, 10 Jan 2018 12:45:10 -0500 Subject: [PATCH 035/247] ipv6: fix udpv6 sendmsg crash caused by too small MTU [ Upstream commit 749439bfac6e1a2932c582e2699f91d329658196 ] The logic in __ip6_append_data() assumes that the MTU is at least large enough for the headers. A device's MTU may be adjusted after being added while sendmsg() is processing data, resulting in __ip6_append_data() seeing any MTU. For an mtu smaller than the size of the fragmentation header, the math results in a negative 'maxfraglen', which causes problems when refragmenting any previous skb in the skb_write_queue, leaving it possibly malformed. Instead sendmsg returns EINVAL when the mtu is calculated to be less than IPV6_MIN_MTU. Found by syzkaller: kernel BUG at ./include/linux/skbuff.h:2064! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d0b68580 task.stack: ffff8801ac6b8000 RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline] RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216 RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000 RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0 RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000 R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8 R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000 FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_finish_skb include/net/ipv6.h:911 [inline] udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093 udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x352/0x5a0 net/socket.c:1750 SyS_sendto+0x40/0x50 net/socket.c:1718 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9 RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005 RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69 R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000 Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570 RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570 Reported-by: syzbot Signed-off-by: Mike Maloney Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 6d000c0001fa2..af98bbe7af0f1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1260,14 +1260,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; if (dst_allfrag(rt->dst.path)) cork->base.flags |= IPCORK_ALLFRAG; From c2ceff11b46e6eed2a3138811864aaa645754127 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2018 22:31:18 -0800 Subject: [PATCH 036/247] ipv6: ip6_make_skb() needs to clear cork.base.dst [ Upstream commit 95ef498d977bf44ac094778fd448b98af158a3e6 ] In my last patch, I missed fact that cork.base.dst was not initialized in ip6_make_skb() : If ip6_setup_cork() returns an error, we might attempt a dst_release() on some random pointer. Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index af98bbe7af0f1..2e3db36198581 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1800,6 +1800,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); if (err) { From 283498b4ca3534777cd28d17ebffd2e8f3fb0716 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Mon, 15 Jan 2018 13:24:28 -0500 Subject: [PATCH 037/247] lan78xx: Fix failure in USB Full Speed [ Upstream commit a5b1379afbfabf91e3a689e82ac619a7157336b3 ] Fix initialize the uninitialized tx_qlen to an appropriate value when USB Full Speed is used. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/lan78xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 9c257ffedb159..c53385a0052f1 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2197,6 +2197,7 @@ static int lan78xx_reset(struct lan78xx_net *dev) buf = DEFAULT_BURST_CAP_SIZE / FS_USB_PKT_SIZE; dev->rx_urb_size = DEFAULT_BURST_CAP_SIZE; dev->rx_qlen = 4; + dev->tx_qlen = 4; } ret = lan78xx_write_reg(dev, BURST_CAP, buf); From 0ae16964f2154266f411b639ef101869ac3ae0b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 19 Jan 2018 11:50:46 +0100 Subject: [PATCH 038/247] net: igmp: fix source address check for IGMPv3 reports [ Upstream commit ad23b750933ea7bf962678972a286c78a8fa36aa ] Commit "net: igmp: Use correct source address on IGMPv3 reports" introduced a check to validate the source address of locally generated IGMPv3 packets. Instead of checking the local interface address directly, it uses inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the local subnet (or equal to the point-to-point address if used). This breaks for point-to-point interfaces, so check against ifa->ifa_local directly. Cc: Kevin Cernekee Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports") Reported-by: Sebastian Gottschall Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7bff0c65046f8..9c7a4cea1628b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -332,7 +332,7 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, return htonl(INADDR_ANY); for_ifa(in_dev) { - if (inet_ifa_match(fl4->saddr, ifa)) + if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } endfor_ifa(in_dev); From a44d91150f3365968f6d83abfb76c4dd92bda168 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2018 19:59:19 -0800 Subject: [PATCH 039/247] net: qdisc_pkt_len_init() should be more robust [ Upstream commit 7c68d1a6b4db9012790af7ac0f0fdc0d2083422a ] Without proper validation of DODGY packets, we might very well feed qdisc_pkt_len_init() with invalid GSO packets. tcp_hdrlen() might access out-of-bound data, so let's use skb_header_pointer() and proper checks. Whole story is described in commit d0c081b49137 ("flow_dissector: properly cap thoff field") We have the goal of validating DODGY packets earlier in the stack, so we might very well revert this fix in the future. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Jason Wang Reported-by: syzbot+9da69ebac7dddd804552@syzkaller.appspotmail.com Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 09007a71c8dd5..67b5d4d8acb1f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3083,10 +3083,21 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, From cf67be7a1a21c4d15593d7bacff5a59e30749b74 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 18 Jan 2018 16:14:26 -0500 Subject: [PATCH 040/247] net: tcp: close sock if net namespace is exiting [ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ] When a tcp socket is closed, if it detects that its net namespace is exiting, close immediately and do not wait for FIN sequence. For normal sockets, a reference is taken to their net namespace, so it will never exit while the socket is open. However, kernel sockets do not take a reference to their net namespace, so it may begin exiting while the kernel socket is still open. In this case if the kernel socket is a tcp socket, it will stay open trying to complete its close sequence. The sock's dst(s) hold a reference to their interface, which are all transferred to the namespace's loopback interface when the real interfaces are taken down. When the namespace tries to take down its loopback interface, it hangs waiting for all references to the loopback interface to release, which results in messages like: unregister_netdevice: waiting for lo to become free. Usage count = 1 These messages continue until the socket finally times out and closes. Since the net namespace cleanup holds the net_mutex while calling its registered pernet callbacks, any new net namespace initialization is blocked until the current net namespace finishes exiting. After this change, the tcp socket notices the exiting net namespace, and closes immediately, releasing its dst(s) and their reference to the loopback interface, which lets the net namespace continue exiting. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 Signed-off-by: Dan Streetman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/net_namespace.h | 10 ++++++++++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c | 15 +++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 0940598c002ff..23102da24dd93 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -213,6 +213,11 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } +static inline int check_net(const struct net *net) +{ + return atomic_read(&net->count) != 0; +} + void net_drop_ns(void *); #else @@ -237,6 +242,11 @@ int net_eq(const struct net *net1, const struct net *net2) return 1; } +static inline int check_net(const struct net *net) +{ + return 1; +} + #define net_drop_ns NULL #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 05d2bde008646..7efa6b062049a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2215,6 +2215,9 @@ void tcp_close(struct sock *sk, long timeout) tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74db43b479170..69523389f067a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * + * Also close if our net namespace is exiting; in that case there is no + * hope of ever communicating again since all netns interfaces are already + * down (or about to be down), and we need to release our dst references, + * which have been moved to the netns loopback interface, so the namespace + * can finish exiting. This condition is only possible if we are a kernel + * socket, as those do not hold references to the namespace. + * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. + * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } + + if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_done(sk); + return 1; + } + return 0; } From 1bd21b158e07e0b8c5a2ce832305a0ebfe42c480 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 22 Jan 2018 18:06:37 +0100 Subject: [PATCH 041/247] pppoe: take ->needed_headroom of lower device into account on xmit [ Upstream commit 02612bb05e51df8489db5e94d0cf8d1c81f87b0c ] In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom was probably fine before the introduction of ->needed_headroom in commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom"). But now, virtual devices typically advertise the size of their overhead in dev->needed_headroom, so we must also take it into account in skb_reserve(). Allocation size of skb is also updated to take dev->needed_tailroom into account and replace the arbitrary 32 bytes with the real size of a PPPoE header. This issue was discovered by syzbot, who connected a pppoe socket to a gre device which had dev->header_ops->create == ipgre_header and dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any headroom, and dev_hard_header() crashed when ipgre_header() tried to prepend its header to skb->data. skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24 head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted 4.15.0-rc7-next-20180115+ #97 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282 RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000 RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0 R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180 FS: 00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_under_panic net/core/skbuff.c:114 [inline] skb_push+0xce/0xf0 net/core/skbuff.c:1714 ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879 dev_hard_header include/linux/netdevice.h:2723 [inline] pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg+0xca/0x110 net/socket.c:640 sock_write_iter+0x31a/0x5d0 net/socket.c:909 call_write_iter include/linux/fs.h:1775 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 vfs_writev+0x18a/0x340 fs/read_write.c:977 do_writev+0xfc/0x2a0 fs/read_write.c:1012 SYSC_writev fs/read_write.c:1085 [inline] SyS_writev+0x27/0x30 fs/read_write.c:1082 entry_SYSCALL_64_fastpath+0x29/0xa0 Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like interfaces, but reserving space for ->needed_headroom is a more fundamental issue that needs to be addressed first. Same problem exists for __pppoe_xmit(), which also needs to take dev->needed_headroom into account in skb_cow_head(). Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom") Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ppp/pppoe.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4ddae8118c856..dc36c2ec1d103 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -842,6 +842,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, struct pppoe_hdr *ph; struct net_device *dev; char *start; + int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { @@ -860,16 +861,16 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, if (total_len > (dev->mtu + dev->hard_header_len)) goto end; - - skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32, - 0, GFP_KERNEL); + hlen = LL_RESERVED_SPACE(dev); + skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; @@ -930,7 +931,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) /* Copy the data if there is no space for the header or if it's * read-only. */ - if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len)) + if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); From 0f51492d1bd5f7376d8175edd266b808d58df21f Mon Sep 17 00:00:00 2001 From: Francois Romieu Date: Fri, 26 Jan 2018 01:53:26 +0100 Subject: [PATCH 042/247] r8169: fix memory corruption on retrieval of hardware statistics. [ Upstream commit a78e93661c5fd30b9e1dee464b2f62f966883ef7 ] Hardware statistics retrieval hurts in tight invocation loops. Avoid extraneous write and enforce strict ordering of writes targeted to the tally counters dump area address registers. Signed-off-by: Francois Romieu Tested-by: Oliver Freyermuth Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/realtek/r8169.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 2c4350a1c6296..298b74ebc1e90 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2222,19 +2222,14 @@ static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd) void __iomem *ioaddr = tp->mmio_addr; dma_addr_t paddr = tp->counters_phys_addr; u32 cmd; - bool ret; RTL_W32(CounterAddrHigh, (u64)paddr >> 32); + RTL_R32(CounterAddrHigh); cmd = (u64)paddr & DMA_BIT_MASK(32); RTL_W32(CounterAddrLow, cmd); RTL_W32(CounterAddrLow, cmd | counter_cmd); - ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); - - RTL_W32(CounterAddrLow, 0); - RTL_W32(CounterAddrHigh, 0); - - return ret; + return rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); } static bool rtl8169_reset_counters(struct net_device *dev) From 8e3534ea657e8e072e480b4a36c743458c5c852d Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:02:00 +0800 Subject: [PATCH 043/247] sctp: do not allow the v4 socket to bind a v4mapped v6 address [ Upstream commit c5006b8aa74599ce19104b31d322d2ea9ff887cc ] The check in sctp_sockaddr_af is not robust enough to forbid binding a v4mapped v6 addr on a v4 socket. The worse thing is that v4 socket's bind_verify would not convert this v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4 socket bound a v6 addr. This patch is to fix it by doing the common sa.sa_family check first, then AF_INET check for v4mapped v6 addrs. Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.") Reported-by: syzbot+7b7b518b1228d2743963@syzkaller.appspotmail.com Acked-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7181ce6c62bf4..b9260d4029d81 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -332,16 +332,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (len < sizeof (struct sockaddr)) return NULL; + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + /* V4 mapped address are really of AF_INET family */ if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { - if (!opt->pf->af_supported(AF_INET, opt)) - return NULL; - } else { - /* Does this PF support this AF? */ - if (!opt->pf->af_supported(addr->sa.sa_family, opt)) - return NULL; - } + ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); From 2f056e7def4236bf5279da38410c93acee164d5a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:36 +0800 Subject: [PATCH 044/247] sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf [ Upstream commit a0ff660058b88d12625a783ce9e5c1371c87951f ] After commit cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep"), it may change to lock another sk if the asoc has been peeled off in sctp_wait_for_sndbuf. However, the asoc's new sk could be already closed elsewhere, as it's in the sendmsg context of the old sk that can't avoid the new sk's closing. If the sk's last one refcnt is held by this asoc, later on after putting this asoc, the new sk will be freed, while under it's own lock. This patch is to revert that commit, but fix the old issue by returning error under the old sk's lock. Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep") Reported-by: syzbot+ac6ea7baa4432811eb50@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b9260d4029d81..c472b8391dde4 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -83,7 +83,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk); + size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1956,7 +1956,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -7439,12 +7439,12 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk) + size_t msg_len) { struct sock *sk = asoc->base.sk; - int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); + int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); @@ -7473,17 +7473,13 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } out: - *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ From 0e52703d0746ee35326623c5442e9eec0139ffeb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 10 Jan 2018 12:50:25 -0800 Subject: [PATCH 045/247] tipc: fix a memory leak in tipc_nl_node_get_link() [ Upstream commit 59b36613e85fb16ebf9feaf914570879cd5c2a21 ] When tipc_node_find_by_name() fails, the nlmsg is not freed. While on it, switch to a goto label to properly free it. Fixes: be9c086715c ("tipc: narrow down exposure of struct tipc_node") Reported-by: Dmitry Vyukov Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Acked-by: Ying Xue Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/node.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 27753325e06e4..5b3e1ea37b6dd 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1848,36 +1848,38 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); - if (!node) - return -EINVAL; + if (!node) { + err = -EINVAL; + goto err_free; + } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); - nlmsg_free(msg.skb); - return -EINVAL; + err = -EINVAL; + goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); - if (err) { - nlmsg_free(msg.skb); - return err; - } + if (err) + goto err_free; } return genlmsg_reply(msg.skb, info); + +err_free: + nlmsg_free(msg.skb); + return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) From 66c16a22e3b141152c2bc85236b48372b2b1e984 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 22 Jan 2018 16:06:37 -0500 Subject: [PATCH 046/247] vmxnet3: repair memory leak [ Upstream commit 848b159835ddef99cc4193083f7e786c3992f580 ] with the introduction of commit b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info is improperly handled. While it is heap allocated when an rx queue is setup, and freed when torn down, an old line of code in vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0] being set to NULL prior to its being freed, causing a memory leak, which eventually exhausts the system on repeated create/destroy operations (for example, when the mtu of a vmxnet3 interface is changed frequently. Fix is pretty straight forward, just move the NULL set to after the free. Tested by myself with successful results Applies to net, and should likely be queued for stable, please Signed-off-by: Neil Horman Reported-By: boyang@redhat.com CC: boyang@redhat.com CC: Shrikrishna Khare CC: "VMware, Inc." CC: David S. Miller Acked-by: Shrikrishna Khare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index ef83ae3b0a44a..4afba17e24031 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1616,7 +1616,6 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, rq->rx_ring[i].basePA); rq->rx_ring[i].base = NULL; } - rq->buf_info[i] = NULL; } if (rq->data_ring.base) { @@ -1638,6 +1637,7 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, (rq->rx_ring[0].size + rq->rx_ring[1].size); dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0], rq->buf_info_pa); + rq->buf_info[0] = rq->buf_info[1] = NULL; } } From 014510b11781c7ba17b4e2c25720ce15233c712f Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:50 -0800 Subject: [PATCH 047/247] net: Allow neigh contructor functions ability to modify the primary_key [ Upstream commit 096b9854c04df86f03b38a97d40b6506e5730919 ] Use n->primary_key instead of pkey to account for the possibility that a neigh constructor function may have modified the primary_key value. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f45f6198851fa..7b315663f8408 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -496,7 +496,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); @@ -508,7 +508,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; From 260eb694b5a4bd3424a7c445128fb5a784e95e3d Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:51 -0800 Subject: [PATCH 048/247] ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY [ Upstream commit cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 ] Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices to avoid making an entry for every remote ip the device needs to talk to. This used the be the old behavior but became broken in a263b3093641f (ipv4: Make neigh lookups directly in output packet path) and later removed in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point devices) because it was broken. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/arp.h | 3 +++ net/ipv4/arp.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/net/arp.h b/include/net/arp.h index 5e0f891d476c2..1b3f869817578 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -19,6 +19,9 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = INADDR_ANY; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 51b27ae09fbd7..e60517eb1c3a5 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32 *)neigh->primary_key; + __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; + u32 inaddr_any = INADDR_ANY; + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); + + addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { From 00f9e47c6f9d9d25e1bf9cd5f58652d74e36d567 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 10 Jan 2018 16:24:45 +0100 Subject: [PATCH 049/247] ppp: unlock all_ppp_mutex before registering device [ Upstream commit 0171c41835591e9aa2e384b703ef9a6ae367c610 ] ppp_dev_uninit(), which is the .ndo_uninit() handler of PPP devices, needs to lock pn->all_ppp_mutex. Therefore we mustn't call register_netdevice() with pn->all_ppp_mutex already locked, or we'd deadlock in case register_netdevice() fails and calls .ndo_uninit(). Fortunately, we can unlock pn->all_ppp_mutex before calling register_netdevice(). This lock protects pn->units_idr, which isn't used in the device registration process. However, keeping pn->all_ppp_mutex locked during device registration did ensure that no device in transient state would be published in pn->units_idr. In practice, unlocking it before calling register_netdevice() doesn't change this property: ppp_unit_register() is called with 'ppp_mutex' locked and all searches done in pn->units_idr hold this lock too. Fixes: 8cb775bc0a34 ("ppp: fix device unregistration upon netns deletion") Reported-and-tested-by: syzbot+367889b9c9e279219175@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ppp/ppp_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index b883af93929c4..fc4c2ccc3d225 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1002,17 +1002,18 @@ static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set) if (!ifname_is_set) snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ppp->file.index); + mutex_unlock(&pn->all_ppp_mutex); + ret = register_netdevice(ppp->dev); if (ret < 0) goto err_unit; atomic_inc(&ppp_unit_count); - mutex_unlock(&pn->all_ppp_mutex); - return 0; err_unit: + mutex_lock(&pn->all_ppp_mutex); unit_put(&pn->units_idr, ppp->file.index); err: mutex_unlock(&pn->all_ppp_mutex); From 1711ba166e5f7148bc41b5e24f7f282ffc055515 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 19 Jan 2018 20:23:50 +0100 Subject: [PATCH 050/247] be2net: restore properly promisc mode after queues reconfiguration [ Upstream commit 52acf06451930eb4cefabd5ecea56e2d46c32f76 ] The commit 622190669403 ("be2net: Request RSS capability of Rx interface depending on number of Rx rings") modified be_update_queues() so the IFACE (HW representation of the netdevice) is destroyed and then re-created. This causes a regression because potential promiscuous mode is not restored properly during be_open() because the driver thinks that the HW has promiscuous mode already enabled. Note that Lancer is not affected by this bug because RX-filter flags are disabled during be_close() for this chipset. Cc: Sathya Perla Cc: Ajit Khaparde Cc: Sriharsha Basavapatna Cc: Somnath Kotur Fixes: 622190669403 ("be2net: Request RSS capability of Rx interface depending on number of Rx rings") Signed-off-by: Ivan Vecera Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/emulex/benet/be_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 1644896568c44..b2eeecb26939b 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4733,6 +4733,15 @@ int be_update_queues(struct be_adapter *adapter) be_schedule_worker(adapter); + /* + * The IF was destroyed and re-created. We need to clear + * all promiscuous flags valid for the destroyed IF. + * Without this promisc mode is not restored during + * be_open() because the driver thinks that it is + * already enabled in HW. + */ + adapter->if_flags &= ~BE_IF_FLAGS_ALL_PROMISCUOUS; + if (netif_running(netdev)) status = be_open(netdev); From cc99c6d59adf7daf62522b09e25af72267c997c8 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 18 Jan 2018 20:51:12 +0300 Subject: [PATCH 051/247] ip6_gre: init dev->mtu and dev->hard_header_len correctly [ Upstream commit 128bb975dc3c25d00de04e503e2fe0a780d04459 ] Commit b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions") moved dev->mtu initialization from ip6gre_tunnel_setup() to ip6gre_tunnel_init(), as a result, the previously set values, before ndo_init(), are reset in the following cases: * rtnl_create_link() can update dev->mtu from IFLA_MTU parameter. * ip6gre_tnl_link_config() is invoked before ndo_init() in netlink and ioctl setup, so ndo_init() can reset MTU adjustments with the lower device MTU as well, dev->mtu and dev->hard_header_len. Not applicable for ip6gretap because it has one more call to ip6gre_tnl_link_config(tunnel, 1) in ip6gre_tap_init(). Fix the first case by updating dev->mtu with 'tb[IFLA_MTU]' parameter if a user sets it manually on a device creation, and fix the second one by moving ip6gre_tnl_link_config() call after register_netdevice(). Fixes: b05229f44228 ("gre6: Cleanup GREv6 transmit path, call common GRE functions") Fixes: db2ec95d1ba4 ("ip6_gre: Fix MTU setting") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_gre.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c46066c5dc271..db2613b4a0496 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -337,11 +337,12 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, 1); if (register_netdevice(dev) < 0) goto failed_free; + ip6gre_tnl_link_config(nt, 1); + /* Can use a lockless transmit, unless we generate output sequences */ if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; @@ -1263,7 +1264,6 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { - struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); @@ -1272,10 +1272,6 @@ static int ip6gre_tap_init(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); - - ip6gre_tnl_link_config(tunnel, 1); - return 0; } @@ -1370,7 +1366,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->dev = dev; nt->net = dev_net(dev); - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; @@ -1396,6 +1391,11 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, if (err) goto out; + ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); + + if (tb[IFLA_MTU]) + ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + dev_hold(dev); ip6gre_tunnel_link(ign, nt); From 3110e2134c9718a1f3c091873769d1d9b621bb87 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 19 Jan 2018 09:29:18 -0500 Subject: [PATCH 052/247] gso: validate gso_type in GSO handlers [ Upstream commit 121d57af308d0cf943f08f4738d24d3966c38cd9 ] Validate gso_type during segmentation as SKB_GSO_DODGY sources may pass packets where the gso_type does not match the contents. Syzkaller was able to enter the SCTP gso handler with a packet of gso_type SKB_GSO_TCPV4. On entry of transport layer gso handlers, verify that the gso_type matches the transport protocol. Fixes: 90017accff61 ("sctp: Add GSO support") Link: http://lkml.kernel.org/r/<001a1137452496ffc305617e5fe0@google.com> Reported-by: syzbot+fee64147a25aecd48055@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_offload.c | 3 +++ net/ipv4/udp_offload.c | 3 +++ net/ipv6/tcpv6_offload.c | 3 +++ net/ipv6/udp_offload.c | 3 +++ net/sctp/offload.c | 3 +++ 5 files changed, 15 insertions(+) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index bc68da38ea869..366b1becff9d4 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -32,6 +32,9 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6401574cd638a..f4f616eaaeb81 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -205,6 +205,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index d883c9204c01d..278e49cd67d4e 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -46,6 +46,9 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, { struct tcphdr *th; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)) + return ERR_PTR(-EINVAL); + if (!pskb_may_pull(skb, sizeof(*th))) return ERR_PTR(-EINVAL); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e7d378c032cb6..2bd2087bd1051 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -55,6 +55,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, const struct ipv6hdr *ipv6h; struct udphdr *uh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + goto out; + if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 4f5a2b580aa52..6300f28c95888 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -44,6 +44,9 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP)) + goto out; + sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; From 1105145cb3d5eee496de1d55aaea160c78e1c5b5 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 24 Jan 2018 10:02:09 +0100 Subject: [PATCH 053/247] mlxsw: spectrum_router: Don't log an error on missing neighbor [ Upstream commit 1ecdaea02ca6bfacf2ecda500dc1af51e9780c42 ] Driver periodically samples all neighbors configured in device in order to update the kernel regarding their state. When finding an entry configured in HW that doesn't show in neigh_lookup() driver logs an error message. This introduces a race when removing multiple neighbors - it's possible that a given entry would still be configured in HW as its removal is still being processed but is already removed from the kernel's neighbor tables. Simply remove the error message and gracefully accept such events. Fixes: c723c735fa6b ("mlxsw: spectrum_router: Periodically update the kernel's neigh table") Fixes: 60f040ca11b9 ("mlxsw: spectrum_router: Periodically dump active IPv6 neighbours") Signed-off-by: Yuval Mintz Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 8aa91ddff2876..16556011d5711 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -765,11 +765,8 @@ static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp, dipn = htonl(dip); dev = mlxsw_sp->rifs[rif]->dev; n = neigh_lookup(&arp_tbl, &dipn, dev); - if (!n) { - netdev_err(dev, "Failed to find matching neighbour for IP=%pI4h\n", - &dip); + if (!n) return; - } netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip); neigh_event_send(n, NULL); From 18717ee28ef5c0285ff969d1e9357529a8a9233f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 15 Jan 2018 11:37:29 -0800 Subject: [PATCH 054/247] tun: fix a memory leak for tfile->tx_array [ Upstream commit 4df0bfc79904b7169dc77dcce44598b1545721f9 ] tfile->tun could be detached before we close the tun fd, via tun_detach_all(), so it should not be used to check for tfile->tx_array. As Jason suggested, we probably have to clean it up unconditionally both in __tun_deatch() and tun_detach_all(), but this requires to check if it is initialized or not. Currently skb_array_cleanup() doesn't have such a check, so I check it in the caller and introduce a helper function, it is a bit ugly but we can always improve it in net-next. Reported-by: Dmitry Vyukov Fixes: 1576d9860599 ("tun: switch to use skb array for tx") Cc: Jason Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/tun.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 518cbfbc8b652..eb6dc28e5e521 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -525,6 +525,14 @@ static void tun_queue_purge(struct tun_file *tfile) skb_queue_purge(&tfile->sk.sk_error_queue); } +static void tun_cleanup_tx_array(struct tun_file *tfile) +{ + if (tfile->tx_array.ring.queue) { + skb_array_cleanup(&tfile->tx_array); + memset(&tfile->tx_array, 0, sizeof(tfile->tx_array)); + } +} + static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; @@ -566,8 +574,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } - if (tun) - skb_array_cleanup(&tfile->tx_array); + tun_cleanup_tx_array(tfile); sock_put(&tfile->sk); } } @@ -606,11 +613,13 @@ static void tun_detach_all(struct net_device *dev) /* Drop read queue */ tun_queue_purge(tfile); sock_put(&tfile->sk); + tun_cleanup_tx_array(tfile); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_enable_queue(tfile); tun_queue_purge(tfile); sock_put(&tfile->sk); + tun_cleanup_tx_array(tfile); } BUG_ON(tun->numdisabled != 0); @@ -2363,6 +2372,8 @@ static int tun_chr_open(struct inode *inode, struct file * file) sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); + memset(&tfile->tx_array, 0, sizeof(tfile->tx_array)); + return 0; } From eecfa2eeefe3caef4c8d6b2284f6066c76ab26a1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Jan 2018 14:21:13 -0800 Subject: [PATCH 055/247] flow_dissector: properly cap thoff field [ Upstream commit d0c081b49137cd3200f2023c0875723be66e7ce5 ] syzbot reported yet another crash [1] that is caused by insufficient validation of DODGY packets. Two bugs are happening here to trigger the crash. 1) Flow dissection leaves with incorrect thoff field. 2) skb_probe_transport_header() sets transport header to this invalid thoff, even if pointing after skb valid data. 3) qdisc_pkt_len_init() reads out-of-bound data because it trusts tcp_hdrlen(skb) Possible fixes : - Full flow dissector validation before injecting bad DODGY packets in the stack. This approach was attempted here : https://patchwork.ozlabs.org/patch/ 861874/ - Have more robust functions in the core. This might be needed anyway for stable versions. This patch fixes the flow dissection issue. [1] CPU: 1 PID: 3144 Comm: syzkaller271204 Not tainted 4.15.0-rc4-mm1+ #49 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:355 [inline] kasan_report+0x23b/0x360 mm/kasan/report.c:413 __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:432 __tcp_hdrlen include/linux/tcp.h:35 [inline] tcp_hdrlen include/linux/tcp.h:40 [inline] qdisc_pkt_len_init net/core/dev.c:3160 [inline] __dev_queue_xmit+0x20d3/0x2200 net/core/dev.c:3465 dev_queue_xmit+0x17/0x20 net/core/dev.c:3554 packet_snd net/packet/af_packet.c:2943 [inline] packet_sendmsg+0x3ad5/0x60a0 net/packet/af_packet.c:2968 sock_sendmsg_nosec net/socket.c:628 [inline] sock_sendmsg+0xca/0x110 net/socket.c:638 sock_write_iter+0x31a/0x5d0 net/socket.c:907 call_write_iter include/linux/fs.h:1776 [inline] new_sync_write fs/read_write.c:469 [inline] __vfs_write+0x684/0x970 fs/read_write.c:482 vfs_write+0x189/0x510 fs/read_write.c:544 SYSC_write fs/read_write.c:589 [inline] SyS_write+0xef/0x220 fs/read_write.c:581 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 34fad54c2537 ("net: __skb_flow_dissect() must cap its return value") Fixes: a6e544b0a88b ("flow_dissector: Jump to exit code in __skb_flow_dissect") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/flow_dissector.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 32e4e0158846b..862d63ec56e40 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -550,8 +550,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb, out_good: ret = true; - key_control->thoff = (u16)nhoff; out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; @@ -559,7 +559,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, out_bad: ret = false; - key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); goto out; } EXPORT_SYMBOL(__skb_flow_dissect); From dc1932c69835928dcb6259c744043dd03028cdee Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Mon, 22 Jan 2018 14:12:52 +0800 Subject: [PATCH 056/247] perf/x86/amd/power: Do not load AMD power module on !AMD platforms commit 40d4071ce2d20840d224b4a77b5dc6f752c9ab15 upstream. The AMD power module can be loaded on non AMD platforms, but unload fails with the following Oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: __list_del_entry_valid+0x29/0x90 Call Trace: perf_pmu_unregister+0x25/0xf0 amd_power_pmu_exit+0x1c/0xd23 [power] SyS_delete_module+0x1a8/0x2b0 ? exit_to_usermode_loop+0x8f/0xb0 entry_SYSCALL_64_fastpath+0x20/0x83 Return -ENODEV instead of 0 from the module init function if the CPU does not match. Fixes: c7ab62bfbe0e ("perf/x86/amd/power: Add AMD accumulated power reporting mechanism") Signed-off-by: Xiao Liang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20180122061252.6394-1-xiliang@redhat.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/amd/power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 9842270ed2f20..21a4e4127f434 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -277,7 +277,7 @@ static int __init amd_power_pmu_init(void) int ret; if (!x86_match_cpu(cpu_match)) - return 0; + return -ENODEV; if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) return -ENODEV; From 9f3a6cadf494f378b7839de14063793259ad5626 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Tue, 23 Jan 2018 11:41:32 +0100 Subject: [PATCH 057/247] x86/microcode/intel: Extend BDW late-loading further with LLC size check commit 7e702d17ed138cf4ae7c00e8c00681ed464587c7 upstream. Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") reduced the impact of erratum BDF90 for Broadwell model 79. The impact can be reduced further by checking the size of the last level cache portion per core. Tony: "The erratum says the problem only occurs on the large-cache SKUs. So we only need to avoid the update if we are on a big cache SKU that is also running old microcode." For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ac3e636ad586f..f90f17610f627 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -40,6 +40,9 @@ #include #include +/* last level cache size per core */ +static int llc_size_per_core; + /* * Temporary microcode blobs pointers storage. We note here during early load * the pointers to microcode blobs we've got from whatever storage (detached @@ -1053,12 +1056,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 - * may result in a system hang. This behavior is documented in item - * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). */ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X && c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); @@ -1125,6 +1130,15 @@ static struct microcode_ops microcode_intel_ops = { .microcode_fini_cpu = microcode_fini_cpu, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -1135,6 +1149,8 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } From c98ff7299b404f110167883695f81080723e6e15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jan 2018 14:54:32 +0100 Subject: [PATCH 058/247] hrtimer: Reset hrtimer cpu base proper on CPU hotplug commit d5421ea43d30701e03cadc56a38854c36a8b4433 upstream. The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continous retriggering of interrupts which prevent the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paperbag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos Signed-off-by: Greg Kroah-Hartman --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index eeb7f2f5698d5..54fd2fed36e9c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,7 +652,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next.tv64 = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1610,6 +1612,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); return 0; From c964ad34f6d9c5c907e5cc2f1a855d044346aa8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Jan 2018 02:48:54 +0100 Subject: [PATCH 059/247] x86: bpf_jit: small optimization in emit_bpf_tail_call() [ upstream commit 84ccac6e7854ebbfb56d2fc6d5bef9be49bb304c ] Saves 4 bytes replacing following instructions : lea rax, [rsi + rdx * 8 + offsetof(...)] mov rax, qword ptr [rax] cmp rax, 0 by : mov rax, [rsi + rdx * 8 + offsetof(...)] test rax, rax Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/x86/net/bpf_jit_comp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 15f7436159236..ece29e27faa06 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -281,7 +281,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 47 /* number of bytes to jump */ +#define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -290,21 +290,20 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 36 +#define OFFSET2 32 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) * goto out; */ - EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 10 EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; From 5226bb3b95515d7f6f2e1c11ac78b612e0056342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Jan 2018 02:48:55 +0100 Subject: [PATCH 060/247] bpf: fix bpf_tail_call() x64 JIT [ upstream commit 90caccdd8cc0215705f18b92771b449b01e2474a ] - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/x86/net/bpf_jit_comp.c | 4 ++-- kernel/bpf/core.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ece29e27faa06..7840331d30568 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -278,9 +278,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d981541067..ab9576b3bde57 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -715,7 +715,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; From a3d6dd6a66c1bf01a36926705db4687c7d0d4734 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Jan 2018 02:48:56 +0100 Subject: [PATCH 061/247] bpf: introduce BPF_JIT_ALWAYS_ON config [ upstream commit 290af86629b25ffd1ed6232c4e9107da031705cb ] The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- init/Kconfig | 7 +++++++ kernel/bpf/core.c | 18 ++++++++++++++++++ lib/test_bpf.c | 11 +++++++---- net/core/filter.c | 6 ++---- net/core/sysctl_net_core.c | 6 ++++++ net/socket.c | 9 +++++++++ 6 files changed, 49 insertions(+), 8 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d34..b331feeabda42 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1609,6 +1609,13 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + config SHMEM bool "Use full shmem filesystem" if EXPERT default y diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab9576b3bde57..64c4b13952f0f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -458,6 +458,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -923,6 +924,13 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) } STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ +#else +static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -970,7 +978,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON fp->bpf_func = (void *) __bpf_prog_run; +#else + fp->bpf_func = (void *) __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -979,6 +991,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * be JITed, but falls back to the interpreter. */ fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 2e385026915c1..98da7520a6aae 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -5646,9 +5646,8 @@ static struct bpf_prog *generate_filter(int which, int *err) return NULL; } } - /* We don't expect to fail. */ if (*err) { - pr_cont("FAIL to attach err=%d len=%d\n", + pr_cont("FAIL to prog_create err=%d len=%d\n", *err, fprog.len); return NULL; } @@ -5671,6 +5670,10 @@ static struct bpf_prog *generate_filter(int which, int *err) * checks. */ fp = bpf_prog_select_runtime(fp, err); + if (*err) { + pr_cont("FAIL to select_runtime err=%d\n", *err); + return NULL; + } break; } @@ -5856,8 +5859,8 @@ static __init int test_bpf(void) pass_cnt++; continue; } - - return err; + err_cnt++; + continue; } pr_cont("jited:%u ", fp->jited); diff --git a/net/core/filter.c b/net/core/filter.c index 4eb4ce0aeef49..6e0d17bc8382e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1005,11 +1005,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - /* We are guaranteed to never error here with cBPF to eBPF - * transitions, since there's no issue with type compatibility - * checks on program arrays. - */ fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; kfree(old_prog); return fp; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a7f05f0130e83..1b4619008c4ee 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -292,7 +292,13 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, +#ifndef CONFIG_BPF_JIT_ALWAYS_ON .proc_handler = proc_dointvec +#else + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, +#endif }, # ifdef CONFIG_HAVE_EBPF_JIT { diff --git a/net/socket.c b/net/socket.c index 05f13b24572c5..bd3b33988ee01 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2548,6 +2548,15 @@ static int __init sock_init(void) core_initcall(sock_init); /* early initcall */ +static int __init jit_init(void) +{ +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + bpf_jit_enable = 1; +#endif + return 0; +} +pure_initcall(jit_init); + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { From fcabc6d008856356258f86e96bfcf3806acf9f38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 29 Jan 2018 02:48:57 +0100 Subject: [PATCH 062/247] bpf: arsh is not supported in 32 bit alu thus reject it [ upstream commit 7891a87efc7116590eaba57acc3c422487802c6f ] The following snippet was throwing an 'unknown opcode cc' warning in BPF interpreter: 0: (18) r0 = 0x0 2: (7b) *(u64 *)(r10 -16) = r0 3: (cc) (u32) r0 s>>= (u32) r0 4: (95) exit Although a number of JITs do support BPF_ALU | BPF_ARSH | BPF_{K,X} generation, not all of them do and interpreter does neither. We can leave existing ones and implement it later in bpf-next for the remaining ones, but reject this properly in verifier for the time being. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Reported-by: syzbot+93c4904c5c70348a6890@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 19c44cf59bb23..8e0bf890ffb7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1843,6 +1843,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose("BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; From 5cb917aa1f1e03df9a4c29b363e3900d73508fa8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 29 Jan 2018 02:48:58 +0100 Subject: [PATCH 063/247] bpf: avoid false sharing of map refcount with max_entries [ upstream commit be95a845cc4402272994ce290e3ad928aff06cb9 ] In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds speculation") also change the layout of struct bpf_map such that false sharing of fast-path members like max_entries is avoided when the maps reference counter is altered. Therefore enforce them to be placed into separate cachelines. pahole dump after change: struct bpf_map { const struct bpf_map_ops * ops; /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ u32 pages; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ bool unpriv_array; /* 56 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct user_struct * user; /* 64 8 */ atomic_t refcnt; /* 72 4 */ atomic_t usercnt; /* 76 4 */ struct work_struct work; /* 80 32 */ char name[16]; /* 112 16 */ /* --- cacheline 2 boundary (128 bytes) --- */ /* size: 128, cachelines: 2, members: 17 */ /* sum members: 121, holes: 1, sum holes: 7 */ }; Now all entries in the first cacheline are read only throughout the life time of the map, set up once during map creation. Overall struct size and number of cachelines doesn't change from the reordering. struct bpf_map is usually first member and embedded in map structs in specific map implementations, so also avoid those members to sit at the end where it could potentially share the cacheline with first map values e.g. in the array since remote CPUs could trigger map updates just as well for those (easily dirtying members like max_entries intentionally as well) while having subsequent values in cache. Quoting from Google's Project Zero blog [1]: Additionally, at least on the Intel machine on which this was tested, bouncing modified cache lines between cores is slow, apparently because the MESI protocol is used for cache coherence [8]. Changing the reference counter of an eBPF array on one physical CPU core causes the cache line containing the reference counter to be bounced over to that CPU core, making reads of the reference counter on all other CPU cores slow until the changed reference counter has been written back to memory. Because the length and the reference counter of an eBPF array are stored in the same cache line, this also means that changing the reference counter on one physical CPU core causes reads of the eBPF array's length to be slow on other physical CPU cores (intentional false sharing). While this doesn't 'control' the out-of-bounds speculation through masking the index as in commit b2157399cc98, triggering a manipulation of the map's reference counter is really trivial, so lets not allow to easily affect max_entries from it. Splitting to separate cachelines also generally makes sense from a performance perspective anyway in that fast-path won't have a cache miss if the map gets pinned, reused in other progs, etc out of control path, thus also avoids unintentional false sharing. [1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75ffd3b2149e9..7995940d41877 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,7 +36,10 @@ struct bpf_map_ops { }; struct bpf_map { - atomic_t refcnt; + /* 1st cacheline with read-mostly members of which some + * are also accessed in fast-path (e.g. ops, max_entries). + */ + const struct bpf_map_ops *ops ____cacheline_aligned; enum bpf_map_type map_type; u32 key_size; u32 value_size; @@ -44,10 +47,15 @@ struct bpf_map { u32 map_flags; u32 pages; bool unpriv_array; - struct user_struct *user; - const struct bpf_map_ops *ops; - struct work_struct work; + /* 7 bytes hole */ + + /* 2nd cacheline with misc members to avoid false sharing + * particularly with refcounting. + */ + struct user_struct *user ____cacheline_aligned; + atomic_t refcnt; atomic_t usercnt; + struct work_struct work; }; struct bpf_map_type_list { From 4606077802f2c6ef7aff5185d9f7d99a50784ffd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Jan 2018 02:48:59 +0100 Subject: [PATCH 064/247] bpf: fix divides by zero [ upstream commit c366287ebd698ef5e3de300d90cd62ee9ee7373e ] Divides by zero are not nice, lets avoid them if possible. Also do_div() seems not needed when dealing with 32bit operands, but this seems a minor detail. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 64c4b13952f0f..879ca844ba1d3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -642,7 +642,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -661,7 +661,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); From 265d7657c9baf09d57eb386d0374e912e9649626 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Jan 2018 02:49:00 +0100 Subject: [PATCH 065/247] bpf: fix 32-bit divide by zero [ upstream commit 68fda450a7df51cff9e5a4d4a4d9d0d5f2589153 ] due to some JITs doing if (src_reg == 0) check in 64-bit mode for div/mod operations mask upper 32-bits of src register before doing the check Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") Reported-by: syzbot+48340bb518e88849e2e3@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 18 ++++++++++++++++++ net/core/filter.c | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e0bf890ffb7e..5c56ff9eeed25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3391,6 +3391,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; diff --git a/net/core/filter.c b/net/core/filter.c index 6e0d17bc8382e..e8c89d2d2bc04 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -441,6 +441,10 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, convert_bpf_extensions(fp, &insn)) break; + if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; From f531fbb06a56361d4a9807bbb16db4facc8537d3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 29 Jan 2018 02:49:01 +0100 Subject: [PATCH 066/247] bpf: reject stores into ctx via st and xadd [ upstream commit f37a8cb84cce18762e8f86a70bd6a49a66ab964c ] Alexei found that verifier does not reject stores into context via BPF_ST instead of BPF_STX. And while looking at it, we also should not allow XADD variant of BPF_STX. The context rewriter is only assuming either BPF_LDX_MEM- or BPF_STX_MEM-type operations, thus reject anything other than that so that assumptions in the rewriter properly hold. Add test cases as well for BPF selftests. Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c56ff9eeed25..076e4a0ff95e7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -702,6 +702,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = &env->cur_state.regs[regno]; + + return reg->type == PTR_TO_CTX; +} + static int check_ptr_alignment(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int off, int size) { @@ -896,6 +903,12 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose("BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -3012,6 +3025,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose("BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, From f12d0602633decf073796f3aaa59eec7ff2da9e2 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:11:06 +0000 Subject: [PATCH 067/247] nfsd: auth: Fix gid sorting when rootsquash enabled commit 1995266727fa8143897e89b55f5d3c79aa828420 upstream. Commit bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility group_info allocators") appears to break nfsd rootsquash in a pretty major way. It adds a call to groups_sort() inside the loop that copies/squashes gids, which means the valid gids are sorted along with the following garbage. The net result is that the highest numbered valid gids are replaced with any lower-valued garbage gids, possibly including 0. We should sort only once, after filling in all the gids. Fixes: bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility ...") Signed-off-by: Ben Hutchings Acked-by: J. Bruce Fields Signed-off-by: Linus Torvalds Cc: Wolfgang Walter Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/auth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 75f942ae51765..81c018e5c31e5 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -59,10 +59,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; - - /* Each thread allocates its own gi, no race */ - groups_sort(gi); } + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } else { gi = get_group_info(rqgi); } From 6c6f924f9c6294944ee6efb1bbd8cdb853582e50 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 31 Jan 2018 12:55:57 +0100 Subject: [PATCH 068/247] Linux 4.9.79 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8a6f158a11768..4a7e6dff1c2e7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 78 +SUBLEVEL = 79 EXTRAVERSION = NAME = Roaring Lionus From 56bc086358cac1a2949783646eabd57447b9d672 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jan 2018 16:26:00 -0800 Subject: [PATCH 069/247] loop: fix concurrent lo_open/lo_release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 upstream. 范龙飞 reports that KASAN can report a use-after-free in __lock_acquire. The reason is due to insufficient serialization in lo_release(), which will continue to use the loop device even after it has decremented the lo_refcnt to zero. In the meantime, another process can come in, open the loop device again as it is being shut down. Confusion ensues. Reported-by: 范龙飞 Signed-off-by: Linus Torvalds Signed-off-by: Jens Axboe Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/block/loop.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 24d6cefceb327..402254d262474 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1558,9 +1558,8 @@ static int lo_open(struct block_device *bdev, fmode_t mode) return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void __lo_release(struct loop_device *lo) { - struct loop_device *lo = disk->private_data; int err; if (atomic_dec_return(&lo->lo_refcnt)) @@ -1586,6 +1585,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&lo->lo_ctl_mutex); } +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + mutex_lock(&loop_index_mutex); + __lo_release(disk->private_data); + mutex_unlock(&loop_index_mutex); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, From 20e6f5bdf542cb2510bd54e69eed6fe395f6c026 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 28 May 2017 10:53:10 +0300 Subject: [PATCH 070/247] net/mlx5: Define interface bits for fencing UMR wqe commit 1410a90ae449061b7e1ae19d275148f36948801b upstream. HW can implement UMR wqe re-transmission in various ways. Thus, add HCA cap to distinguish the needed fence for UMR to make sure that the wqe wouldn't fail on mkey checks. Signed-off-by: Max Gurtovoy Acked-by: Leon Romanovsky Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford Cc: Marta Rybczynska Signed-off-by: Greg Kroah-Hartman --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 25ed105bbcfb4..20ee90c47cd50 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -737,6 +737,12 @@ enum { MLX5_CAP_PORT_TYPE_ETH = 0x1, }; +enum { + MLX5_CAP_UMR_FENCE_STRONG = 0x0, + MLX5_CAP_UMR_FENCE_SMALL = 0x1, + MLX5_CAP_UMR_FENCE_NONE = 0x2, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x80]; @@ -838,7 +844,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 striding_rq[0x1]; u8 reserved_at_201[0x2]; u8 ipoib_basic_offloads[0x1]; - u8 reserved_at_205[0xa]; + u8 reserved_at_205[0x5]; + u8 umr_fence[0x2]; + u8 reserved_at_20c[0x3]; u8 drain_sigerr[0x1]; u8 cmdif_checksum[0x2]; u8 sigerr_cqe[0x1]; From 2a7076e71575b741f61f33f93be5ea284242a603 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 28 May 2017 10:53:11 +0300 Subject: [PATCH 071/247] RDMA/mlx5: set UMR wqe fence according to HCA cap commit 6e8484c5cf07c7ee632587e98c1a12d319dacb7c upstream. Cache the needed umr_fence and set the wqe ctrl segmennt accordingly. Signed-off-by: Max Gurtovoy Acked-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford Cc: Marta Rybczynska Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx5/main.c | 14 +++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +- drivers/infiniband/hw/mlx5/qp.c | 59 +++++++++++----------------- 3 files changed, 39 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a2120ff0ef4c9..5e29fbd3a5a0b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2575,6 +2575,18 @@ static int create_umr_res(struct mlx5_ib_dev *dev) return ret; } +static u8 mlx5_get_umr_fence(u8 umr_fence_cap) +{ + switch (umr_fence_cap) { + case MLX5_CAP_UMR_FENCE_NONE: + return MLX5_FENCE_MODE_NONE; + case MLX5_CAP_UMR_FENCE_SMALL: + return MLX5_FENCE_MODE_INITIATOR_SMALL; + default: + return MLX5_FENCE_MODE_STRONG_ORDERING; + } +} + static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; @@ -3101,6 +3113,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mlx5_ib_internal_fill_odp_caps(dev); + dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence)); + if (MLX5_CAP_GEN(mdev, imaicl)) { dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 86e1e08125ff2..d5cc954e8ac2d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -345,7 +345,7 @@ struct mlx5_ib_qp { struct mlx5_ib_wq rq; u8 sq_signal_bits; - u8 fm_cache; + u8 next_fence; struct mlx5_ib_wq sq; /* serialize qp state modifications @@ -643,6 +643,7 @@ struct mlx5_ib_dev { struct list_head qp_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; + u8 umr_fence; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2665414b48750..fdd156101a72a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3755,24 +3755,6 @@ static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, } } -static u8 get_fence(u8 fence, struct ib_send_wr *wr) -{ - if (unlikely(wr->opcode == IB_WR_LOCAL_INV && - wr->send_flags & IB_SEND_FENCE)) - return MLX5_FENCE_MODE_STRONG_ORDERING; - - if (unlikely(fence)) { - if (wr->send_flags & IB_SEND_FENCE) - return MLX5_FENCE_MODE_SMALL_AND_FENCE; - else - return fence; - } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) { - return MLX5_FENCE_MODE_FENCE; - } - - return 0; -} - static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, struct mlx5_wqe_ctrl_seg **ctrl, struct ib_send_wr *wr, unsigned *idx, @@ -3801,8 +3783,7 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, static void finish_wqe(struct mlx5_ib_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, u8 size, unsigned idx, u64 wr_id, - int nreq, u8 fence, u8 next_fence, - u32 mlx5_opcode) + int nreq, u8 fence, u32 mlx5_opcode) { u8 opmod = 0; @@ -3810,7 +3791,6 @@ static void finish_wqe(struct mlx5_ib_qp *qp, mlx5_opcode | ((u32)opmod << 24)); ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; - qp->fm_cache = next_fence; if (unlikely(qp->wq_sig)) ctrl->signature = wq_sig(ctrl); @@ -3870,7 +3850,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } - fence = qp->fm_cache; num_sge = wr->num_sge; if (unlikely(num_sge > qp->sq.max_gs)) { mlx5_ib_warn(dev, "\n"); @@ -3887,6 +3866,19 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } + if (wr->opcode == IB_WR_LOCAL_INV || + wr->opcode == IB_WR_REG_MR) { + fence = dev->umr_fence; + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + } else if (wr->send_flags & IB_SEND_FENCE) { + if (qp->next_fence) + fence = MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + fence = MLX5_FENCE_MODE_FENCE; + } else { + fence = qp->next_fence; + } + switch (ibqp->qp_type) { case IB_QPT_XRC_INI: xrc = seg; @@ -3913,7 +3905,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; case IB_WR_LOCAL_INV: - next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); set_linv_wr(qp, &seg, &size); @@ -3921,7 +3912,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_WR_REG_MR: - next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; qp->sq.wr_data[idx] = IB_WR_REG_MR; ctrl->imm = cpu_to_be32(reg_wr(wr)->key); err = set_reg_wr(qp, reg_wr(wr), &seg, &size); @@ -3944,9 +3934,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } - finish_wqe(qp, ctrl, size, idx, wr->wr_id, - nreq, get_fence(fence, wr), - next_fence, MLX5_OPCODE_UMR); + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_UMR); /* * SET_PSV WQEs are not signaled and solicited * on error @@ -3971,9 +3960,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } - finish_wqe(qp, ctrl, size, idx, wr->wr_id, - nreq, get_fence(fence, wr), - next_fence, MLX5_OPCODE_SET_PSV); + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_SET_PSV); err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); if (err) { @@ -3983,7 +3971,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } - next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, mr->sig->psv_wire.psv_idx, &seg, &size); @@ -3993,9 +3980,9 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, goto out; } - finish_wqe(qp, ctrl, size, idx, wr->wr_id, - nreq, get_fence(fence, wr), - next_fence, MLX5_OPCODE_SET_PSV); + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_SET_PSV); + qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; num_sge = 0; goto skip_psv; @@ -4100,8 +4087,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } } - finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, - get_fence(fence, wr), next_fence, + qp->next_fence = next_fence; + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, fence, mlx5_ib_opcode[wr->opcode]); skip_psv: if (0) From efe3f94f83d2b03abf2fc071ec59bc668761bac5 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 21 Dec 2017 11:11:31 +1030 Subject: [PATCH 072/247] tools/gpio: Fix build error with musl libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1696784eb7b52b13b62d160c028ef2c2c981d4f2 upstream. The GPIO tools build fails when using a buildroot toolchain that uses musl as it's C library: arm-broomstick-linux-musleabi-gcc -Wp,-MD,./.gpio-event-mon.o.d \ -Wp,-MT,gpio-event-mon.o -O2 -Wall -g -D_GNU_SOURCE \ -Iinclude -D"BUILD_STR(s)=#s" -c -o gpio-event-mon.o gpio-event-mon.c gpio-event-mon.c:30:6: error: unknown type name ‘u_int32_t’; did you mean ‘uint32_t’? u_int32_t handleflags, ^~~~~~~~~ uint32_t The glibc headers installed on my laptop include sys/types.h in unistd.h, but it appears that musl does not. Fixes: 97f69747d8b1 ("tools/gpio: add the gpio-event-mon tool") Signed-off-by: Joel Stanley Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- tools/gpio/gpio-event-mon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/gpio/gpio-event-mon.c b/tools/gpio/gpio-event-mon.c index 1c14c25951583..4b36323ea64bb 100644 --- a/tools/gpio/gpio-event-mon.c +++ b/tools/gpio/gpio-event-mon.c @@ -23,6 +23,7 @@ #include #include #include +#include #include int monitor_device(const char *device_name, From 241c04f75e90b6e0eaf099d53c68a77ed5767835 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Fri, 12 Jan 2018 13:16:08 +0100 Subject: [PATCH 073/247] gpio: stmpe: i2c transfer are forbiden in atomic context commit b888fb6f2a278442933e3bfab70262e9a5365fb3 upstream. Move the workaround from stmpe_gpio_irq_unmask() which is executed in atomic context to stmpe_gpio_irq_sync_unlock() which is not. It fixes the following issue: [ 1.500000] BUG: scheduling while atomic: swapper/1/0x00000002 [ 1.500000] CPU: 0 PID: 1 Comm: swapper Not tainted 4.15.0-rc2-00020-gbd4301f-dirty #28 [ 1.520000] Hardware name: STM32 (Device Tree Support) [ 1.520000] [<0000bfc9>] (unwind_backtrace) from [<0000b347>] (show_stack+0xb/0xc) [ 1.530000] [<0000b347>] (show_stack) from [<0001fc49>] (__schedule_bug+0x39/0x58) [ 1.530000] [<0001fc49>] (__schedule_bug) from [<00168211>] (__schedule+0x23/0x2b2) [ 1.550000] [<00168211>] (__schedule) from [<001684f7>] (schedule+0x57/0x64) [ 1.550000] [<001684f7>] (schedule) from [<0016a513>] (schedule_timeout+0x137/0x164) [ 1.550000] [<0016a513>] (schedule_timeout) from [<00168b91>] (wait_for_common+0x8d/0xfc) [ 1.570000] [<00168b91>] (wait_for_common) from [<00139753>] (stm32f4_i2c_xfer+0xe9/0xfe) [ 1.580000] [<00139753>] (stm32f4_i2c_xfer) from [<00138545>] (__i2c_transfer+0x111/0x148) [ 1.590000] [<00138545>] (__i2c_transfer) from [<001385cf>] (i2c_transfer+0x53/0x70) [ 1.590000] [<001385cf>] (i2c_transfer) from [<001388a5>] (i2c_smbus_xfer+0x12f/0x36e) [ 1.600000] [<001388a5>] (i2c_smbus_xfer) from [<00138b49>] (i2c_smbus_read_byte_data+0x1f/0x2a) [ 1.610000] [<00138b49>] (i2c_smbus_read_byte_data) from [<00124fdd>] (__stmpe_reg_read+0xd/0x24) [ 1.620000] [<00124fdd>] (__stmpe_reg_read) from [<001252b3>] (stmpe_reg_read+0x19/0x24) [ 1.630000] [<001252b3>] (stmpe_reg_read) from [<0002c4d1>] (unmask_irq+0x17/0x22) [ 1.640000] [<0002c4d1>] (unmask_irq) from [<0002c57f>] (irq_startup+0x6f/0x78) [ 1.650000] [<0002c57f>] (irq_startup) from [<0002b7a1>] (__setup_irq+0x319/0x47c) [ 1.650000] [<0002b7a1>] (__setup_irq) from [<0002bad3>] (request_threaded_irq+0x6b/0xe8) [ 1.660000] [<0002bad3>] (request_threaded_irq) from [<0002d0b9>] (devm_request_threaded_irq+0x3b/0x6a) [ 1.670000] [<0002d0b9>] (devm_request_threaded_irq) from [<001446e7>] (mmc_gpiod_request_cd_irq+0x49/0x8a) [ 1.680000] [<001446e7>] (mmc_gpiod_request_cd_irq) from [<0013d45d>] (mmc_start_host+0x49/0x60) [ 1.690000] [<0013d45d>] (mmc_start_host) from [<0013e40b>] (mmc_add_host+0x3b/0x54) [ 1.700000] [<0013e40b>] (mmc_add_host) from [<00148119>] (mmci_probe+0x4d1/0x60c) [ 1.710000] [<00148119>] (mmci_probe) from [<000f903b>] (amba_probe+0x7b/0xbe) [ 1.720000] [<000f903b>] (amba_probe) from [<001170e5>] (driver_probe_device+0x169/0x1f8) [ 1.730000] [<001170e5>] (driver_probe_device) from [<001171b7>] (__driver_attach+0x43/0x5c) [ 1.740000] [<001171b7>] (__driver_attach) from [<0011618d>] (bus_for_each_dev+0x3d/0x46) [ 1.740000] [<0011618d>] (bus_for_each_dev) from [<001165cd>] (bus_add_driver+0xcd/0x124) [ 1.740000] [<001165cd>] (bus_add_driver) from [<00117713>] (driver_register+0x4d/0x7a) [ 1.760000] [<00117713>] (driver_register) from [<001fc765>] (do_one_initcall+0xbd/0xe8) [ 1.770000] [<001fc765>] (do_one_initcall) from [<001fc88b>] (kernel_init_freeable+0xfb/0x134) [ 1.780000] [<001fc88b>] (kernel_init_freeable) from [<00167ee3>] (kernel_init+0x7/0x9c) [ 1.790000] [<00167ee3>] (kernel_init) from [<00009b65>] (ret_from_fork+0x11/0x2c) Signed-off-by: Alexandre TORGUE Signed-off-by: Patrice Chotard Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-stmpe.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index adba614b39656..abb5a27525110 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -190,6 +190,16 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) }; int i, j; + /* + * STMPE1600: to be able to get IRQ from pins, + * a read must be done on GPMR register, or a write in + * GPSR or GPCR registers + */ + if (stmpe->partnum == STMPE1600) { + stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_LSB]); + stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_CSB]); + } + for (i = 0; i < CACHE_NR_REGS; i++) { /* STMPE801 and STMPE1600 don't have RE and FE registers */ if ((stmpe->partnum == STMPE801 || @@ -227,21 +237,11 @@ static void stmpe_gpio_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct stmpe_gpio *stmpe_gpio = gpiochip_get_data(gc); - struct stmpe *stmpe = stmpe_gpio->stmpe; int offset = d->hwirq; int regoffset = offset / 8; int mask = BIT(offset % 8); stmpe_gpio->regs[REG_IE][regoffset] |= mask; - - /* - * STMPE1600 workaround: to be able to get IRQ from pins, - * a read must be done on GPMR register, or a write in - * GPSR or GPCR registers - */ - if (stmpe->partnum == STMPE1600) - stmpe_reg_read(stmpe, - stmpe->regs[STMPE_IDX_GPMR_LSB + regoffset]); } static void stmpe_dbg_show_one(struct seq_file *s, From cc1fa4a7b653a4f0a4c95a26ce842d699e8e4b1a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 22 Jan 2018 13:19:28 +0100 Subject: [PATCH 074/247] gpio: Fix kernel stack leak to userspace commit 24bd3efc9d1efb5f756a7c6f807a36ddb6adc671 upstream. The GPIO event descriptor was leaking kernel stack to userspace because we don't zero the variable before use. Ooops. Fix this. Reported-by: Arnd Bergmann Reviewed-by: Bartosz Golaszewski Reviewed-by: Arnd Bergmann Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpiolib.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 063d176baa24a..f3c3680963b94 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -705,6 +705,9 @@ static irqreturn_t lineevent_irq_thread(int irq, void *p) struct gpioevent_data ge; int ret, level; + /* Do not leak kernel stack to userspace */ + memset(&ge, 0, sizeof(ge)); + ge.timestamp = ktime_get_real_ns(); level = gpiod_get_value_cansleep(le->desc); From f1803207b5eab7fc9540f633b3bb73a75ad82494 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Sun, 26 Nov 2017 00:16:46 +0100 Subject: [PATCH 075/247] crypto: ecdh - fix typo in KPP dependency of CRYPTO_ECDH commit b5b9007730ce1d90deaf25d7f678511550744bdc upstream. This fixes a typo in the CRYPTO_KPP dependency of CRYPTO_ECDH. Fixes: 3c4b23901a0c ("crypto: ecdh - Add ECDH software support") Signed-off-by: Hauke Mehrtens Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 84d71482bf080..ab0d93ab56952 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -120,7 +120,7 @@ config CRYPTO_DH config CRYPTO_ECDH tristate "ECDH algorithm" - select CRYTPO_KPP + select CRYPTO_KPP help Generic implementation of the ECDH algorithm From 95259cb008ba7ec88d3869db33377a111a6d9dee Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 18 Jan 2018 20:41:09 +0100 Subject: [PATCH 076/247] crypto: aesni - handle zero length dst buffer commit 9c674e1e2f9e24fa4392167efe343749008338e0 upstream. GCM can be invoked with a zero destination buffer. This is possible if the AAD and the ciphertext have zero lengths and only the tag exists in the source buffer (i.e. a source buffer cannot be zero). In this case, the GCM cipher only performs the authentication and no decryption operation. When the destination buffer has zero length, it is possible that no page is mapped to the SG pointing to the destination. In this case, sg_page(req->dst) is an invalid access. Therefore, page accesses should only be allowed if the req->dst->length is non-zero which is the indicator that a page must exist. This fixes a crash that can be triggered by user space via AF_ALG. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/aesni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index aa8b0672f87a4..d9ae404f08c94 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -906,7 +906,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) if (sg_is_last(req->src) && req->src->offset + req->src->length <= PAGE_SIZE && - sg_is_last(req->dst) && ++ sg_is_last(req->dst) && req->dst->length && req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); From 1ce8e52f6f368f9d62b6320357bf7c26a1480ecb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 19 Jan 2018 12:04:33 +0000 Subject: [PATCH 077/247] crypto: sha3-generic - fixes for alignment and big endian operation commit c013cee99d5a18aec8c71fee8f5f41369cd12595 upstream. Ensure that the input is byte swabbed before injecting it into the SHA3 transform. Use the get_unaligned() accessor for this so that we don't perform unaligned access inadvertently on architectures that do not support that. Fixes: 53964b9ee63b7075 ("crypto: sha3 - Add SHA-3 hash algorithm") Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/sha3_generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c index 7e8ed96236cef..a68be626017c2 100644 --- a/crypto/sha3_generic.c +++ b/crypto/sha3_generic.c @@ -18,6 +18,7 @@ #include #include #include +#include #define KECCAK_ROUNDS 24 @@ -149,7 +150,7 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, unsigned int i; for (i = 0; i < sctx->rsizw; i++) - sctx->st[i] ^= ((u64 *) src)[i]; + sctx->st[i] ^= get_unaligned_le64(src + 8 * i); keccakf(sctx->st); done += sctx->rsiz; @@ -174,7 +175,7 @@ static int sha3_final(struct shash_desc *desc, u8 *out) sctx->buf[sctx->rsiz - 1] |= 0x80; for (i = 0; i < sctx->rsizw; i++) - sctx->st[i] ^= ((u64 *) sctx->buf)[i]; + sctx->st[i] ^= get_unaligned_le64(sctx->buf + 8 * i); keccakf(sctx->st); From b7edc45f3adace20d74788dd4d5a085c134ba6be Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Tue, 2 Jan 2018 08:55:25 +0100 Subject: [PATCH 078/247] crypto: af_alg - whitelist mask and type commit bb30b8848c85e18ca7e371d0a869e94b3e383bdf upstream. The user space interface allows specifying the type and mask field used to allocate the cipher. Only a subset of the possible flags are intended for user space. Therefore, white-list the allowed flags. In case the user space caller uses at least one non-allowed flag, EINVAL is returned. Reported-by: syzbot Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/af_alg.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index f5e18c2a48527..ca50eeb13097c 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -149,7 +149,7 @@ EXPORT_SYMBOL_GPL(af_alg_release_parent); static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - const u32 forbidden = CRYPTO_ALG_INTERNAL; + const u32 allowed = CRYPTO_ALG_KERN_DRIVER_ONLY; struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct sockaddr_alg *sa = (void *)uaddr; @@ -157,6 +157,10 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) void *private; int err; + /* If caller uses non-allowed flag, return error. */ + if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed)) + return -EINVAL; + if (sock->state == SS_CONNECTED) return -EINVAL; @@ -175,9 +179,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (IS_ERR(type)) return PTR_ERR(type); - private = type->bind(sa->salg_name, - sa->salg_feat & ~forbidden, - sa->salg_mask & ~forbidden); + private = type->bind(sa->salg_name, sa->salg_feat, sa->salg_mask); if (IS_ERR(private)) { module_put(type->owner); return PTR_ERR(private); From ddba3c67a5b81955e34563d225a3f117354a247e Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Thu, 7 Dec 2017 12:31:56 -0800 Subject: [PATCH 079/247] HID: wacom: EKR: ensure devres groups at higher indexes are released commit 791ae273731fa85d3332e45064dab177ae663e80 upstream. Background: ExpressKey Remotes communicate their events via usb dongle. Each dongle can hold up to 5 pairings at one time and one EKR (identified by its serial number) can unfortunately be paired with its dongle more than once. The pairing takes place in a round-robin fashion. Input devices are only created once per EKR, when a new serial number is seen in the list of pairings. However, if a device is created for a "higher" paring index and subsequently a second pairing occurs at a lower pairing index, unpairing the remote with that serial number from any pairing index will currently cause a driver crash. This occurs infrequently, as two remotes are necessary to trigger this bug and most users have only one remote. As an illustration, to trigger the bug you need to have two remotes, and pair them in this order: 1. slot 0 -> remote 1 (input device created for remote 1) 2. slot 1 -> remote 1 (duplicate pairing - no device created) 3. slot 2 -> remote 1 (duplicate pairing - no device created) 4. slot 3 -> remote 1 (duplicate pairing - no device created) 5. slot 4 -> remote 2 (input device created for remote 2) 6. slot 0 -> remote 2 (1 destroyed and recreated at slot 1) 7. slot 1 -> remote 2 (1 destroyed and recreated at slot 2) 8. slot 2 -> remote 2 (1 destroyed and recreated at slot 3) 9. slot 3 -> remote 2 (1 destroyed and not recreated) 10. slot 4 -> remote 2 (2 was already in this slot so no changes) 11. slot 0 -> remote 1 (The current code sees remote 2 was paired over in one of the dongle slots it occupied and attempts to remove all information about remote 2 [1]. It calls wacom_remote_destroy_one for remote 2, but the destroy function assumes the lowest index is where the remote's input device was created. The code "cleans up" the other remote 2 pairings including the one which the input device was based on, assuming they were were just duplicate pairings. However, the cleanup doesn't call the devres release function for the input device that was created in slot 4). This issue is fixed by this commit. [1] Remote 2 should subsequently be re-created on the next packet from the EKR at the lowest numbered slot that it occupies (here slot 1). Fixes: f9036bd43602 ("HID: wacom: EKR: use devres groups to manage resources") Signed-off-by: Aaron Armstrong Skomra Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/wacom_sys.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index d72dfb2bbdb8e..7a4d39ce51d91 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2192,23 +2192,23 @@ static void wacom_remote_destroy_one(struct wacom *wacom, unsigned int index) int i; unsigned long flags; - spin_lock_irqsave(&remote->remote_lock, flags); - remote->remotes[index].registered = false; - spin_unlock_irqrestore(&remote->remote_lock, flags); + for (i = 0; i < WACOM_MAX_REMOTES; i++) { + if (remote->remotes[i].serial == serial) { - if (remote->remotes[index].battery.battery) - devres_release_group(&wacom->hdev->dev, - &remote->remotes[index].battery.bat_desc); + spin_lock_irqsave(&remote->remote_lock, flags); + remote->remotes[i].registered = false; + spin_unlock_irqrestore(&remote->remote_lock, flags); - if (remote->remotes[index].group.name) - devres_release_group(&wacom->hdev->dev, - &remote->remotes[index]); + if (remote->remotes[i].battery.battery) + devres_release_group(&wacom->hdev->dev, + &remote->remotes[i].battery.bat_desc); + + if (remote->remotes[i].group.name) + devres_release_group(&wacom->hdev->dev, + &remote->remotes[i]); - for (i = 0; i < WACOM_MAX_REMOTES; i++) { - if (remote->remotes[i].serial == serial) { remote->remotes[i].serial = 0; remote->remotes[i].group.name = NULL; - remote->remotes[i].registered = false; remote->remotes[i].battery.battery = NULL; wacom->led.groups[i].select = WACOM_STATUS_UNKNOWN; } From 517931760e696c779e4d24424633c334a6447450 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 20 Nov 2017 12:58:27 -0800 Subject: [PATCH 080/247] power: reset: zx-reboot: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit 348c7cf5fcbcb68838255759d4cb45d039af36d2 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/power/reset/zx-reboot.o see include/linux/module.h for more information This adds the license as "GPL v2", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Signed-off-by: Sebastian Reichel Signed-off-by: Greg Kroah-Hartman --- drivers/power/reset/zx-reboot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/power/reset/zx-reboot.c b/drivers/power/reset/zx-reboot.c index b0b1eb3a78c28..76153ac0706cb 100644 --- a/drivers/power/reset/zx-reboot.c +++ b/drivers/power/reset/zx-reboot.c @@ -81,3 +81,7 @@ static struct platform_driver zx_reboot_driver = { }, }; module_platform_driver(zx_reboot_driver); + +MODULE_DESCRIPTION("ZTE SoCs reset driver"); +MODULE_AUTHOR("Jun Nie "); +MODULE_LICENSE("GPL v2"); From cb1a0b51d1031ce335d16d7528cace677f105106 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 20 Nov 2017 12:54:52 -0800 Subject: [PATCH 081/247] gpio: iop: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit 97b03136e1b637d7a9d2274c099e44ecf23f1103 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-iop.o see include/linux/module.h for more information This adds the license as "GPL", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-iop.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpio/gpio-iop.c b/drivers/gpio/gpio-iop.c index 98c7ff2a76e76..8d62db447ec1b 100644 --- a/drivers/gpio/gpio-iop.c +++ b/drivers/gpio/gpio-iop.c @@ -58,3 +58,7 @@ static int __init iop3xx_gpio_init(void) return platform_driver_register(&iop3xx_gpio_driver); } arch_initcall(iop3xx_gpio_init); + +MODULE_DESCRIPTION("GPIO handling for Intel IOP3xx processors"); +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_LICENSE("GPL"); From e29997d55268c9d4d4fc2bb7833f83dc5b220cd5 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 20 Nov 2017 12:54:26 -0800 Subject: [PATCH 082/247] gpio: ath79: add missing MODULE_DESCRIPTION/LICENSE commit 539340f37e6d6ed4cd93e8e18c9b2e4eafd4b842 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/gpio/gpio-ath79.o see include/linux/module.h for more information This adds the license as "GPL v2", which matches the header of the file. MODULE_DESCRIPTION is also added. Signed-off-by: Jesse Chan Acked-by: Alban Bedel Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-ath79.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpio/gpio-ath79.c b/drivers/gpio/gpio-ath79.c index dc37dbe4b46d8..a83e97e15c142 100644 --- a/drivers/gpio/gpio-ath79.c +++ b/drivers/gpio/gpio-ath79.c @@ -323,3 +323,6 @@ static struct platform_driver ath79_gpio_driver = { }; module_platform_driver(ath79_gpio_driver); + +MODULE_DESCRIPTION("Atheros AR71XX/AR724X/AR913X GPIO API support"); +MODULE_LICENSE("GPL v2"); From 3a98d0753928cb8260930f2b651ecef47a0c3037 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 20 Nov 2017 12:57:13 -0800 Subject: [PATCH 083/247] mtd: nand: denali_pci: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit d822401d1c6898a4a4ee03977b78b8cec402e88a upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/mtd/nand/denali_pci.o see include/linux/module.h for more information This adds the license as "GPL v2", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Acked-by: Masahiro Yamada Signed-off-by: Boris Brezillon Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/nand/denali_pci.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/mtd/nand/denali_pci.c b/drivers/mtd/nand/denali_pci.c index de31514df2822..d38527e0a2f2f 100644 --- a/drivers/mtd/nand/denali_pci.c +++ b/drivers/mtd/nand/denali_pci.c @@ -119,3 +119,7 @@ static struct pci_driver denali_pci_driver = { }; module_pci_driver(denali_pci_driver); + +MODULE_DESCRIPTION("PCI driver for Denali NAND controller"); +MODULE_AUTHOR("Intel Corporation and its suppliers"); +MODULE_LICENSE("GPL v2"); From 30942f91b5a57a1ca7557998e901367901d1bccf Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Tue, 12 Dec 2017 14:31:30 -0500 Subject: [PATCH 084/247] igb: Free IRQs when device is hotplugged commit 888f22931478a05bc81ceb7295c626e1292bf0ed upstream. Recently I got a Caldigit TS3 Thunderbolt 3 dock, and noticed that upon hotplugging my kernel would immediately crash due to igb: [ 680.825801] kernel BUG at drivers/pci/msi.c:352! [ 680.828388] invalid opcode: 0000 [#1] SMP [ 680.829194] Modules linked in: igb(O) thunderbolt i2c_algo_bit joydev vfat fat btusb btrtl btbcm btintel bluetooth ecdh_generic hp_wmi sparse_keymap rfkill wmi_bmof iTCO_wdt intel_rapl x86_pkg_temp_thermal coretemp crc32_pclmul snd_pcm rtsx_pci_ms mei_me snd_timer memstick snd pcspkr mei soundcore i2c_i801 tpm_tis psmouse shpchp wmi tpm_tis_core tpm video hp_wireless acpi_pad rtsx_pci_sdmmc mmc_core crc32c_intel serio_raw rtsx_pci mfd_core xhci_pci xhci_hcd i2c_hid i2c_core [last unloaded: igb] [ 680.831085] CPU: 1 PID: 78 Comm: kworker/u16:1 Tainted: G O 4.15.0-rc3Lyude-Test+ #6 [ 680.831596] Hardware name: HP HP ZBook Studio G4/826B, BIOS P71 Ver. 01.03 06/09/2017 [ 680.832168] Workqueue: kacpi_hotplug acpi_hotplug_work_fn [ 680.832687] RIP: 0010:free_msi_irqs+0x180/0x1b0 [ 680.833271] RSP: 0018:ffffc9000030fbf0 EFLAGS: 00010286 [ 680.833761] RAX: ffff8803405f9c00 RBX: ffff88033e3d2e40 RCX: 000000000000002c [ 680.834278] RDX: 0000000000000000 RSI: 00000000000000ac RDI: ffff880340be2178 [ 680.834832] RBP: 0000000000000000 R08: ffff880340be1ff0 R09: ffff8803405f9c00 [ 680.835342] R10: 0000000000000000 R11: 0000000000000040 R12: ffff88033d63a298 [ 680.835822] R13: ffff88033d63a000 R14: 0000000000000060 R15: ffff880341959000 [ 680.836332] FS: 0000000000000000(0000) GS:ffff88034f440000(0000) knlGS:0000000000000000 [ 680.836817] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 680.837360] CR2: 000055e64044afdf CR3: 0000000001c09002 CR4: 00000000003606e0 [ 680.837954] Call Trace: [ 680.838853] pci_disable_msix+0xce/0xf0 [ 680.839616] igb_reset_interrupt_capability+0x5d/0x60 [igb] [ 680.840278] igb_remove+0x9d/0x110 [igb] [ 680.840764] pci_device_remove+0x36/0xb0 [ 680.841279] device_release_driver_internal+0x157/0x220 [ 680.841739] pci_stop_bus_device+0x7d/0xa0 [ 680.842255] pci_stop_bus_device+0x2b/0xa0 [ 680.842722] pci_stop_bus_device+0x3d/0xa0 [ 680.843189] pci_stop_and_remove_bus_device+0xe/0x20 [ 680.843627] trim_stale_devices+0xf3/0x140 [ 680.844086] trim_stale_devices+0x94/0x140 [ 680.844532] trim_stale_devices+0xa6/0x140 [ 680.845031] ? get_slot_status+0x90/0xc0 [ 680.845536] acpiphp_check_bridge.part.5+0xfe/0x140 [ 680.846021] acpiphp_hotplug_notify+0x175/0x200 [ 680.846581] ? free_bridge+0x100/0x100 [ 680.847113] acpi_device_hotplug+0x8a/0x490 [ 680.847535] acpi_hotplug_work_fn+0x1a/0x30 [ 680.848076] process_one_work+0x182/0x3a0 [ 680.848543] worker_thread+0x2e/0x380 [ 680.848963] ? process_one_work+0x3a0/0x3a0 [ 680.849373] kthread+0x111/0x130 [ 680.849776] ? kthread_create_worker_on_cpu+0x50/0x50 [ 680.850188] ret_from_fork+0x1f/0x30 [ 680.850601] Code: 43 14 85 c0 0f 84 d5 fe ff ff 31 ed eb 0f 83 c5 01 39 6b 14 0f 86 c5 fe ff ff 8b 7b 10 01 ef e8 b7 e4 d2 ff 48 83 78 70 00 74 e3 <0f> 0b 49 8d b5 a0 00 00 00 e8 62 6f d3 ff e9 c7 fe ff ff 48 8b [ 680.851497] RIP: free_msi_irqs+0x180/0x1b0 RSP: ffffc9000030fbf0 As it turns out, normally the freeing of IRQs that would fix this is called inside of the scope of __igb_close(). However, since the device is already gone by the point we try to unregister the netdevice from the driver due to a hotplug we end up seeing that the netif isn't present and thus, forget to free any of the device IRQs. So: make sure that if we're in the process of dismantling the netdev, we always allow __igb_close() to be called so that IRQs may be freed normally. Additionally, only allow igb_close() to be called from __igb_close() if it hasn't already been called for the given adapter. Signed-off-by: Lyude Paul Fixes: 9474933caf21 ("igb: close/suspend race in netif_device_detach") Cc: Todd Fujinaka Cc: Stephen Hemminger Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index ca54f76846684..3a61491421b1d 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3273,7 +3273,7 @@ static int __igb_close(struct net_device *netdev, bool suspending) int igb_close(struct net_device *netdev) { - if (netif_device_present(netdev)) + if (netif_device_present(netdev) || netdev->dismantle) return __igb_close(netdev, false); return 0; } From 036c227cdd1ca8e5e7e59e18ac87de3003f90d23 Mon Sep 17 00:00:00 2001 From: Stefan Schake Date: Fri, 10 Nov 2017 02:05:06 +0100 Subject: [PATCH 085/247] drm/vc4: Account for interrupts in flight [ Upstream commit 253696ccd613fbdaa5aba1de44c461a058e0a114 ] Synchronously disable the IRQ to make the following cancel_work_sync invocation effective. An interrupt in flight could enqueue further overflow mem work. As we free the binner BO immediately following vc4_irq_uninstall this caused a NULL pointer dereference in the work callback vc4_overflow_mem_work. Link: https://github.com/anholt/linux/issues/114 Signed-off-by: Stefan Schake Fixes: d5b1a78a772f ("drm/vc4: Add support for drawing 3D frames.") Signed-off-by: Eric Anholt Reviewed-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/1510275907-993-2-git-send-email-stschake@gmail.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vc4/vc4_irq.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index 094bc6a475c17..d45a7c0a7915b 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -208,6 +208,9 @@ vc4_irq_postinstall(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); + /* Undo the effects of a previous vc4_irq_uninstall. */ + enable_irq(dev->irq); + /* Enable both the render done and out of memory interrupts. */ V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS); @@ -225,6 +228,9 @@ vc4_irq_uninstall(struct drm_device *dev) /* Clear any pending interrupts we might have left. */ V3D_WRITE(V3D_INTCTL, V3D_DRIVER_IRQS); + /* Finish any interrupt handler still in flight. */ + disable_irq(dev->irq); + cancel_work_sync(&vc4->overflow_mem_work); } From 82e57cdce058713d3546f525140599c3b9b339fb Mon Sep 17 00:00:00 2001 From: Abhishek Goel Date: Tue, 7 Nov 2017 15:17:55 +0530 Subject: [PATCH 086/247] cpupowerutils: bench - Fix cpu online check [ Upstream commit 53d1cd6b125fb9d69303516a1179ebc3b72f797a ] cpupower_is_cpu_online was incorrectly checking for 0. This patch fixes this by checking for 1 when the cpu is online. Signed-off-by: Abhishek Goel Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/power/cpupower/bench/system.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/bench/system.c b/tools/power/cpupower/bench/system.c index c25a74ae51bae..2bb3eef7d5c1f 100644 --- a/tools/power/cpupower/bench/system.c +++ b/tools/power/cpupower/bench/system.c @@ -61,7 +61,7 @@ int set_cpufreq_governor(char *governor, unsigned int cpu) dprintf("set %s as cpufreq governor\n", governor); - if (cpupower_is_cpu_online(cpu) != 0) { + if (cpupower_is_cpu_online(cpu) != 1) { perror("cpufreq_cpu_exists"); fprintf(stderr, "error: cpu %u does not exist\n", cpu); return -1; From d8f75b4c7f259307ab9d0f4f304476e2b701c6ef Mon Sep 17 00:00:00 2001 From: Abhishek Goel Date: Wed, 15 Nov 2017 14:10:02 +0530 Subject: [PATCH 087/247] cpupower : Fix cpupower working when cpu0 is offline [ Upstream commit dbdc468f35ee827cab2753caa1c660bdb832243a ] cpuidle_monitor used to assume that cpu0 is always online which is not a valid assumption on POWER machines. This patch fixes this by getting the cpu on which the current thread is running, instead of always using cpu0 for monitoring which may not be online. Signed-off-by: Abhishek Goel Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c index 1b5da0066ebf9..5b3205f162174 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c +++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c @@ -130,15 +130,18 @@ static struct cpuidle_monitor *cpuidle_register(void) { int num; char *tmp; + int this_cpu; + + this_cpu = sched_getcpu(); /* Assume idle state count is the same for all CPUs */ - cpuidle_sysfs_monitor.hw_states_num = cpuidle_state_count(0); + cpuidle_sysfs_monitor.hw_states_num = cpuidle_state_count(this_cpu); if (cpuidle_sysfs_monitor.hw_states_num <= 0) return NULL; for (num = 0; num < cpuidle_sysfs_monitor.hw_states_num; num++) { - tmp = cpuidle_state_name(0, num); + tmp = cpuidle_state_name(this_cpu, num); if (tmp == NULL) continue; @@ -146,7 +149,7 @@ static struct cpuidle_monitor *cpuidle_register(void) strncpy(cpuidle_cstates[num].name, tmp, CSTATE_NAME_LEN - 1); free(tmp); - tmp = cpuidle_state_desc(0, num); + tmp = cpuidle_state_desc(this_cpu, num); if (tmp == NULL) continue; strncpy(cpuidle_cstates[num].desc, tmp, CSTATE_DESC_LEN - 1); From 60d9b22b1ffc09a28be556e31caf7cdffe0df7ed Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 5 Nov 2017 16:56:33 +0200 Subject: [PATCH 088/247] KVM: x86: emulator: Return to user-mode on L1 CPL=0 emulation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1f4dcb3b213235e642088709a1c54964d23365e9 ] On this case, handle_emulation_failure() fills kvm_run with internal-error information which it expects to be delivered to user-mode for further processing. However, the code reports a wrong return-value which makes KVM to never return to user-mode on this scenario. Fixes: 6d77dbfc88e3 ("KVM: inject #UD if instruction emulation fails and exit to userspace") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d3f80cccb9aac..6c11a98860d10 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5308,7 +5308,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; + r = EMULATE_USER_EXIT; } kvm_queue_exception(vcpu, UD_VECTOR); From 114de9bfefa5aee862815b0ae7453d40bf5278c2 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 5 Nov 2017 16:56:34 +0200 Subject: [PATCH 089/247] KVM: x86: Don't re-execute instruction when not passing CR2 value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9b8ae63798cb97e785a667ff27e43fa6220cb734 ] In case of instruction-decode failure or emulation failure, x86_emulate_instruction() will call reexecute_instruction() which will attempt to use the cr2 value passed to x86_emulate_instruction(). However, when x86_emulate_instruction() is called from emulate_instruction(), cr2 is not passed (passed as 0) and therefore it doesn't make sense to execute reexecute_instruction() logic at all. Fixes: 51d8b66199e9 ("KVM: cleanup emulate_instruction") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/vmx.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index cbd1d44da2d38..20cfeeb681c60 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1113,7 +1113,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, static inline int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); + return x86_emulate_instruction(vcpu, 0, + emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); } void kvm_enable_efer_bits(u64); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3ca6d15994e45..b9673bc82caa4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6257,7 +6257,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) return 1; - err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); + err = emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; From ec73d16bc7ce352375052d81a22bbb53e3eccabe Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 5 Nov 2017 16:54:47 -0800 Subject: [PATCH 090/247] KVM: X86: Fix operand/address-size during instruction decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3853be2603191829b442b64dac6ae8ba0c027bf9 ] Pedro reported: During tests that we conducted on KVM, we noticed that executing a "PUSH %ES" instruction under KVM produces different results on both memory and the SP register depending on whether EPT support is enabled. With EPT the SP is reduced by 4 bytes (and the written value is 0-padded) but without EPT support it is only reduced by 2 bytes. The difference can be observed when the CS.DB field is 1 (32-bit) but not when it's 0 (16-bit). The internal segment descriptor cache exist even in real/vm8096 mode. The CS.D also should be respected instead of just default operand/address-size/66H prefix/67H prefix during instruction decoding. This patch fixes it by also adjusting operand/address-size according to CS.D. Reported-by: Pedro Fonseca Tested-by: Pedro Fonseca Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Nadav Amit Cc: Pedro Fonseca Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/emulate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c8f8dd8ca0a14..6f5a3b0763410 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4990,6 +4990,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) bool op_prefix = false; bool has_seg_override = false; struct opcode opcode; + u16 dummy; + struct desc_struct desc; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; @@ -5008,6 +5010,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) switch (mode) { case X86EMUL_MODE_REAL: case X86EMUL_MODE_VM86: + def_op_bytes = def_ad_bytes = 2; + ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); + if (desc.d) + def_op_bytes = def_ad_bytes = 4; + break; case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; From 2f9e94ef498db9e7f4d920d9dcd042fa378e3f03 Mon Sep 17 00:00:00 2001 From: Nikita Leshenko Date: Sun, 5 Nov 2017 15:52:29 +0200 Subject: [PATCH 091/247] KVM: x86: ioapic: Fix level-triggered EOI and IOAPIC reconfigure race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0fc5a36dd6b345eb0d251a65c236e53bead3eef7 ] KVM uses ioapic_handled_vectors to track vectors that need to notify the IOAPIC on EOI. The problem is that IOAPIC can be reconfigured while an interrupt with old configuration is pending or running and ioapic_handled_vectors only remembers the newest configuration; thus EOI from the old interrupt is not delievered to the IOAPIC. A previous commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") addressed this issue by adding pending edge-triggered interrupts to ioapic_handled_vectors, fixing this race for edge-triggered interrupts. The commit explicitly ignored level-triggered interrupts, but this race applies to them as well: 1) IOAPIC sends a level triggered interrupt vector to VCPU0 2) VCPU0's handler deasserts the irq line and reconfigures the IOAPIC to route the vector to VCPU1. The reconfiguration rewrites only the upper 32 bits of the IOREDTBLn register. (Causes KVM to update ioapic_handled_vectors for VCPU0 and it no longer includes the vector.) 3) VCPU0 sends EOI for the vector, but it's not delievered to the IOAPIC because the ioapic_handled_vectors doesn't include the vector. 4) New interrupts are not delievered to VCPU1 because remote_irr bit is set forever. Therefore, the correct behavior is to add all pending and running interrupts to ioapic_handled_vectors. This commit introduces a slight performance hit similar to commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") for the rare case that the vector is reused by a non-IOAPIC source on VCPU0. We prefer to keep solution simple and not handle this case just as the original commit does. Fixes: db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") Signed-off-by: Nikita Leshenko Reviewed-by: Liran Alon Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 6e219e5c07d27..a7ac8688bba87 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -257,8 +257,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode) || - (e->fields.trig_mode == IOAPIC_EDGE_TRIG && - kvm_apic_pending_eoi(vcpu, e->fields.vector))) + kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, ioapic_handled_vectors); } From a9f2c169366752927fd0bf8f5a9b78d3315f6c85 Mon Sep 17 00:00:00 2001 From: Nikita Leshenko Date: Sun, 5 Nov 2017 15:52:32 +0200 Subject: [PATCH 092/247] KVM: x86: ioapic: Clear Remote IRR when entry is switched to edge-triggered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a8bfec2930525808c01f038825d1df3904638631 ] Some OSes (Linux, Xen) use this behavior to clear the Remote IRR bit for IOAPICs without an EOI register. They simulate the EOI message manually by changing the trigger mode to edge and then back to level, with the entry being masked during this. QEMU implements this feature in commit ed1263c363c9 ("ioapic: clear remote irr bit for edge-triggered interrupts") As a side effect, this commit removes an incorrect behavior where Remote IRR was cleared when the redirection table entry was rewritten. This is not consistent with the manual and also opens an opportunity for a strange behavior when a redirection table entry is modified from an interrupt handler that handles the same entry: The modification will clear the Remote IRR bit even though the interrupt handler is still running. Signed-off-by: Nikita Leshenko Reviewed-by: Liran Alon Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Reviewed-by: Steve Rutherford Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index a7ac8688bba87..4b573c8694acb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -306,8 +306,17 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } else { e->bits &= ~0xffffffffULL; e->bits |= (u32) val; - e->fields.remote_irr = 0; } + + /* + * Some OSes (Linux, Xen) assume that Remote IRR bit will + * be cleared by IOAPIC hardware when the entry is configured + * as edge-triggered. This behavior is used to simulate an + * explicit EOI on IOAPICs that don't have the EOI register. + */ + if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) + e->fields.remote_irr = 0; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); From 1d3ab3b2964e927515b947c8b112a9a40ed789c4 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 15 Oct 2017 21:24:49 +0200 Subject: [PATCH 093/247] ACPI / bus: Leave modalias empty for devices which are not present [ Upstream commit 10809bb976648ac58194a629e3d7af99e7400297 ] Most Bay and Cherry Trail devices use a generic DSDT with all possible peripheral devices present in the DSDT, with their _STA returning 0x00 or 0x0f based on AML variables which describe what is actually present on the board. Since ACPI device objects with a 0x00 status (not present) still get an entry under /sys/bus/acpi/devices, and those entry had an acpi:PNPID modalias, userspace would end up loading modules for non present hardware. This commit fixes this by leaving the modalias empty for non present devices. This results in 10 modules less being loaded with a generic distro kernel config on my Cherry Trail test-device (a GPD pocket). Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/device_sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/acpi/device_sysfs.c b/drivers/acpi/device_sysfs.c index 7b2c48fde4e2b..201c7ceb70524 100644 --- a/drivers/acpi/device_sysfs.c +++ b/drivers/acpi/device_sysfs.c @@ -146,6 +146,10 @@ static int create_pnp_modalias(struct acpi_device *acpi_dev, char *modalias, int count; struct acpi_hardware_id *id; + /* Avoid unnecessarily loading modules for non present devices. */ + if (!acpi_device_is_present(acpi_dev)) + return 0; + /* * Since we skip ACPI_DT_NAMESPACE_HID from the modalias below, 0 should * be returned if ACPI_DT_NAMESPACE_HID is the only ACPI/PNP ID in the From 876b31fd9815c57ae0389132f3306ff31f148092 Mon Sep 17 00:00:00 2001 From: Nikita Leshenko Date: Sun, 5 Nov 2017 15:52:33 +0200 Subject: [PATCH 094/247] KVM: x86: ioapic: Preserve read-only values in the redirection table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b200dded0a6974a3b69599832b2203483920ab25 ] According to 82093AA (IOAPIC) manual, Remote IRR and Delivery Status are read-only. QEMU implements the bits as RO in commit 479c2a1cb7fb ("ioapic: keep RO bits for IOAPIC entry"). Signed-off-by: Nikita Leshenko Reviewed-by: Liran Alon Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Reviewed-by: Steve Rutherford Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4b573c8694acb..5f810bb80802c 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -278,6 +278,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e; switch (ioapic->ioregsel) { @@ -300,6 +301,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) return; e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; + /* Preserve read-only fields */ + old_remote_irr = e->fields.remote_irr; + old_delivery_status = e->fields.delivery_status; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; @@ -307,6 +311,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits &= ~0xffffffffULL; e->bits |= (u32) val; } + e->fields.remote_irr = old_remote_irr; + e->fields.delivery_status = old_delivery_status; /* * Some OSes (Linux, Xen) assume that Remote IRR bit will From 6436981ba6d11dc13b242d48855944b73569739f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Nov 2017 21:17:55 +0000 Subject: [PATCH 095/247] cpufreq: Add Loongson machine dependencies [ Upstream commit 0d307935fefa6389eb726c6362351c162c949101 ] The MIPS loongson cpufreq drivers don't build unless configured for the correct machine type, due to dependency on machine specific architecture headers and symbols in machine specific platform code. More specifically loongson1-cpufreq.c uses RST_CPU_EN and RST_CPU, neither of which is defined in asm/mach-loongson32/regs-clk.h unless CONFIG_LOONGSON1_LS1B=y, and loongson2_cpufreq.c references loongson2_clockmod_table[], which is only defined in arch/mips/loongson64/lemote-2f/clock.c, i.e. when CONFIG_LEMOTE_MACH2F=y. Add these dependencies to Kconfig to avoid randconfig / allyesconfig build failures (e.g. when based on BMIPS which also has a cpufreq driver). Signed-off-by: James Hogan Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index d8b164a7c4e51..cac26fb228912 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -273,6 +273,7 @@ endif if MIPS config LOONGSON2_CPUFREQ tristate "Loongson2 CPUFreq Driver" + depends on LEMOTE_MACH2F help This option adds a CPUFreq driver for loongson processors which support software configurable cpu frequency. @@ -285,6 +286,7 @@ config LOONGSON2_CPUFREQ config LOONGSON1_CPUFREQ tristate "Loongson1 CPUFreq Driver" + depends on LOONGSON1_LS1B help This option adds a CPUFreq driver for loongson1 processors which support software configurable cpu frequency. From 409982cbb5eb814532953fb996ddc27bae5282a4 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 24 Nov 2017 15:14:27 -0800 Subject: [PATCH 096/247] bcache: check return value of register_shrinker [ Upstream commit 6c4ca1e36cdc1a0a7a84797804b87920ccbebf51 ] register_shrinker is now __must_check, so check it to kill a warning. Caller of bch_btree_cache_alloc in super.c appropriately checks return value so this is fully plumbed through. This V2 fixes checkpatch warnings and improves the commit description, as I was too hasty getting the previous version out. Signed-off-by: Michael Lyle Reviewed-by: Vojtech Pavlik Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/btree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2efdce07247cc..cac297f8170e2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -803,7 +803,10 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.scan_objects = bch_mca_scan; c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - register_shrinker(&c->shrink); + + if (register_shrinker(&c->shrink)) + pr_warn("bcache: %s: could not register shrinker", + __func__); return 0; } From 8afdbb165a792f1e89b0a2c67b54826889d3d51f Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 1 Nov 2017 19:21:55 -0400 Subject: [PATCH 097/247] drm/amdgpu: Fix SDMA load/unload sequence on HWS disabled mode [ Upstream commit cf21654b40968609779751b34e7923180968fe5b ] Fix the SDMA load and unload sequence as suggested by HW document. Signed-off-by: shaoyun liu Signed-off-by: Felix Kuehling Acked-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 47 ++++++++++++++----- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 1a0a5f7cccbc2..47951f4775b94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -367,29 +367,50 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) { struct amdgpu_device *adev = get_amdgpu_device(kgd); struct cik_sdma_rlc_registers *m; + unsigned long end_jiffies; uint32_t sdma_base_addr; + uint32_t data; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); - WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, - m->sdma_rlc_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdma_rlc_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, - m->sdma_rlc_rb_base); + end_jiffies = msecs_to_jiffies(2000) + jiffies; + while (true) { + data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (time_after(jiffies, end_jiffies)) + return -ETIME; + usleep_range(500, 1000); + } + if (m->sdma_engine_id) { + data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); + } else { + data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); + } + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, + m->sdma_rlc_doorbell); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, + m->sdma_rlc_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, m->sdma_rlc_rb_base_hi); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, m->sdma_rlc_rb_rptr_addr_lo); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); - - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, - m->sdma_rlc_doorbell); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, m->sdma_rlc_rb_cntl); @@ -493,9 +514,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); return 0; } From 16980affa1b09fc536657ddb933902825ead1c38 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Wed, 1 Nov 2017 19:21:56 -0400 Subject: [PATCH 098/247] drm/amdkfd: Fix SDMA ring buffer size calculation [ Upstream commit d12fb13f23199faa7e536acec1db49068e5a067d ] ffs function return the position of the first bit set on 1 based. (bit zero returns 1). Signed-off-by: shaoyun liu Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index d83de985e88cf..8577a563600fe 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -215,8 +215,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, BUG_ON(!mm || !mqd || !q); m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) << - SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; From 8275584082063726845b7ce2acd1b9d87cdee3fa Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 1 Nov 2017 19:21:57 -0400 Subject: [PATCH 099/247] drm/amdkfd: Fix SDMA oversubsription handling [ Upstream commit 8c946b8988acec785bcf67088b6bd0747f36d2d3 ] SDMA only supports a fixed number of queues. HWS cannot handle oversubscription. Signed-off-by: shaoyun liu Signed-off-by: Felix Kuehling Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../drm/amd/amdkfd/kfd_process_queue_manager.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index e1fb40b84c72f..5425c68d02877 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -205,6 +205,24 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: + if (dev->dqm->queue_count >= + CIK_SDMA_QUEUES_PER_ENGINE * CIK_SDMA_ENGINE_NUM) { + pr_err("Over-subscription is not allowed for SDMA.\n"); + retval = -EPERM; + goto err_create_queue; + } + + retval = create_cp_queue(pqm, dev, &q, properties, f, *qid); + if (retval != 0) + goto err_create_queue; + pqn->q = q; + pqn->kq = NULL; + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, + &q->properties.vmid); + pr_debug("DQM returned %d for create_queue\n", retval); + print_queue(q); + break; + case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && From 03899a46c223510160632e991bad3ad234da386d Mon Sep 17 00:00:00 2001 From: zhangliping Date: Sat, 25 Nov 2017 22:02:12 +0800 Subject: [PATCH 100/247] openvswitch: fix the incorrect flow action alloc size [ Upstream commit 67c8d22a73128ff910e2287567132530abcf5b71 ] If we want to add a datapath flow, which has more than 500 vxlan outputs' action, we will get the following error reports: openvswitch: netlink: Flow action size 32832 bytes exceeds max openvswitch: netlink: Flow action size 32832 bytes exceeds max openvswitch: netlink: Actions may not be safe on all matching packets ... ... It seems that we can simply enlarge the MAX_ACTIONS_BUFSIZE to fix it, but this is not the root cause. For example, for a vxlan output action, we need about 60 bytes for the nlattr, but after it is converted to the flow action, it only occupies 24 bytes. This means that we can still support more than 1000 vxlan output actions for a single datapath flow under the the current 32k max limitation. So even if the nla_len(attr) is larger than MAX_ACTIONS_BUFSIZE, we shouldn't report EINVAL and keep it move on, as the judgement can be done by the reserve_sfa_size. Signed-off-by: zhangliping Acked-by: Pravin B Shelar Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/openvswitch/flow_netlink.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 07925418c2a5a..1668916bdbde1 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1789,14 +1789,11 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) #define MAX_ACTIONS_BUFSIZE (32 * 1024) -static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) +static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - if (size > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); - return ERR_PTR(-EINVAL); - } + WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); if (!sfa) @@ -1869,12 +1866,15 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = ksize(*sfa) * 2; if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + OVS_NLERR(log, "Flow action size exceeds max %u", + MAX_ACTIONS_BUFSIZE); return ERR_PTR(-EMSGSIZE); + } new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = nla_alloc_flow_actions(new_acts_size, log); + acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return (void *)acts; @@ -2500,7 +2500,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, { int err; - *sfa = nla_alloc_flow_actions(nla_len(attr), log); + *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); From 030d4676a2681a0656c205a9e58b9624a74a16f4 Mon Sep 17 00:00:00 2001 From: Chun-Yeow Yeoh Date: Tue, 14 Nov 2017 23:20:05 +0800 Subject: [PATCH 101/247] mac80211: fix the update of path metric for RANN frame [ Upstream commit fbbdad5edf0bb59786a51b94a9d006bc8c2da9a2 ] The previous path metric update from RANN frame has not considered the own link metric toward the transmitting mesh STA. Fix this. Reported-by: Michael65535 Signed-off-by: Chun-Yeow Yeoh Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mac80211/mesh_hwmp.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index b747c9645e432..fed598a202c89 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -788,7 +788,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, struct mesh_path *mpath; u8 ttl, flags, hopcount; const u8 *orig_addr; - u32 orig_sn, metric, metric_txsta, interval; + u32 orig_sn, new_metric, orig_metric, last_hop_metric, interval; bool root_is_gate; ttl = rann->rann_ttl; @@ -799,7 +799,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, interval = le32_to_cpu(rann->rann_interval); hopcount = rann->rann_hopcount; hopcount++; - metric = le32_to_cpu(rann->rann_metric); + orig_metric = le32_to_cpu(rann->rann_metric); /* Ignore our own RANNs */ if (ether_addr_equal(orig_addr, sdata->vif.addr)) @@ -816,7 +816,10 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, return; } - metric_txsta = airtime_link_metric_get(local, sta); + last_hop_metric = airtime_link_metric_get(local, sta); + new_metric = orig_metric + last_hop_metric; + if (new_metric < orig_metric) + new_metric = MAX_METRIC; mpath = mesh_path_lookup(sdata, orig_addr); if (!mpath) { @@ -829,7 +832,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, } if (!(SN_LT(mpath->sn, orig_sn)) && - !(mpath->sn == orig_sn && metric < mpath->rann_metric)) { + !(mpath->sn == orig_sn && new_metric < mpath->rann_metric)) { rcu_read_unlock(); return; } @@ -847,7 +850,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, } mpath->sn = orig_sn; - mpath->rann_metric = metric + metric_txsta; + mpath->rann_metric = new_metric; mpath->is_root = true; /* Recording RANNs sender address to send individually * addressed PREQs destined for root mesh STA */ @@ -867,7 +870,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr, orig_sn, 0, NULL, 0, broadcast_addr, hopcount, ttl, interval, - metric + metric_txsta, 0, sdata); + new_metric, 0, sdata); } rcu_read_unlock(); From 8cfb3965ebcd9e02365f8c8e9f4c9f19fd0be004 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 15 Nov 2017 16:20:52 -0500 Subject: [PATCH 102/247] btrfs: fix deadlock when writing out space cache [ Upstream commit b77000ed558daa3bef0899d29bf171b8c9b5e6a8 ] If we fail to prepare our pages for whatever reason (out of memory in our case) we need to make sure to drop the block_group->data_rwsem, otherwise hilarity ensues. Signed-off-by: Josef Bacik Reviewed-by: Omar Sandoval Reviewed-by: Liu Bo Reviewed-by: David Sterba [ add label and use existing unlocking code ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/free-space-cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e4b48f377d3a9..c56253a1e5b49 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1253,7 +1253,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, inode, 0); if (ret) - goto out; + goto out_unlock; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); @@ -1346,6 +1346,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); +out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); From 90ef2c30ebd358005d892aa2895bdcbc027ddce1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Oct 2017 11:20:15 -0400 Subject: [PATCH 103/247] reiserfs: remove unneeded i_version bump [ Upstream commit 9f97df50c52c2887432debb6238f4e43567386a5 ] The i_version field in reiserfs is not initialized and is only ever updated here. Nothing ever views it, so just remove it. Signed-off-by: Jeff Layton Signed-off-by: Jan Kara Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0a6ad4e71e88d..e101d70d2327b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2521,7 +2521,6 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); mark_inode_dirty(inode); return len - towrite; From b0fa04e8429ed74b6156b8fbf329a022e701072e Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 Nov 2017 14:55:05 -0800 Subject: [PATCH 104/247] KVM: X86: Fix softlockup when get the current kvmclock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e70b57a6ce4e8b92a56a615ae79bdb2bd66035e7 ] watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [qemu-system-x86:10185] CPU: 6 PID: 10185 Comm: qemu-system-x86 Tainted: G OE 4.14.0-rc4+ #4 RIP: 0010:kvm_get_time_scale+0x4e/0xa0 [kvm] Call Trace: get_time_ref_counter+0x5a/0x80 [kvm] kvm_hv_process_stimers+0x120/0x5f0 [kvm] kvm_arch_vcpu_ioctl_run+0x4b4/0x1690 [kvm] kvm_vcpu_ioctl+0x33a/0x620 [kvm] do_vfs_ioctl+0xa1/0x5d0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x1e/0xa9 This can be reproduced when running kvm-unit-tests/hyperv_stimer.flat and cpu-hotplug stress simultaneously. __this_cpu_read(cpu_tsc_khz) returns 0 (set in kvmclock_cpu_down_prep()) when the pCPU is unhotplug which results in kvm_get_time_scale() gets into an infinite loop. This patch fixes it by treating the unhotplug pCPU as not using master clock. Reviewed-by: Radim Krčmář Reviewed-by: David Hildenbrand Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c11a98860d10..e023ef981feb8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1751,10 +1751,13 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, - &hv_clock.tsc_shift, - &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + if (__this_cpu_read(cpu_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + ret = __pvclock_read_cycles(&hv_clock, rdtsc()); + } else + ret = ktime_get_boot_ns() + ka->kvmclock_offset; put_cpu(); From 5c0b19bd8cffec6822c92af223814712bdde42bd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 Nov 2017 14:52:21 -0800 Subject: [PATCH 105/247] KVM: VMX: Fix rflags cache during vCPU reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c37c28730bb031cc8a44a130c2555c0f3efbe2d0 ] Reported by syzkaller: *** Guest State *** CR0: actual=0x0000000080010031, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 CR4: actual=0x0000000000002061, shadow=0x0000000000000000, gh_mask=ffffffffffffe8f1 CR3 = 0x000000002081e000 RSP = 0x000000000000fffa RIP = 0x0000000000000000 RFLAGS=0x00023000 DR7 = 0x00000000000000 ^^^^^^^^^^ ------------[ cut here ]------------ WARNING: CPU: 6 PID: 24431 at /home/kernel/linux/arch/x86/kvm//x86.c:7302 kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] CPU: 6 PID: 24431 Comm: reprotest Tainted: G W OE 4.14.0+ #26 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] RSP: 0018:ffff880291d179e0 EFLAGS: 00010202 Call Trace: kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The failed vmentry is triggered by the following beautified testcase: #include #include #include #include #include #include #include long r[5]; int main() { struct kvm_debugregs dr = { 0 }; r[2] = open("/dev/kvm", O_RDONLY); r[3] = ioctl(r[2], KVM_CREATE_VM, 0); r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); struct kvm_guest_debug debug = { .control = 0xf0403, .arch = { .debugreg[6] = 0x2, .debugreg[7] = 0x2 } }; ioctl(r[4], KVM_SET_GUEST_DEBUG, &debug); ioctl(r[4], KVM_RUN, 0); } which testcase tries to setup the processor specific debug registers and configure vCPU for handling guest debug events through KVM_SET_GUEST_DEBUG. The KVM_SET_GUEST_DEBUG ioctl will get and set rflags in order to set TF bit if single step is needed. All regs' caches are reset to avail and GUEST_RFLAGS vmcs field is reset to 0x2 during vCPU reset. However, the cache of rflags is not reset during vCPU reset. The function vmx_get_rflags() returns an unreset rflags cache value since the cache is marked avail, it is 0 after boot. Vmentry fails if the rflags reserved bit 1 is 0. This patch fixes it by resetting both the GUEST_RFLAGS vmcs field and its cache to 0x2 during vCPU reset. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Reviewed-by: David Hildenbrand Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Nadav Amit Cc: Dmitry Vyukov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b9673bc82caa4..178a344f55f85 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5194,7 +5194,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } - vmcs_writel(GUEST_RFLAGS, 0x02); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); vmcs_writel(GUEST_GDTR_BASE, 0); From b5bfda0f8e29c7070f19c0cac4005fd85f37a84e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2017 12:21:07 -0800 Subject: [PATCH 106/247] xfs: always free inline data before resetting inode fork during ifree [ Upstream commit 98c4f78dcdd8cec112d1cbc5e9a792ee6e5ab7a6 ] In xfs_ifree, we reset the data/attr forks to extents format without bothering to free any inline data buffer that might still be around after all the blocks have been truncated off the file. Prior to commit 43518812d2 ("xfs: remove support for inlining data/extents into the inode fork") nobody noticed because the leftover inline data after truncation was small enough to fit inside the inline buffer inside the fork itself. However, now that we've removed the inline buffer, we /always/ have to free the inline data buffer or else we leak them like crazy. This test was found by turning on kmemleak for generic/001 or generic/388. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 98ca9f1b6a07c..c5f2f1e3cc4bd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2429,6 +2429,24 @@ xfs_ifree_cluster( return 0; } +/* + * Free any local-format buffers sitting around before we reset to + * extents format. + */ +static inline void +xfs_ifree_local_data( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); +} + /* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have @@ -2466,6 +2484,9 @@ xfs_ifree( if (error) return error; + xfs_ifree_local_data(ip, XFS_DATA_FORK); + xfs_ifree_local_data(ip, XFS_ATTR_FORK); + VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; From c6a34556f539c69366bd7bcbeff490a6949f95ce Mon Sep 17 00:00:00 2001 From: Eduardo Otubo Date: Thu, 23 Nov 2017 15:18:35 +0100 Subject: [PATCH 107/247] xen-netfront: remove warning when unloading module [ Upstream commit 5b5971df3bc2775107ddad164018a8a8db633b81 ] v2: * Replace busy wait with wait_event()/wake_up_all() * Cannot garantee that at the time xennet_remove is called, the xen_netback state will not be XenbusStateClosed, so added a condition for that * There's a small chance for the xen_netback state is XenbusStateUnknown by the time the xen_netfront switches to Closed, so added a condition for that. When unloading module xen_netfront from guest, dmesg would output warning messages like below: [ 105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use! [ 105.236839] deferring g.e. 0x903 (pfn 0x35805) This problem relies on netfront and netback being out of sync. By the time netfront revokes the g.e.'s netback didn't have enough time to free all of them, hence displaying the warnings on dmesg. The trick here is to make netfront to wait until netback frees all the g.e.'s and only then continue to cleanup for the module removal, and this is done by manipulating both device states. Signed-off-by: Eduardo Otubo Acked-by: Juergen Gross Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netfront.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 8d498a997e251..1a9dadf7b3cc3 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -86,6 +86,8 @@ struct netfront_cb { /* IRQ name is queue name with "-tx" or "-rx" appended */ #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3) +static DECLARE_WAIT_QUEUE_HEAD(module_unload_q); + struct netfront_stats { u64 packets; u64 bytes; @@ -2051,10 +2053,12 @@ static void netback_changed(struct xenbus_device *dev, break; case XenbusStateClosed: + wake_up_all(&module_unload_q); if (dev->state == XenbusStateClosed) break; /* Missed the backend's CLOSING state -- fallthrough */ case XenbusStateClosing: + wake_up_all(&module_unload_q); xenbus_frontend_closed(dev); break; } @@ -2160,6 +2164,20 @@ static int xennet_remove(struct xenbus_device *dev) dev_dbg(&dev->dev, "%s\n", dev->nodename); + if (xenbus_read_driver_state(dev->otherend) != XenbusStateClosed) { + xenbus_switch_state(dev, XenbusStateClosing); + wait_event(module_unload_q, + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosing); + + xenbus_switch_state(dev, XenbusStateClosed); + wait_event(module_unload_q, + xenbus_read_driver_state(dev->otherend) == + XenbusStateClosed || + xenbus_read_driver_state(dev->otherend) == + XenbusStateUnknown); + } + xennet_disconnect_backend(info); unregister_netdev(info->netdev); From c57767b60962f4c965064aadc028db55bd0e63fb Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 10 Aug 2017 10:53:53 +0200 Subject: [PATCH 108/247] auxdisplay: img-ascii-lcd: Only build on archs that have IOMEM [ Upstream commit 141cbfba1d0502006463aa80f57c64086226af1a ] This avoids the MODPOST error: ERROR: "devm_ioremap_resource" [drivers/auxdisplay/img-ascii-lcd.ko] undefined! Signed-off-by: Thomas Meyer Acked-by: Randy Dunlap Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/auxdisplay/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig index 10e1b9eee10ea..f03cf1df8d6bc 100644 --- a/drivers/auxdisplay/Kconfig +++ b/drivers/auxdisplay/Kconfig @@ -121,6 +121,7 @@ config CFAG12864B_RATE config IMG_ASCII_LCD tristate "Imagination Technologies ASCII LCD Display" + depends on HAS_IOMEM default y if MIPS_MALTA || MIPS_SEAD3 select SYSCON help From 5cd3586ca8d4347d2cd658062c994b9edd09ab59 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Nov 2017 08:00:12 -0400 Subject: [PATCH 109/247] nfsd: CLOSE SHOULD return the invalid special stateid for NFSv4.x (x>0) [ Upstream commit fb500a7cfee7f2f447d2bbf30cb59629feab6ac1 ] Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9ebb2d7c81821..7f3a11ab41917 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,6 +63,9 @@ static const stateid_t zero_stateid = { static const stateid_t currentstateid = { .si_generation = 1, }; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +}; static u64 current_sessionid = 1; @@ -5407,6 +5410,11 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_close_open_stateid(stp); mutex_unlock(&stp->st_mutex); + /* See RFC5661 sectionm 18.2.4 */ + if (stp->st_stid.sc_client->cl_minorversion) + memcpy(&close->cl_stateid, &close_stateid, + sizeof(close->cl_stateid)); + /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); out: From 2a7d4a723d2e04f9d155d2dcf82ed64d3acc7b10 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Nov 2017 08:00:15 -0400 Subject: [PATCH 110/247] nfsd: Ensure we check stateid validity in the seqid operation checks [ Upstream commit 9271d7e509c1bfc0b9a418caec29ec8d1ac38270 ] After taking the stateid st_mutex, we want to know that the stateid still represents valid state before performing any non-idempotent actions. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7f3a11ab41917..799c18cba6a5d 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5178,15 +5178,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID - || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) - /* - * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step, and - * revoked delegations are kept only for free_stateid. - */ - return nfserr_bad_stateid; - mutex_lock(&stp->st_mutex); + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); From f35ab8e2eeb8bc9492cbaf4653e04eea9cb63648 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 6 Nov 2017 16:22:48 +0300 Subject: [PATCH 111/247] grace: replace BUG_ON by WARN_ONCE in exit_net hook [ Upstream commit b872285751c1af010e12d02bce7069e2061a58ca ] Signed-off-by: Vasily Averin Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfs_common/grace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index fd8c9a5bcac44..e280e9c9ebf33 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -104,7 +104,9 @@ grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); - BUG_ON(!list_empty(grace_list)); + WARN_ONCE(!list_empty(grace_list), + "net %x %s: grace_list is not empty\n", + net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { From f25e222ccc50b0b52da98706c1e7da4ec090c4d7 Mon Sep 17 00:00:00 2001 From: Andrew Elble Date: Thu, 9 Nov 2017 13:41:10 -0500 Subject: [PATCH 112/247] nfsd: check for use of the closed special stateid [ Upstream commit ae254dac721d44c0bfebe2795df87459e2e88219 ] Prevent the use of the closed (invalid) special stateid by clients. Signed-off-by: Andrew Elble Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 799c18cba6a5d..c548190ef5ef0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -72,6 +72,7 @@ static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); @@ -4869,7 +4870,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { @@ -4927,7 +4929,8 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, else if (typemask & NFS4_DELEG_STID) typemask |= NFS4_REVOKED_DELEG_STID; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { From 3b7742374f3e4d2a9891b02b13c69796d7bda090 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 13 Nov 2017 07:25:40 +0300 Subject: [PATCH 113/247] lockd: fix "list_add double add" caused by legacy signal interface [ Upstream commit 81833de1a46edce9ca20cfe079872ac1c20ef359 ] restart_grace() uses hardcoded init_net. It can cause to "list_add double add" in following scenario: 1) nfsd and lockd was started in several net namespaces 2) nfsd in init_net was stopped (lockd was not stopped because it have users from another net namespaces) 3) lockd got signal, called restart_grace() -> set_grace_period() and enabled lock_manager in hardcoded init_net. 4) nfsd in init_net is started again, its lockd_up() calls set_grace_period() and tries to add lock_manager into init_net 2nd time. Jeff Layton suggest: "Make it safe to call locks_start_grace multiple times on the same lock_manager. If it's already on the global grace_list, then don't try to add it again. (But we don't intentionally add twice, so for now we WARN about that case.) With this change, we also need to ensure that the nfsd4 lock manager initializes the list before we call locks_start_grace. While we're at it, move the rest of the nfsd_net initialization into nfs4_state_create_net. I see no reason to have it spread over two functions like it is today." Suggested patch was updated to generate warning in described situation. Suggested-by: Jeff Layton Signed-off-by: Vasily Averin Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/nfs_common/grace.c | 6 +++++- fs/nfsd/nfs4state.c | 7 ++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index e280e9c9ebf33..77d136ac89099 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm) struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, grace_list); + if (list_empty(&lm->list)) + list_add(&lm->list, grace_list); + else + WARN(1, "double list_add attempt detected in net %x %s\n", + net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c548190ef5ef0..f463c4e0b2ea5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7012,6 +7012,10 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); @@ -7069,9 +7073,6 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nn->boot_time = get_seconds(); - nn->grace_ended = false; - nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", From d5a746cf47975a30a8ffaa799db93881512530e8 Mon Sep 17 00:00:00 2001 From: Robert Lippert Date: Mon, 27 Nov 2017 15:51:55 -0800 Subject: [PATCH 114/247] hwmon: (pmbus) Use 64bit math for DIRECT format values [ Upstream commit bd467e4eababe4c04272c1e646f066db02734c79 ] Power values in the 100s of watt range can easily blow past 32bit math limits when processing everything in microwatts. Use 64bit math instead to avoid these issues on common 32bit ARM BMC platforms. Fixes: 442aba78728e ("hwmon: PMBus device driver") Signed-off-by: Robert Lippert Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/pmbus/pmbus_core.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c index ba59eaef2e075..d013acf3f83a3 100644 --- a/drivers/hwmon/pmbus/pmbus_core.c +++ b/drivers/hwmon/pmbus/pmbus_core.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -476,8 +477,8 @@ static long pmbus_reg2data_linear(struct pmbus_data *data, static long pmbus_reg2data_direct(struct pmbus_data *data, struct pmbus_sensor *sensor) { - long val = (s16) sensor->data; - long m, b, R; + s64 b, val = (s16)sensor->data; + s32 m, R; m = data->info->m[sensor->class]; b = data->info->b[sensor->class]; @@ -505,11 +506,12 @@ static long pmbus_reg2data_direct(struct pmbus_data *data, R--; } while (R < 0) { - val = DIV_ROUND_CLOSEST(val, 10); + val = div_s64(val + 5LL, 10L); /* round closest */ R++; } - return (val - b) / m; + val = div_s64(val - b, m); + return clamp_val(val, LONG_MIN, LONG_MAX); } /* @@ -629,7 +631,8 @@ static u16 pmbus_data2reg_linear(struct pmbus_data *data, static u16 pmbus_data2reg_direct(struct pmbus_data *data, struct pmbus_sensor *sensor, long val) { - long m, b, R; + s64 b, val64 = val; + s32 m, R; m = data->info->m[sensor->class]; b = data->info->b[sensor->class]; @@ -646,18 +649,18 @@ static u16 pmbus_data2reg_direct(struct pmbus_data *data, R -= 3; /* Adjust R and b for data in milli-units */ b *= 1000; } - val = val * m + b; + val64 = val64 * m + b; while (R > 0) { - val *= 10; + val64 *= 10; R--; } while (R < 0) { - val = DIV_ROUND_CLOSEST(val, 10); + val64 = div_s64(val64 + 5LL, 10L); /* round closest */ R++; } - return val; + return (u16)clamp_val(val64, S16_MIN, S16_MAX); } static u16 pmbus_data2reg_vid(struct pmbus_data *data, From 98ae1ca7534e47508719ee54657f01df55437f62 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 21 Nov 2017 20:46:49 +0100 Subject: [PATCH 115/247] bnxt_en: Fix an error handling path in 'bnxt_get_module_eeprom()' [ Upstream commit dea521a2b9f96e905fa2bb2f95e23ec00c2ec436 ] Error code returned by 'bnxt_read_sfp_module_eeprom_info()' is handled a few lines above when reading the A0 portion of the EEPROM. The same should be done when reading the A2 portion of the EEPROM. In order to correctly propagate an error, update 'rc' in this 2nd call as well, otherwise 0 (success) is returned. Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index a7e04ff4eaede..cde4b96f3153f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1843,8 +1843,8 @@ static int bnxt_get_module_eeprom(struct net_device *dev, /* Read A2 portion of the EEPROM */ if (length) { start -= ETH_MODULE_SFF_8436_LEN; - bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A2, 1, start, - length, data); + rc = bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A2, 1, + start, length, data); } return rc; } From 54a1fdff1b09465d4f5f06bff53fe2b1e0434673 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 23 Nov 2017 17:13:40 +0100 Subject: [PATCH 116/247] xfs: fortify xfs_alloc_buftarg error handling [ Upstream commit d210a9874b8f6166579408131cb74495caff1958 ] percpu_counter_init failure path doesn't clean up &btp->bt_lru list. Call list_lru_destroy in that error path. Similarly register_shrinker error path is not handled. While it is unlikely to trigger these error path, it is not impossible especially the later might fail with large NUMAs. Let's handle the failure to make the code more robust. Noticed-by: Tetsuo Handa Signed-off-by: Michal Hocko Acked-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_buf.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index eca7baecc9f08..3f45d9867e10a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1785,22 +1785,27 @@ xfs_alloc_buftarg( btp->bt_bdi = blk_get_backing_dev_info(bdev); if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; + goto error_free; if (list_lru_init(&btp->bt_lru)) - goto error; + goto error_free; if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error; + goto error_lru; btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - register_shrinker(&btp->bt_shrinker); + if (register_shrinker(&btp->bt_shrinker)) + goto error_pcpu; return btp; -error: +error_pcpu: + percpu_counter_destroy(&btp->bt_io_count); +error_lru: + list_lru_destroy(&btp->bt_lru); +error_free: kmem_free(btp); return NULL; } From e11616d5e6c3209196c697e773cb41362419adf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 24 Nov 2017 11:39:30 +0100 Subject: [PATCH 117/247] drm/amdgpu: don't try to move pinned BOs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6edc6910ba4cd6eab309263539c8f09b8ad772bf ] Never try to move pinned BOs during CS. Signed-off-by: Christian König Reviewed-by: Michel Dänzer Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index f26d1fd53bef1..cb505f66d3aab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -416,6 +416,10 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p, if (candidate == lobj) break; + /* We can't move pinned BOs here */ + if (bo->pin_count) + continue; + other = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type); /* Check if this BO is in one of the domains we need space for */ From d47907bcac94cc455a62d50e6c71ae5341cee0ea Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 Nov 2017 11:01:09 +0100 Subject: [PATCH 118/247] net: ethernet: xilinx: Mark XILINX_LL_TEMAC broken on 64-bit [ Upstream commit 15bfe05c8d6386f1a90e9340d15336e85e32aad6 ] On 64-bit (e.g. powerpc64/allmodconfig): drivers/net/ethernet/xilinx/ll_temac_main.c: In function 'temac_start_xmit_done': drivers/net/ethernet/xilinx/ll_temac_main.c:633:22: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] dev_kfree_skb_irq((struct sk_buff *)cur_p->app4); ^ cdmac_bd.app4 is u32, so it is too small to hold a kernel pointer. Note that several other fields in struct cdmac_bd are also too small to hold physical addresses on 64-bit platforms. Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/xilinx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/xilinx/Kconfig b/drivers/net/ethernet/xilinx/Kconfig index 6d68c8a8f4f2a..da4ec575ccf9b 100644 --- a/drivers/net/ethernet/xilinx/Kconfig +++ b/drivers/net/ethernet/xilinx/Kconfig @@ -34,6 +34,7 @@ config XILINX_AXI_EMAC config XILINX_LL_TEMAC tristate "Xilinx LL TEMAC (LocalLink Tri-mode Ethernet MAC) driver" depends on (PPC || MICROBLAZE) + depends on !64BIT || BROKEN select PHYLIB ---help--- This driver supports the Xilinx 10/100/1000 LocalLink TEMAC From c4ecc2f696430bbd6c4b8bf3f114ad9a86e2c9b1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 29 Nov 2017 22:34:50 +0900 Subject: [PATCH 119/247] quota: Check for register_shrinker() failure. [ Upstream commit 88bc0ede8d35edc969350852894dc864a2dc1859 ] register_shrinker() might return -ENOMEM error since Linux 3.12. Call panic() as with other failure checks in this function if register_shrinker() failed. Fixes: 1d3d4437eae1 ("vmscan: per-node deferred work") Signed-off-by: Tetsuo Handa Cc: Jan Kara Cc: Michal Hocko Reviewed-by: Michal Hocko Signed-off-by: Jan Kara Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/quota/dquot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1bfac28b7e7df..f9246ac4eef81 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2985,7 +2985,8 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - register_shrinker(&dqcache_shrinker); + if (register_shrinker(&dqcache_shrinker)) + panic("Cannot register dquot shrinker"); return 0; } From d2a67f7afcad12bc958f947118e94d5af05aa218 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 24 Nov 2017 12:00:24 -0500 Subject: [PATCH 120/247] SUNRPC: Allow connect to return EHOSTUNREACH [ Upstream commit 4ba161a793d5f43757c35feff258d9f20a082940 ] Reported-by: Dmitry Vyukov Signed-off-by: Trond Myklebust Tested-by: Dmitry Vyukov Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e01c825bc6833..d24d14ea8ba41 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2381,6 +2381,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: /* retry with existing socket, after a delay */ From 7b8623841f2b3845eaa5292f0b15e7f9dd0c5f42 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 29 Nov 2017 16:11:08 -0800 Subject: [PATCH 121/247] kmemleak: add scheduling point to kmemleak_scan() [ Upstream commit bde5f6bc68db51128f875a756e9082a6c6ff7b4c ] kmemleak_scan() will scan struct page for each node and it can be really large and resulting in a soft lockup. We have seen a soft lockup when do scan while compile kernel: watchdog: BUG: soft lockup - CPU#53 stuck for 22s! [bash:10287] [...] Call Trace: kmemleak_scan+0x21a/0x4c0 kmemleak_write+0x312/0x350 full_proxy_write+0x5a/0xa0 __vfs_write+0x33/0x150 vfs_write+0xad/0x1a0 SyS_write+0x52/0xc0 do_syscall_64+0x61/0x1a0 entry_SYSCALL64_slow_path+0x25/0x25 Fix this by adding cond_resched every MAX_SCAN_SIZE. Link: http://lkml.kernel.org/r/1511439788-20099-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Suggested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- mm/kmemleak.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d1380ed93fdf0..20cf3be9a5e82 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1442,6 +1442,8 @@ static void kmemleak_scan(void) if (page_count(page) == 0) continue; scan_block(page, page + 1, NULL); + if (!(pfn % (MAX_SCAN_SIZE / sizeof(*page)))) + cond_resched(); } } put_online_mems(); From 5f6a0441ca0d5dbf64ee56743028b8a9ca932e48 Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 7 Nov 2017 19:56:19 +0300 Subject: [PATCH 122/247] drm/bridge: tc358767: do no fail on hi-res displays [ Upstream commit cffd2b16c01c3431a7a7dd62e722af33490fc436 ] Do not fail data rates higher than 2.7 and more than 2 lanes. Try to fall back to 2.7Gbps and 2 lanes. Acked-by: Philipp Zabel Reviewed-by: Andrzej Hajda Signed-off-by: Andrey Gusakov Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-2-git-send-email-andrey.gusakov@cogentembedded.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358767.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 44d476ea6d2e0..14b4e58f727fb 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -603,8 +603,15 @@ static int tc_get_display_props(struct tc_data *tc) ret = drm_dp_link_probe(&tc->aux, &tc->link.base); if (ret < 0) goto err_dpcd_read; - if ((tc->link.base.rate != 162000) && (tc->link.base.rate != 270000)) - goto err_dpcd_inval; + if (tc->link.base.rate != 162000 && tc->link.base.rate != 270000) { + dev_dbg(tc->dev, "Falling to 2.7 Gbps rate\n"); + tc->link.base.rate = 270000; + } + + if (tc->link.base.num_lanes > 2) { + dev_dbg(tc->dev, "Falling to 2 lanes\n"); + tc->link.base.num_lanes = 2; + } ret = drm_dp_dpcd_readb(&tc->aux, DP_MAX_DOWNSPREAD, tmp); if (ret < 0) @@ -637,9 +644,6 @@ static int tc_get_display_props(struct tc_data *tc) err_dpcd_read: dev_err(tc->dev, "failed to read DPCD: %d\n", ret); return ret; -err_dpcd_inval: - dev_err(tc->dev, "invalid DPCD\n"); - return -EINVAL; } static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) From 8d4bfe89aacf5dbe66fa6dd26587c7d11a9f6a7a Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 7 Nov 2017 19:56:20 +0300 Subject: [PATCH 123/247] drm/bridge: tc358767: filter out too high modes [ Upstream commit 99fc8e963a4c0203dba26a77cf737db6081bca14 ] Pixel clock limitation for DPI is 154 MHz. Do not accept modes with higher pixel clock rate. Reviewed-by: Andrzej Hajda Signed-off-by: Andrey Gusakov Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-3-git-send-email-andrey.gusakov@cogentembedded.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358767.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 14b4e58f727fb..97087f9623638 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -1109,7 +1109,10 @@ static bool tc_bridge_mode_fixup(struct drm_bridge *bridge, static int tc_connector_mode_valid(struct drm_connector *connector, struct drm_display_mode *mode) { - /* Accept any mode */ + /* DPI interface clock limitation: upto 154 MHz */ + if (mode->clock > 154000) + return MODE_CLOCK_HIGH; + return MODE_OK; } From c55908604eccc659840e52a2c17614cfe90178bb Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 7 Nov 2017 19:56:21 +0300 Subject: [PATCH 124/247] drm/bridge: tc358767: fix DP0_MISC register set [ Upstream commit f3b8adbe1911f66fd3cab1aaa74f0f66b7ceda25 ] Remove shift from TU_SIZE_RECOMMENDED define as it used to calculate max_tu_symbols. Acked-by: Philipp Zabel Signed-off-by: Andrey Gusakov Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-4-git-send-email-andrey.gusakov@cogentembedded.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358767.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 97087f9623638..4d348800222eb 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -97,7 +97,7 @@ #define DP0_ACTIVEVAL 0x0650 #define DP0_SYNCVAL 0x0654 #define DP0_MISC 0x0658 -#define TU_SIZE_RECOMMENDED (0x3f << 16) /* LSCLK cycles per TU */ +#define TU_SIZE_RECOMMENDED (63) /* LSCLK cycles per TU */ #define BPC_6 (0 << 5) #define BPC_8 (1 << 5) @@ -716,7 +716,8 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) * Must be less than tu_size. */ max_tu_symbol = TU_SIZE_RECOMMENDED - 1; - tc_write(DP0_MISC, (max_tu_symbol << 23) | TU_SIZE_RECOMMENDED | BPC_8); + tc_write(DP0_MISC, (max_tu_symbol << 23) | (TU_SIZE_RECOMMENDED << 16) | + BPC_8); return 0; err: From 1bdfc52c331a149e393fb199f7e361fcb8e3c9ad Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 7 Nov 2017 19:56:22 +0300 Subject: [PATCH 125/247] drm/bridge: tc358767: fix timing calculations [ Upstream commit 66d1c3b94d5d59e4325e61a78d520f92c043d645 ] Fields in HTIM01 and HTIM02 regs should be even. Recomended thresh_dly value is max_tu_symbol. Remove set of VPCTRL0.VSDELAY as it is related to DSI input interface. Currently driver supports only DPI. Acked-by: Philipp Zabel Signed-off-by: Andrey Gusakov Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-5-git-send-email-andrey.gusakov@cogentembedded.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358767.c | 34 ++++++++++++++++++------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 4d348800222eb..821b93f68eeff 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -659,6 +659,14 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) int lower_margin = mode->vsync_start - mode->vdisplay; int vsync_len = mode->vsync_end - mode->vsync_start; + /* + * Recommended maximum number of symbols transferred in a transfer unit: + * DIV_ROUND_UP((input active video bandwidth in bytes) * tu_size, + * (output active video bandwidth in bytes)) + * Must be less than tu_size. + */ + max_tu_symbol = TU_SIZE_RECOMMENDED - 1; + dev_dbg(tc->dev, "set mode %dx%d\n", mode->hdisplay, mode->vdisplay); dev_dbg(tc->dev, "H margin %d,%d sync %d\n", @@ -668,13 +676,18 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) dev_dbg(tc->dev, "total: %dx%d\n", mode->htotal, mode->vtotal); - /* LCD Ctl Frame Size */ - tc_write(VPCTRL0, (0x40 << 20) /* VSDELAY */ | + /* + * LCD Ctl Frame Size + * datasheet is not clear of vsdelay in case of DPI + * assume we do not need any delay when DPI is a source of + * sync signals + */ + tc_write(VPCTRL0, (0 << 20) /* VSDELAY */ | OPXLFMT_RGB888 | FRMSYNC_DISABLED | MSF_DISABLED); - tc_write(HTIM01, (left_margin << 16) | /* H back porch */ - (hsync_len << 0)); /* Hsync */ - tc_write(HTIM02, (right_margin << 16) | /* H front porch */ - (mode->hdisplay << 0)); /* width */ + tc_write(HTIM01, (ALIGN(left_margin, 2) << 16) | /* H back porch */ + (ALIGN(hsync_len, 2) << 0)); /* Hsync */ + tc_write(HTIM02, (ALIGN(right_margin, 2) << 16) | /* H front porch */ + (ALIGN(mode->hdisplay, 2) << 0)); /* width */ tc_write(VTIM01, (upper_margin << 16) | /* V back porch */ (vsync_len << 0)); /* Vsync */ tc_write(VTIM02, (lower_margin << 16) | /* V front porch */ @@ -693,7 +706,7 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) /* DP Main Stream Attributes */ vid_sync_dly = hsync_len + left_margin + mode->hdisplay; tc_write(DP0_VIDSYNCDELAY, - (0x003e << 16) | /* thresh_dly */ + (max_tu_symbol << 16) | /* thresh_dly */ (vid_sync_dly << 0)); tc_write(DP0_TOTALVAL, (mode->vtotal << 16) | (mode->htotal)); @@ -709,13 +722,6 @@ static int tc_set_video_mode(struct tc_data *tc, struct drm_display_mode *mode) tc_write(DPIPXLFMT, VS_POL_ACTIVE_LOW | HS_POL_ACTIVE_LOW | DE_POL_ACTIVE_HIGH | SUB_CFG_TYPE_CONFIG1 | DPI_BPP_RGB888); - /* - * Recommended maximum number of symbols transferred in a transfer unit: - * DIV_ROUND_UP((input active video bandwidth in bytes) * tu_size, - * (output active video bandwidth in bytes)) - * Must be less than tu_size. - */ - max_tu_symbol = TU_SIZE_RECOMMENDED - 1; tc_write(DP0_MISC, (max_tu_symbol << 23) | (TU_SIZE_RECOMMENDED << 16) | BPC_8); From 8ae615fecee51e7ec7ee62898b2d7072764e18d6 Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 7 Nov 2017 19:56:23 +0300 Subject: [PATCH 126/247] drm/bridge: tc358767: fix AUXDATAn registers access [ Upstream commit 9217c1abbc145a77d65c476cf2004a3df02104c7 ] First four bytes should go to DP0_AUXWDATA0. Due to bug if len > 4 first four bytes was writen to DP0_AUXWDATA1 and all data get shifted by 4 bytes. Fix it. Acked-by: Philipp Zabel Signed-off-by: Andrey Gusakov Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-6-git-send-email-andrey.gusakov@cogentembedded.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358767.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 821b93f68eeff..6d07728649358 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -318,7 +318,7 @@ static ssize_t tc_aux_transfer(struct drm_dp_aux *aux, tmp = (tmp << 8) | buf[i]; i++; if (((i % 4) == 0) || (i == size)) { - tc_write(DP0_AUXWDATA(i >> 2), tmp); + tc_write(DP0_AUXWDATA((i - 1) >> 2), tmp); tmp = 0; } } From f7170eb80aff7daf2221c94a92ba88dae5f971be Mon Sep 17 00:00:00 2001 From: Andrey Gusakov Date: Tue, 7 Nov 2017 19:56:24 +0300 Subject: [PATCH 127/247] drm/bridge: tc358767: fix 1-lane behavior [ Upstream commit 4dbd6c03fbf88299c573d676838896c6e06aade2 ] Use drm_dp_channel_eq_ok helper Acked-by: Philipp Zabel Signed-off-by: Andrey Gusakov Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1510073785-16108-7-git-send-email-andrey.gusakov@cogentembedded.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358767.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 6d07728649358..f64f35cdc2ff1 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -819,8 +819,6 @@ static int tc_main_link_setup(struct tc_data *tc) unsigned int rate; u32 dp_phy_ctrl; int timeout; - bool aligned; - bool ready; u32 value; int ret; u8 tmp[8]; @@ -965,16 +963,15 @@ static int tc_main_link_setup(struct tc_data *tc) ret = drm_dp_dpcd_read_link_status(aux, tmp + 2); if (ret < 0) goto err_dpcd_read; - ready = (tmp[2] == ((DP_CHANNEL_EQ_BITS << 4) | /* Lane1 */ - DP_CHANNEL_EQ_BITS)); /* Lane0 */ - aligned = tmp[4] & DP_INTERLANE_ALIGN_DONE; - } while ((--timeout) && !(ready && aligned)); + } while ((--timeout) && + !(drm_dp_channel_eq_ok(tmp + 2, tc->link.base.num_lanes))); if (timeout == 0) { /* Read DPCD 0x200-0x201 */ ret = drm_dp_dpcd_read(aux, DP_SINK_COUNT, tmp, 2); if (ret < 0) goto err_dpcd_read; + dev_err(dev, "channel(s) EQ not ok\n"); dev_info(dev, "0x0200 SINK_COUNT: 0x%02x\n", tmp[0]); dev_info(dev, "0x0201 DEVICE_SERVICE_IRQ_VECTOR: 0x%02x\n", tmp[1]); @@ -985,10 +982,6 @@ static int tc_main_link_setup(struct tc_data *tc) dev_info(dev, "0x0206 ADJUST_REQUEST_LANE0_1: 0x%02x\n", tmp[6]); - if (!ready) - dev_err(dev, "Lane0/1 not ready\n"); - if (!aligned) - dev_err(dev, "Lane0/1 not aligned\n"); return -EAGAIN; } From 9301165c46234b06f482589dddd765e33d1136f4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 24 Sep 2017 08:01:03 +0200 Subject: [PATCH 128/247] drm/omap: Fix error handling path in 'omap_dmm_probe()' [ Upstream commit 8677b1ac2db021ab30bb1fa34f1e56ebe0051ec3 ] If we don't find a matching device node, we must free the memory allocated in 'omap_dmm' a few lines above. Fixes: 7cb0d6c17b96 ("drm/omap: fix TILER on OMAP5") Signed-off-by: Christophe JAILLET Signed-off-by: Tomi Valkeinen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/omapdrm/omap_dmm_tiler.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index 4ceed7a9762f1..4b83e9eeab06d 100644 --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c @@ -638,7 +638,8 @@ static int omap_dmm_probe(struct platform_device *dev) match = of_match_node(dmm_of_match, dev->dev.of_node); if (!match) { dev_err(&dev->dev, "failed to find matching device node\n"); - return -ENODEV; + ret = -ENODEV; + goto fail; } omap_dmm->plat_data = match->data; From d96024440ee5c6126bd080e84e6e3ae5a89a39c5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 27 Nov 2017 09:50:17 -0800 Subject: [PATCH 129/247] xfs: ubsan fixes [ Upstream commit 22a6c83777ac7c17d6c63891beeeac24cf5da450 ] Fix some complaints from the UBSAN about signed integer addition overflows. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_aops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d31cd1ebd8e98..f3acecf3869dd 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -391,7 +391,7 @@ xfs_map_blocks( (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + count > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes) count = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1295,7 +1295,7 @@ xfs_map_trim_size( if (mapping_size > size) mapping_size = size; if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { + (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) { /* limit mapping to block that spans EOF */ mapping_size = roundup_64(i_size_read(inode) - offset, i_blocksize(inode)); @@ -1347,7 +1347,7 @@ __xfs_get_blocks( lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); - if (offset + size > mp->m_super->s_maxbytes) + if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes) size = mp->m_super->s_maxbytes - offset; end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); From fde77c712ac003f023ea6ba3da30a0284c71ad92 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 28 Nov 2017 08:54:10 -0800 Subject: [PATCH 130/247] xfs: Properly retry failed dquot items in case of error during buffer writeback [ Upstream commit 373b0589dc8d58bc09c9a28d03611ae4fb216057 ] Once the inode item writeback errors is already fixed, it's time to fix the same problem in dquot code. Although there were no reports of users hitting this bug in dquot code (at least none I've seen), the bug is there and I was already planning to fix it when the correct approach to fix the inodes part was decided. This patch aims to fix the same problem in dquot code, regarding failed buffers being unable to be resubmitted once they are flush locked. Tested with the recently test-case sent to fstests list by Hou Tao. Reviewed-by: Brian Foster Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_dquot.c | 14 +++++++++++--- fs/xfs/xfs_dquot_item.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9d06cc30e875e..7a7b3ccf22731 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1004,14 +1004,22 @@ xfs_qm_dqflush_done( * holding the lock before removing the dquot from the AIL. */ if ((lip->li_flags & XFS_LI_IN_AIL) && - lip->li_lsn == qip->qli_flush_lsn) { + ((lip->li_lsn == qip->qli_flush_lsn) || + (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->xa_lock); - if (lip->li_lsn == qip->qli_flush_lsn) + if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); - else + } else { + /* + * Clear the failed state since we are about to drop the + * flush lock + */ + if (lip->li_flags & XFS_LI_FAILED) + xfs_clear_li_failed(lip); spin_unlock(&ailp->xa_lock); + } } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 2c7a1629e064b..664dea105e76f 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -137,6 +137,26 @@ xfs_qm_dqunpin_wait( wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * this informs the AIL that the dquot is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_dquot_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + struct xfs_dquot *dqp; + + dqp = DQUOT_ITEM(lip)->qli_dquot; + ASSERT(!completion_done(&dqp->q_flush)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, @@ -144,13 +164,28 @@ xfs_qm_dquot_logitem_push( __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -242,7 +277,8 @@ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_unlock = xfs_qm_dquot_logitem_unlock, .iop_committed = xfs_qm_dquot_logitem_committed, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing + .iop_committing = xfs_qm_dquot_logitem_committing, + .iop_error = xfs_dquot_item_error }; /* From fa64914313c1437c7ba843581a0b09bc2870bec3 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 17 Nov 2017 19:14:55 -0200 Subject: [PATCH 131/247] scsi: aacraid: Prevent crash in case of free interrupt during scsi EH path [ Upstream commit e4717292ddebcfe231651b5aff9fa19ca158d178 ] As part of the scsi EH path, aacraid performs a reinitialization of the adapter, which encompass freeing resources and IRQs, NULLifying lots of pointers, and then initialize it all over again. We've identified a problem during the free IRQ portion of this path if CONFIG_DEBUG_SHIRQ is enabled on kernel config file. Happens that, in case this flag was set, right after free_irq() effectively clears the interrupt, it checks if it was requested as IRQF_SHARED. In positive case, it performs another call to the IRQ handler on driver. Problem is: since aacraid currently free some resources *before* freeing the IRQ, once free_irq() path calls the handler again (due to CONFIG_DEBUG_SHIRQ), aacraid crashes due to NULL pointer dereference with the following trace: aac_src_intr_message+0xf8/0x740 [aacraid] __free_irq+0x33c/0x4a0 free_irq+0x78/0xb0 aac_free_irq+0x13c/0x150 [aacraid] aac_reset_adapter+0x2e8/0x970 [aacraid] aac_eh_reset+0x3a8/0x5d0 [aacraid] scsi_try_host_reset+0x74/0x180 scsi_eh_ready_devs+0xc70/0x1510 scsi_error_handler+0x624/0xa20 This patch prevents the crash by changing the order of the deinitialization in this path of aacraid: first we clear the IRQ, then we free other resources. No functional change intended. Signed-off-by: Guilherme G. Piccoli Reviewed-by: Raghava Aditya Renukunta Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/aacraid/commsup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index 0aeecec1f5eaf..e2962f15c189a 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -1416,13 +1416,13 @@ static int _aac_reset_adapter(struct aac_dev *aac, int forced) * will ensure that i/o is queisced and the card is flushed in that * case. */ + aac_free_irq(aac); aac_fib_map_free(aac); pci_free_consistent(aac->pdev, aac->comm_size, aac->comm_addr, aac->comm_phys); aac->comm_addr = NULL; aac->comm_phys = 0; kfree(aac->queues); aac->queues = NULL; - aac_free_irq(aac); kfree(aac->fsa_dev); aac->fsa_dev = NULL; quirks = aac_get_driver_ident(index)->quirks; From a248dc6a55b782ec8a86e3a7ee2750e60f428c45 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 20 Nov 2017 08:12:29 -0600 Subject: [PATCH 132/247] scsi: ufs: ufshcd: fix potential NULL pointer dereference in ufshcd_config_vreg [ Upstream commit 727535903bea924c4f73abb202c4b3e85fff0ca4 ] _vreg_ is being dereferenced before it is null checked, hence there is a potential null pointer dereference. Fix this by moving the pointer dereference after _vreg_ has been null checked. This issue was detected with the help of Coccinelle. Fixes: aa4976130934 ("ufs: Add regulator enable support") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Subhash Jadavani Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/ufs/ufshcd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 530034bc2d13e..2e9341233f666 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -5327,12 +5327,15 @@ static int ufshcd_config_vreg(struct device *dev, struct ufs_vreg *vreg, bool on) { int ret = 0; - struct regulator *reg = vreg->reg; - const char *name = vreg->name; + struct regulator *reg; + const char *name; int min_uV, uA_load; BUG_ON(!vreg); + reg = vreg->reg; + name = vreg->name; + if (regulator_count_voltages(reg) > 0) { min_uV = on ? vreg->min_uV : 0; ret = regulator_set_voltage(reg, min_uV, vreg->max_uV); From 9adb2a0f9a470b59ccca26e07ed279c11200f0db Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 15 Nov 2017 14:12:30 +0200 Subject: [PATCH 133/247] iwlwifi: mvm: fix the TX queue hang timeout for MONITOR vif type [ Upstream commit d1b275ffec459c5ae12b5c7086c84175696e5a9f ] The MONITOR type is missing in the interface type switch. Add it. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/intel/iwlwifi/mvm/utils.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c index d04babd99b538..ff5ce1ed03c42 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c @@ -1040,6 +1040,8 @@ unsigned int iwl_mvm_get_wd_timeout(struct iwl_mvm *mvm, return le32_to_cpu(txq_timer->p2p_go); case NL80211_IFTYPE_P2P_DEVICE: return le32_to_cpu(txq_timer->p2p_device); + case NL80211_IFTYPE_MONITOR: + return default_timeout; default: WARN_ON(1); return mvm->cfg->base_params->wd_timeout; From c16c193e3abcf6ce98ac6d1e87c796070fc4fbb7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 7 Nov 2017 11:10:29 -0800 Subject: [PATCH 134/247] ARM: dts: NSP: Fix PPI interrupt types [ Upstream commit 5f1aa51c7a1eef1c5a60b8334e32c89904964245 ] Booting a kernel results in the kernel warning us about the following PPI interrupts configuration: [ 0.105127] smp: Bringing up secondary CPUs ... [ 0.110545] GIC: PPI11 is secure or misconfigured [ 0.110551] GIC: PPI13 is secure or misconfigured Fix this by using the appropriate edge configuration for PPI11 and PPI13, this is similar to what was fixed for Northstar (BCM5301X) in commit 0e34079cd1f6 ("ARM: dts: BCM5301X: Correct GIC_PPI interrupt flags"). Fixes: 7b2e987de207 ("ARM: NSP: add minimal Northstar Plus device tree") Fixes: 1a9d53cabaf4 ("ARM: dts: NSP: Add TWD Support to DT") Acked-by: Jon Mason Signed-off-by: Florian Fainelli Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/bcm-nsp.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/bcm-nsp.dtsi b/arch/arm/boot/dts/bcm-nsp.dtsi index 7c9e0fae9bb9b..65e0db1d3bd7b 100644 --- a/arch/arm/boot/dts/bcm-nsp.dtsi +++ b/arch/arm/boot/dts/bcm-nsp.dtsi @@ -85,7 +85,7 @@ timer@20200 { compatible = "arm,cortex-a9-global-timer"; reg = <0x20200 0x100>; - interrupts = ; + interrupts = ; clocks = <&periph_clk>; }; @@ -93,7 +93,7 @@ compatible = "arm,cortex-a9-twd-timer"; reg = <0x20600 0x20>; interrupts = ; + IRQ_TYPE_EDGE_RISING)>; clocks = <&periph_clk>; }; From b4bfc8ef594a459a23034682a464b6212408f3f0 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sun, 16 Apr 2017 02:51:16 -0400 Subject: [PATCH 135/247] media: usbtv: add a new usbid [ Upstream commit 04226916d2360f56d57ad00bc48d2d1854d1e0b0 ] A new usbid of UTV007 is found in a newly bought device. The usbid is 1f71:3301. The ID on the chip is: UTV007 A89029.1 1520L18K1 Both video and audio is tested with the modified usbtv driver. Signed-off-by: Icenowy Zheng Acked-by: Lubomir Rintel Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/usbtv/usbtv-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/usb/usbtv/usbtv-core.c b/drivers/media/usb/usbtv/usbtv-core.c index dc76fd41e00fc..0324633ede42e 100644 --- a/drivers/media/usb/usbtv/usbtv-core.c +++ b/drivers/media/usb/usbtv/usbtv-core.c @@ -141,6 +141,7 @@ static void usbtv_disconnect(struct usb_interface *intf) static struct usb_device_id usbtv_id_table[] = { { USB_DEVICE(0x1b71, 0x3002) }, + { USB_DEVICE(0x1f71, 0x3301) }, {} }; MODULE_DEVICE_TABLE(usb, usbtv_id_table); From 0e216b0a0f74b06da20c03576daf6e75a77209f5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 14 Nov 2017 16:18:28 +0000 Subject: [PATCH 136/247] usb: gadget: don't dereference g until after it has been null checked [ Upstream commit b2fc059fa549fe6881d4c1f8d698b0f50bcd16ec ] Avoid dereferencing pointer g until after g has been sanity null checked; move the assignment of cdev much later when it is required into a more local scope. Detected by CoverityScan, CID#1222135 ("Dereference before null check") Fixes: b785ea7ce662 ("usb: gadget: composite: fix ep->maxburst initialization") Signed-off-by: Colin Ian King Signed-off-by: Felipe Balbi Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 325bf21ba13bb..406758ed0b23e 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -150,7 +150,6 @@ int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f, struct usb_ep *_ep) { - struct usb_composite_dev *cdev = get_gadget_data(g); struct usb_endpoint_descriptor *chosen_desc = NULL; struct usb_descriptor_header **speed_desc = NULL; @@ -229,8 +228,12 @@ int config_ep_by_speed(struct usb_gadget *g, _ep->maxburst = comp_desc->bMaxBurst + 1; break; default: - if (comp_desc->bMaxBurst != 0) + if (comp_desc->bMaxBurst != 0) { + struct usb_composite_dev *cdev; + + cdev = get_gadget_data(g); ERROR(cdev, "ep0 bMaxBurst must be 0\n"); + } _ep->maxburst = 1; break; } From ace1911b7620af13e83621049e3df9a53d38fd4e Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Sat, 25 Nov 2017 13:32:38 -0600 Subject: [PATCH 137/247] staging: rtl8188eu: Fix incorrect response to SIOCGIWESSID [ Upstream commit b77992d2df9e47144354d1b25328b180afa33442 ] When not associated with an AP, wifi device drivers should respond to the SIOCGIWESSID ioctl with a zero-length string for the SSID, which is the behavior expected by dhcpcd. Currently, this driver returns an error code (-1) from the ioctl call, which causes dhcpcd to assume that the device is not a wireless interface and therefore it fails to work correctly with it thereafter. This problem was reported and tested at https://github.com/lwfinger/rtl8188eu/issues/234. Signed-off-by: Larry Finger Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8188eu/os_dep/ioctl_linux.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c index 4de9dbc93380e..c7bf8ab26192f 100644 --- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c @@ -1397,19 +1397,13 @@ static int rtw_wx_get_essid(struct net_device *dev, if ((check_fwstate(pmlmepriv, _FW_LINKED)) || (check_fwstate(pmlmepriv, WIFI_ADHOC_MASTER_STATE))) { len = pcur_bss->Ssid.SsidLength; - - wrqu->essid.length = len; - memcpy(extra, pcur_bss->Ssid.Ssid, len); - - wrqu->essid.flags = 1; } else { - ret = -1; - goto exit; + len = 0; + *extra = 0; } - -exit: - + wrqu->essid.length = len; + wrqu->essid.flags = 1; return ret; } From f94b238fb856432ae84b18ddde23518b5e5e358b Mon Sep 17 00:00:00 2001 From: Stefan Schake Date: Fri, 29 Dec 2017 17:05:43 +0100 Subject: [PATCH 138/247] drm/vc4: Move IRQ enable to PM path [ Upstream commit ce9caf2f79a5aa170a4b6456a03db639eed9c988 ] We were calling enable_irq on bind, where it was already enabled previously by the IRQ helper. Additionally, dev->irq is not set correctly until after postinstall and so was always zero here, triggering a warning in 4.15. Fix both by moving the enable to the power management resume path, where we know there was a previous disable invocation during suspend. Fixes: 253696ccd613 ("drm/vc4: Account for interrupts in flight") Signed-off-by: Stefan Schake Signed-off-by: Eric Anholt Link: https://patchwork.freedesktop.org/patch/msgid/1514563543-32511-1-git-send-email-stschake@gmail.com Tested-by: Stefan Wahren Reviewed-by: Eric Anholt Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/vc4/vc4_irq.c | 3 --- drivers/gpu/drm/vc4/vc4_v3d.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c index d45a7c0a7915b..d96c084d3a76d 100644 --- a/drivers/gpu/drm/vc4/vc4_irq.c +++ b/drivers/gpu/drm/vc4/vc4_irq.c @@ -208,9 +208,6 @@ vc4_irq_postinstall(struct drm_device *dev) { struct vc4_dev *vc4 = to_vc4_dev(dev); - /* Undo the effects of a previous vc4_irq_uninstall. */ - enable_irq(dev->irq); - /* Enable both the render done and out of memory interrupts. */ V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS); diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c index 7cc346ad9b0ba..ce7c21d250cf9 100644 --- a/drivers/gpu/drm/vc4/vc4_v3d.c +++ b/drivers/gpu/drm/vc4/vc4_v3d.c @@ -173,6 +173,9 @@ static int vc4_v3d_runtime_resume(struct device *dev) struct vc4_dev *vc4 = v3d->vc4; vc4_v3d_init_hw(vc4->dev); + + /* We disabled the IRQ as part of vc4_irq_uninstall in suspend. */ + enable_irq(vc4->dev->irq); vc4_irq_postinstall(vc4->dev); return 0; From 383e0620b70bc654619c12faf3fd3b2384a42a4c Mon Sep 17 00:00:00 2001 From: Dmitry Eremin Date: Thu, 25 Jan 2018 16:51:04 +0300 Subject: [PATCH 139/247] staging: lustre: separate a connection destroy from free struct kib_conn commit 9b046013e5837f8a58453d1e9f8e01d03adb7fe7 upstream. The logic of the original commit 4d99b2581eff ("staging: lustre: avoid intensive reconnecting for ko2iblnd") was assumed conditional free of struct kib_conn if the second argument free_conn in function kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn) is true. But this hunk of code was dropped from original commit. As result the logic works wrong and current code use struct kib_conn after free. > drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c > 3317 kiblnd_destroy_conn(conn, !peer); > ^^^^ Freed always (but should be conditionally) > 3318 > 3319 spin_lock_irqsave(lock, flags); > 3320 if (!peer) > 3321 continue; > 3322 > 3323 conn->ibc_peer = peer; > ^^^^^^^^^^^^^^ Use after free > 3324 if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) > 3325 list_add_tail(&conn->ibc_list, > ^^^^^^^^^^^^^^ Use after free > 3326 &kiblnd_data.kib_reconn_list); > 3327 else > 3328 list_add_tail(&conn->ibc_list, > ^^^^^^^^^^^^^^ Use after free > 3329 &kiblnd_data.kib_reconn_wait); To avoid confusion this fix moved the freeing a struct kib_conn outside of the function kiblnd_destroy_conn() and free as it was intended in original commit. Fixes: 4d99b2581eff ("staging: lustre: avoid intensive reconnecting for ko2iblnd") Signed-off-by: Dmitry Eremin Reviewed-by: Andreas Dilger Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c | 7 +++---- drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h | 2 +- drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 6 ++++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c index 9e88021814528..e8d9db4d8179b 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -824,14 +824,15 @@ struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cm return conn; failed_2: - kiblnd_destroy_conn(conn, true); + kiblnd_destroy_conn(conn); + LIBCFS_FREE(conn, sizeof(*conn)); failed_1: LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); failed_0: return NULL; } -void kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn) +void kiblnd_destroy_conn(struct kib_conn *conn) { struct rdma_cm_id *cmid = conn->ibc_cmid; struct kib_peer *peer = conn->ibc_peer; @@ -894,8 +895,6 @@ void kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn) rdma_destroy_id(cmid); atomic_dec(&net->ibn_nconns); } - - LIBCFS_FREE(conn, sizeof(*conn)); } int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why) diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h index 14576977200f8..30cb2f5b3c159 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h @@ -1018,7 +1018,7 @@ int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why); struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid, int state, int version); -void kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn); +void kiblnd_destroy_conn(struct kib_conn *conn); void kiblnd_close_conn(struct kib_conn *conn, int error); void kiblnd_close_conn_locked(struct kib_conn *conn, int error); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 995f2dac7f264..ea9a0c21d29d3 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3323,11 +3323,13 @@ kiblnd_connd(void *arg) spin_unlock_irqrestore(lock, flags); dropped_lock = 1; - kiblnd_destroy_conn(conn, !peer); + kiblnd_destroy_conn(conn); spin_lock_irqsave(lock, flags); - if (!peer) + if (!peer) { + kfree(conn); continue; + } conn->ibc_peer = peer; if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) From 55eaecffe3d663d02084023b9fc06d5f39b97389 Mon Sep 17 00:00:00 2001 From: Gaurav Kohli Date: Tue, 23 Jan 2018 13:16:34 +0530 Subject: [PATCH 140/247] tty: fix data race between tty_init_dev and flush of buf commit b027e2298bd588d6fa36ed2eda97447fb3eac078 upstream. There can be a race, if receive_buf call comes before tty initialization completes in n_tty_open and tty->disc_data may be NULL. CPU0 CPU1 ---- ---- 000|n_tty_receive_buf_common() n_tty_open() -001|n_tty_receive_buf2() tty_ldisc_open.isra.3() -002|tty_ldisc_receive_buf(inline) tty_ldisc_setup() Using ldisc semaphore lock in tty_init_dev till disc_data initializes completely. Signed-off-by: Gaurav Kohli Reviewed-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 8 +++++++- drivers/tty/tty_ldisc.c | 4 ++-- include/linux/tty.h | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 734a635e73632..8d9f9a803b429 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1543,6 +1543,9 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) "%s: %s driver does not set tty->port. This will crash the kernel later. Fix the driver!\n", __func__, tty->driver->name); + retval = tty_ldisc_lock(tty, 5 * HZ); + if (retval) + goto err_release_lock; tty->port->itty = tty; /* @@ -1553,6 +1556,7 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) retval = tty_ldisc_setup(tty, tty->link); if (retval) goto err_release_tty; + tty_ldisc_unlock(tty); /* Return the tty locked so that it cannot vanish under the caller */ return tty; @@ -1565,9 +1569,11 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) /* call the tty release_tty routine to clean out this slot */ err_release_tty: - tty_unlock(tty); + tty_ldisc_unlock(tty); tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n", retval, idx); +err_release_lock: + tty_unlock(tty); release_tty(tty, idx); return ERR_PTR(retval); } diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index b0500a0a87b86..3a9e2a2fd4c64 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -336,7 +336,7 @@ static inline void __tty_ldisc_unlock(struct tty_struct *tty) ldsem_up_write(&tty->ldisc_sem); } -static int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout) +int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout) { int ret; @@ -347,7 +347,7 @@ static int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout) return 0; } -static void tty_ldisc_unlock(struct tty_struct *tty) +void tty_ldisc_unlock(struct tty_struct *tty) { clear_bit(TTY_LDISC_HALTED, &tty->flags); __tty_ldisc_unlock(tty); diff --git a/include/linux/tty.h b/include/linux/tty.h index 40144f3825163..a41244fe58d02 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -394,6 +394,8 @@ extern struct tty_struct *get_current_tty(void); /* tty_io.c */ extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); +extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout); +extern void tty_ldisc_unlock(struct tty_struct *tty); #else static inline void console_init(void) { } From 2ef0d2ad5ce80410a1f770cf765d3ce6a9ba45e2 Mon Sep 17 00:00:00 2001 From: OKAMOTO Yoshiaki Date: Tue, 16 Jan 2018 09:51:17 +0000 Subject: [PATCH 141/247] usb: option: Add support for FS040U modem commit 69341bd15018da0a662847e210f9b2380c71e623 upstream. FS040U modem is manufactured by omega, and sold by Fujisoft. This patch adds ID of the modem to use option1 driver. Interface 3 is used as qmi_wwan, so the interface is ignored. Signed-off-by: Yoshiaki Okamoto Signed-off-by: Hiroyuki Yamamoto Acked-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index a818c43a02ecc..1799aa058a5b5 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -383,6 +383,9 @@ static void option_instat_callback(struct urb *urb); #define FOUR_G_SYSTEMS_PRODUCT_W14 0x9603 #define FOUR_G_SYSTEMS_PRODUCT_W100 0x9b01 +/* Fujisoft products */ +#define FUJISOFT_PRODUCT_FS040U 0x9b02 + /* iBall 3.5G connect wireless modem */ #define IBALL_3_5G_CONNECT 0x9605 @@ -1897,6 +1900,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(LONGCHEER_VENDOR_ID, FOUR_G_SYSTEMS_PRODUCT_W100), .driver_info = (kernel_ulong_t)&four_g_w100_blacklist }, + {USB_DEVICE(LONGCHEER_VENDOR_ID, FUJISOFT_PRODUCT_FS040U), + .driver_info = (kernel_ulong_t)&net_intf3_blacklist}, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, SPEEDUP_PRODUCT_SU9800, 0xff) }, { USB_DEVICE_INTERFACE_CLASS(LONGCHEER_VENDOR_ID, 0x9801, 0xff), .driver_info = (kernel_ulong_t)&net_intf3_blacklist }, From 068cc4ad2b233eaff1d60552353daee8c047f5e2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 25 Jan 2018 09:48:55 +0100 Subject: [PATCH 142/247] USB: serial: pl2303: new device id for Chilitag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d08dd3f3dd2ae351b793fc5b76abdbf0fd317b12 upstream. This adds a new device id for Chilitag devices to the pl2303 driver. Reported-by: "Chu.Mike [朱堅宜]" Acked-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/pl2303.c | 1 + drivers/usb/serial/pl2303.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index a51b283798500..3da25ad267a26 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -39,6 +39,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_RSAQ2) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_DCU11) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_RSAQ3) }, + { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_CHILITAG) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_PHAROS) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_ALDIGA) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_MMX) }, diff --git a/drivers/usb/serial/pl2303.h b/drivers/usb/serial/pl2303.h index 3b5a15d1dc0dd..123289085ee25 100644 --- a/drivers/usb/serial/pl2303.h +++ b/drivers/usb/serial/pl2303.h @@ -17,6 +17,7 @@ #define PL2303_PRODUCT_ID_DCU11 0x1234 #define PL2303_PRODUCT_ID_PHAROS 0xaaa0 #define PL2303_PRODUCT_ID_RSAQ3 0xaaa2 +#define PL2303_PRODUCT_ID_CHILITAG 0xaaa8 #define PL2303_PRODUCT_ID_ALDIGA 0x0611 #define PL2303_PRODUCT_ID_MMX 0x0612 #define PL2303_PRODUCT_ID_GPRS 0x0609 From c3b1f3137751ca44d90a416f741794465641b522 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 14 Jan 2018 16:09:00 +0100 Subject: [PATCH 143/247] USB: cdc-acm: Do not log urb submission errors on disconnect commit f0386c083c2ce85284dc0b419d7b89c8e567c09f upstream. When disconnected sometimes the cdc-acm driver logs errors like these: [20278.039417] cdc_acm 2-2:2.1: urb 9 failed submission with -19 [20278.042924] cdc_acm 2-2:2.1: urb 10 failed submission with -19 [20278.046449] cdc_acm 2-2:2.1: urb 11 failed submission with -19 [20278.049920] cdc_acm 2-2:2.1: urb 12 failed submission with -19 [20278.053442] cdc_acm 2-2:2.1: urb 13 failed submission with -19 [20278.056915] cdc_acm 2-2:2.1: urb 14 failed submission with -19 [20278.060418] cdc_acm 2-2:2.1: urb 15 failed submission with -19 Silence these by not logging errors when the result is -ENODEV. Signed-off-by: Hans de Goede Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index ea20b2cc189fe..ec8fb55d85f86 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -375,7 +375,7 @@ static int acm_submit_read_urb(struct acm *acm, int index, gfp_t mem_flags) res = usb_submit_urb(acm->read_urbs[index], mem_flags); if (res) { - if (res != -EPERM) { + if (res != -EPERM && res != -ENODEV) { dev_err(&acm->data->dev, "urb %d failed submission with %d\n", index, res); From aa6a93fd0c382cda250876825c981ed270059e50 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 18 Jan 2018 12:13:45 +0100 Subject: [PATCH 144/247] CDC-ACM: apply quirk for card reader commit df1cc78a52491f71d8170d513d0f6f114faa1bda upstream. This devices drops random bytes from messages if you talk to it too fast. Signed-off-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index ec8fb55d85f86..34d23cc99fbd6 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1706,6 +1706,9 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x0ace, 0x1611), /* ZyDAS 56K USB MODEM - new version */ .driver_info = SINGLE_RX_URB, /* firmware bug */ }, + { USB_DEVICE(0x11ca, 0x0201), /* VeriFone Mx870 Gadget Serial */ + .driver_info = SINGLE_RX_URB, + }, { USB_DEVICE(0x22b8, 0x7000), /* Motorola Q Phone */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, From ec719c52af164f2eaff0e5bff3b6074ded70c9bf Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 13 Dec 2017 20:34:36 +0800 Subject: [PATCH 145/247] USB: serial: io_edgeport: fix possible sleep-in-atomic commit c7b8f77872c73f69a16528a9eb87afefcccdc18b upstream. According to drivers/usb/serial/io_edgeport.c, the driver may sleep under a spinlock. The function call path is: edge_bulk_in_callback (acquire the spinlock) process_rcvd_data process_rcvd_status change_port_settings send_iosp_ext_cmd write_cmd_usb usb_kill_urb --> may sleep To fix it, the redundant usb_kill_urb() is removed from the error path after usb_submit_urb() fails. This possible bug is found by my static analysis tool (DSAC) and checked by my code review. Signed-off-by: Jia-Ju Bai Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/io_edgeport.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index 464db17b53285..de61271f2ba3a 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -2215,7 +2215,6 @@ static int write_cmd_usb(struct edgeport_port *edge_port, /* something went wrong */ dev_err(dev, "%s - usb_submit_urb(write command) failed, status = %d\n", __func__, status); - usb_kill_urb(urb); usb_free_urb(urb); atomic_dec(&CmdUrbs); return status; From 4c6fcc3425e19d1f63fd5a7f4f4408ce8f688a48 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 17 Jan 2018 12:07:30 -0700 Subject: [PATCH 146/247] usbip: prevent bind loops on devices attached to vhci_hcd commit ef54cf0c600fb8f5737fb001a9e357edda1a1de8 upstream. usbip host binds to devices attached to vhci_hcd on the same server when user does attach over localhost or specifies the server as the remote. usbip attach -r localhost -b busid or usbip attach -r servername (or server IP) Unbind followed by bind works, however device is left in a bad state with accesses via the attached busid result in errors and system hangs during shutdown. Fix it to check and bail out if the device is already attached to vhci_hcd. Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/src/usbip_bind.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/usb/usbip/src/usbip_bind.c b/tools/usb/usbip/src/usbip_bind.c index fa46141ae68bb..e121cfb1746a4 100644 --- a/tools/usb/usbip/src/usbip_bind.c +++ b/tools/usb/usbip/src/usbip_bind.c @@ -144,6 +144,7 @@ static int bind_device(char *busid) int rc; struct udev *udev; struct udev_device *dev; + const char *devpath; /* Check whether the device with this bus ID exists. */ udev = udev_new(); @@ -152,8 +153,16 @@ static int bind_device(char *busid) err("device with the specified bus ID does not exist"); return -1; } + devpath = udev_device_get_devpath(dev); udev_unref(udev); + /* If the device is already attached to vhci_hcd - bail out */ + if (strstr(devpath, USBIP_VHCI_DRV_NAME)) { + err("bind loop detected: device: %s is attached to %s\n", + devpath, USBIP_VHCI_DRV_NAME); + return -1; + } + rc = unbind_other(busid); if (rc == UNBIND_ST_FAILED) { err("could not unbind driver from device on busid %s", busid); From f80079536bb6b2c733776abb19c1f3e8c48923b6 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 17 Jan 2018 12:08:03 -0700 Subject: [PATCH 147/247] usbip: list: don't list devices attached to vhci_hcd commit ef824501f50846589f02173d73ce3fe6021a9d2a upstream. usbip host lists devices attached to vhci_hcd on the same server when user does attach over localhost or specifies the server as the remote. usbip attach -r localhost -b busid or usbip attach -r servername (or server IP) Fix it to check and not list devices that are attached to vhci_hcd. Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/src/usbip_list.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/usb/usbip/src/usbip_list.c b/tools/usb/usbip/src/usbip_list.c index f1b38e866dd7e..d65a9f4441744 100644 --- a/tools/usb/usbip/src/usbip_list.c +++ b/tools/usb/usbip/src/usbip_list.c @@ -187,6 +187,7 @@ static int list_devices(bool parsable) const char *busid; char product_name[128]; int ret = -1; + const char *devpath; /* Create libudev context. */ udev = udev_new(); @@ -209,6 +210,14 @@ static int list_devices(bool parsable) path = udev_list_entry_get_name(dev_list_entry); dev = udev_device_new_from_syspath(udev, path); + /* Ignore devices attached to vhci_hcd */ + devpath = udev_device_get_devpath(dev); + if (strstr(devpath, USBIP_VHCI_DRV_NAME)) { + dbg("Skip the device %s already attached to %s\n", + devpath, USBIP_VHCI_DRV_NAME); + continue; + } + /* Get device information. */ idVendor = udev_device_get_sysattr_value(dev, "idVendor"); idProduct = udev_device_get_sysattr_value(dev, "idProduct"); From 800de0fab17ac3e8656c86214d23299fedbf6ca4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 18 Jan 2018 14:46:41 +1100 Subject: [PATCH 148/247] USB: serial: simple: add Motorola Tetra driver commit 46fe895e22ab3845515ec06b01eaf1282b342e29 upstream. Add new Motorola Tetra (simple) driver for Motorola Solutions TETRA PEI devices. D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=0cad ProdID=9011 Rev=24.16 S: Manufacturer=Motorola Solutions Inc. S: Product=Motorola Solutions TETRA PEI interface C: #Ifs= 2 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) Note that these devices do not support the CDC SET_CONTROL_LINE_STATE request (for any interface). Reported-by: Max Schulze Tested-by: Max Schulze Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/Kconfig | 1 + drivers/usb/serial/usb-serial-simple.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig index 56ecb8b5115dd..584ae8cbaf1ce 100644 --- a/drivers/usb/serial/Kconfig +++ b/drivers/usb/serial/Kconfig @@ -63,6 +63,7 @@ config USB_SERIAL_SIMPLE - Google USB serial devices - HP4x calculators - a number of Motorola phones + - Motorola Tetra devices - Novatel Wireless GPS receivers - Siemens USB/MPI adapter. - ViVOtech ViVOpay USB device. diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index e98b6e57b703d..6aa7ff2c1cf74 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -80,6 +80,11 @@ DEVICE(vivopay, VIVOPAY_IDS); { USB_DEVICE(0x22b8, 0x2c64) } /* Motorola V950 phone */ DEVICE(moto_modem, MOTO_IDS); +/* Motorola Tetra driver */ +#define MOTOROLA_TETRA_IDS() \ + { USB_DEVICE(0x0cad, 0x9011) } /* Motorola Solutions TETRA PEI */ +DEVICE(motorola_tetra, MOTOROLA_TETRA_IDS); + /* Novatel Wireless GPS driver */ #define NOVATEL_IDS() \ { USB_DEVICE(0x09d7, 0x0100) } /* NovAtel FlexPack GPS */ @@ -110,6 +115,7 @@ static struct usb_serial_driver * const serial_drivers[] = { &google_device, &vivopay_device, &moto_modem_device, + &motorola_tetra_device, &novatel_gps_device, &hp4x_device, &suunto_device, @@ -125,6 +131,7 @@ static const struct usb_device_id id_table[] = { GOOGLE_IDS(), VIVOPAY_IDS(), MOTO_IDS(), + MOTOROLA_TETRA_IDS(), NOVATEL_IDS(), HP4X_IDS(), SUUNTO_IDS(), From f24d171a8100fb50a20fc6bb8c750fe9c452b9dc Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Tue, 9 Jan 2018 12:30:53 +0530 Subject: [PATCH 149/247] usb: f_fs: Prevent gadget unbind if it is already unbound commit ce5bf9a50daf2d9078b505aca1cea22e88ecb94a upstream. Upon usb composition switch there is possibility of ep0 file release happening after gadget driver bind. In case of composition switch from adb to a non-adb composition gadget will never gets bound again resulting into failure of usb device enumeration. Fix this issue by checking FFS_FL_BOUND flag and avoid extra gadget driver unbind if it is already done as part of composition switch. This fixes adb reconnection error reported on Android running v4.4 and above kernel versions. Verified on Hikey running vanilla v4.15-rc7 + few out of tree Mali patches. Reviewed-at: https://android-review.googlesource.com/#/c/582632/ Cc: Felipe Balbi Cc: Greg KH Cc: Michal Nazarewicz Cc: John Stultz Cc: Dmitry Shmidt Cc: Badhri Cc: Android Kernel Team Signed-off-by: Hemant Kumar [AmitP: Cherry-picked it from android-4.14 and updated the commit log] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 7b107e43b1c4e..d90bf57ba30ee 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -3725,7 +3725,8 @@ static void ffs_closed(struct ffs_data *ffs) ci = opts->func_inst.group.cg_item.ci_parent->ci_parent; ffs_dev_unlock(); - unregister_gadget_item(ci); + if (test_bit(FFS_FL_BOUND, &ffs->flags)) + unregister_gadget_item(ci); return; done: ffs_dev_unlock(); From 92e64a1079fac557ebb48ce266d23070924109ef Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 11 Jan 2018 13:10:16 +0100 Subject: [PATCH 150/247] usb: uas: unconditionally bring back host after reset commit cbeef22fd611c4f47c494b821b2b105b8af970bb upstream. Quoting Hans: If we return 1 from our post_reset handler, then our disconnect handler will be called immediately afterwards. Since pre_reset blocks all scsi requests our disconnect handler will then hang in the scsi_remove_host call. This is esp. bad because our disconnect handler hanging for ever also stops the USB subsys from enumerating any new USB devices, causes commands like lsusb to hang, etc. In practice this happens when unplugging some uas devices because the hub code may see the device as needing a warm-reset and calls usb_reset_device before seeing the disconnect. In this case uas_configure_endpoints fails with -ENODEV. We do not want to print an error for this, so this commit also silences the shost_printk for -ENODEV. ENDQUOTE However, if we do that we better drop any unconditional execution and report to the SCSI subsystem that we have undergone a reset but we are not operational now. Signed-off-by: Oliver Neukum Reported-by: Hans de Goede Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/uas.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 9876af4ab64e0..6891e90927757 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -1076,20 +1076,19 @@ static int uas_post_reset(struct usb_interface *intf) return 0; err = uas_configure_endpoints(devinfo); - if (err) { + if (err && err != ENODEV) shost_printk(KERN_ERR, shost, "%s: alloc streams error %d after reset", __func__, err); - return 1; - } + /* we must unblock the host in every case lest we deadlock */ spin_lock_irqsave(shost->host_lock, flags); scsi_report_bus_reset(shost, 0); spin_unlock_irqrestore(shost->host_lock, flags); scsi_unblock_requests(shost); - return 0; + return err ? 1 : 0; } static int uas_suspend(struct usb_interface *intf, pm_message_t message) From 57d4bb1beecb1b34237ddc8a08e819a0da4243b1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 17:50:02 +1100 Subject: [PATCH 151/247] usb/gadget: Fix "high bandwidth" check in usb_gadget_ep_match_desc() commit 11fb37998759c48e4e4c200c974593cbeab25d3e upstream. The current code tries to test for bits that are masked out by usb_endpoint_maxp(). Instead, use the proper accessor to access the new high bandwidth bits. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d685d82dcf487..e97539fc127e6 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -913,7 +913,7 @@ int usb_gadget_ep_match_desc(struct usb_gadget *gadget, return 0; /* "high bandwidth" works only at high speed */ - if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp(desc) & (3<<11)) + if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp_mult(desc) > 1) return 0; switch (type) { From 9df847674ede722e12094def6520fd2ff98452df Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 26 Jan 2018 11:54:35 -0700 Subject: [PATCH 152/247] usbip: vhci_hcd: clear just the USB_PORT_STAT_POWER bit Upstream commit 1c9de5bf4286 ("usbip: vhci-hcd: Add USB3 SuperSpeed support") vhci_hcd clears all the bits port_status bits instead of clearing just the USB_PORT_STAT_POWER bit when it handles ClearPortFeature: USB_PORT_FEAT_POWER. This causes vhci_hcd attach to fail in a bad state, leaving device unusable by the client. The device is still attached and however client can't use it. The problem was fixed as part of larger change to add USB3 Super Speed support. This patch backports just the change to clear the USB_PORT_STAT_POWER. Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vhci_hcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 7f161b0951768..dbe615ba07c95 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -300,7 +300,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, case USB_PORT_FEAT_POWER: usbip_dbg_vhci_rh( " ClearPortFeature: USB_PORT_FEAT_POWER\n"); - dum->port_status[rhport] = 0; + dum->port_status[rhport] &= ~USB_PORT_STAT_POWER; dum->resuming = 0; break; case USB_PORT_FEAT_C_RESET: From 5846849a1ac75c4a4898f09f3835ac737bb888d5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 4 Jan 2018 15:58:34 -0200 Subject: [PATCH 153/247] serial: imx: Only wakeup via RTSDEN bit if the system has RTS/CTS commit 38b1f0fb42f772b8c9aac53593883a18ff5eb9d7 upstream. The wakeup mechanism via RTSDEN bit relies on the system using the RTS/CTS lines, so only allow such wakeup method when the system actually has RTS/CTS support. Fixes: bc85734b126f ("serial: imx: allow waking up on RTSD") Signed-off-by: Fabio Estevam Reviewed-by: Martin Kaiser Acked-by: Fugang Duan Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index a70356dad1b75..521a6e4507559 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -2239,12 +2239,14 @@ static void serial_imx_enable_wakeup(struct imx_port *sport, bool on) val &= ~UCR3_AWAKEN; writel(val, sport->port.membase + UCR3); - val = readl(sport->port.membase + UCR1); - if (on) - val |= UCR1_RTSDEN; - else - val &= ~UCR1_RTSDEN; - writel(val, sport->port.membase + UCR1); + if (sport->have_rtscts) { + val = readl(sport->port.membase + UCR1); + if (on) + val |= UCR1_RTSDEN; + else + val &= ~UCR1_RTSDEN; + writel(val, sport->port.membase + UCR1); + } } static int imx_serial_port_suspend_noirq(struct device *dev) From 1333c3e996eb799286ee2ef2c01752da45bf926f Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sun, 7 Jan 2018 15:05:49 +0100 Subject: [PATCH 154/247] spi: imx: do not access registers while clocks disabled commit d593574aff0ab846136190b1729c151c736727ec upstream. Since clocks are disabled except during message transfer clocks are also disabled when spi_imx_remove gets called. Accessing registers leads to a freeeze at least on a i.MX 6ULL. Enable clocks before disabling accessing the MXC_CSPICTRL register. Fixes: 9e556dcc55774 ("spi: spi-imx: only enable the clocks when we start to transfer a message") Signed-off-by: Stefan Agner Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-imx.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index deb782f6556c1..a6e34f05d44d4 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1307,12 +1307,23 @@ static int spi_imx_remove(struct platform_device *pdev) { struct spi_master *master = platform_get_drvdata(pdev); struct spi_imx_data *spi_imx = spi_master_get_devdata(master); + int ret; spi_bitbang_stop(&spi_imx->bitbang); + ret = clk_enable(spi_imx->clk_per); + if (ret) + return ret; + + ret = clk_enable(spi_imx->clk_ipg); + if (ret) { + clk_disable(spi_imx->clk_per); + return ret; + } + writel(0, spi_imx->base + MXC_CSPICTRL); - clk_unprepare(spi_imx->clk_ipg); - clk_unprepare(spi_imx->clk_per); + clk_disable_unprepare(spi_imx->clk_ipg); + clk_disable_unprepare(spi_imx->clk_per); spi_imx_sdma_exit(spi_imx); spi_master_put(master); From 331b057d4f3ccf2290e6e651b5728db81e9249c6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 3 Feb 2018 17:05:43 +0100 Subject: [PATCH 155/247] Linux 4.9.80 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4a7e6dff1c2e7..9550b6939076d 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 79 +SUBLEVEL = 80 EXTRAVERSION = NAME = Roaring Lionus From be6641a7e6f79b69446e7f1c44bab75bf20f1665 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 9 Jan 2018 03:52:05 +1100 Subject: [PATCH 156/247] powerpc/pseries: Add H_GET_CPU_CHARACTERISTICS flags & wrapper commit 191eccb1580939fb0d47deb405b82a85b0379070 upstream. A new hypervisor call has been defined to communicate various characteristics of the CPU to guests. Add definitions for the hcall number, flags and a wrapper function. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman [Balbir fixed conflicts in backport] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/hvcall.h | 17 +++++++++++++++++ arch/powerpc/include/asm/plpar_wrappers.h | 14 ++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 708edebcf1474..0e12cb2437d1b 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -240,6 +240,7 @@ #define H_GET_HCA_INFO 0x1B8 #define H_GET_PERF_COUNT 0x1BC #define H_MANAGE_TRACE 0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4 #define H_QUERY_INT_STATE 0x1E4 #define H_POLL_PENDING 0x1D8 @@ -306,6 +307,17 @@ #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE 3 #define H_SET_MODE_RESOURCE_LE 4 +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_CPU_CHAR_SPEC_BAR_ORI31 (1ull << 63) // IBM bit 0 +#define H_CPU_CHAR_BCCTRL_SERIALISED (1ull << 62) // IBM bit 1 +#define H_CPU_CHAR_L1D_FLUSH_ORI30 (1ull << 61) // IBM bit 2 +#define H_CPU_CHAR_L1D_FLUSH_TRIG2 (1ull << 60) // IBM bit 3 +#define H_CPU_CHAR_L1D_THREAD_PRIV (1ull << 59) // IBM bit 4 + +#define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 +#define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 +#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 + #ifndef __ASSEMBLY__ /** @@ -433,6 +445,11 @@ static inline unsigned long cmo_get_page_size(void) } #endif /* CONFIG_PPC_PSERIES */ +struct h_cpu_char_result { + u64 character; + u64 behaviour; +}; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 1b394247afc2d..4e53b8570d1f1 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -340,4 +340,18 @@ static inline long plapr_set_watchpoint0(unsigned long dawr0, unsigned long dawr return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0); } +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); + if (rc == H_SUCCESS) { + p->character = retbuf[0]; + p->behaviour = retbuf[1]; + } + + return rc; +} + #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ From 8fd3f98d0f4d5a9044b900de950ab02164968d27 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 157/247] powerpc/64: Add macros for annotating the destination of rfid/hrfid commit 50e51c13b3822d14ff6df4279423e4b7b2269bc3 upstream. The rfid/hrfid ((Hypervisor) Return From Interrupt) instruction is used for switching from the kernel to userspace, and from the hypervisor to the guest kernel. However it can and is also used for other transitions, eg. from real mode kernel code to virtual mode kernel code, and it's not always clear from the code what the destination context is. To make it clearer when reading the code, add macros which encode the expected destination context. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/exception-64e.h | 6 +++++ arch/powerpc/include/asm/exception-64s.h | 29 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a703452d67b62..555e22d5e07f9 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ exc_##label##_book3e: ori r3,r3,vector_offset@l; \ mtspr SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL \ + rfi + +#define RFI_TO_USER \ + rfi + #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a3eee6612976..6771cbe445941 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -51,6 +51,35 @@ #define EX_PPR 88 /* SMT thread status register (priority) */ #define EX_CTR 96 +/* Macros for annotating the expected destination of (h)rfid */ + +#define RFI_TO_KERNEL \ + rfid + +#define RFI_TO_USER \ + rfid + +#define RFI_TO_USER_OR_KERNEL \ + rfid + +#define RFI_TO_GUEST \ + rfid + +#define HRFI_TO_KERNEL \ + hrfid + +#define HRFI_TO_USER \ + hrfid + +#define HRFI_TO_USER_OR_KERNEL \ + hrfid + +#define HRFI_TO_GUEST \ + hrfid + +#define HRFI_TO_UNKNOWN \ + hrfid + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ From 9d914324d966497f4d40cfa9333cbe55150cc09b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 158/247] powerpc/64: Convert fast_exception_return to use RFI_TO_USER/KERNEL commit a08f828cf47e6c605af21d2cdec68f84e799c318 upstream. Similar to the syscall return path, in fast_exception_return we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/entry_64.S | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index caa6596715990..d267d3bcde90e 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -859,7 +859,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) -1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -872,8 +872,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r3,GPR3(r1) ld r4,GPR4(r1) ld r1,GPR1(r1) + RFI_TO_USER + b . /* prevent speculative execution */ - rfid +1: mtspr SPRN_SRR1,r3 + + ld r2,_CCR(r1) + mtcrf 0xFF,r2 + ld r2,_NIP(r1) + mtspr SPRN_SRR0,r2 + + ld r0,GPR0(r1) + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r4,GPR4(r1) + ld r1,GPR1(r1) + RFI_TO_KERNEL b . /* prevent speculative execution */ #endif /* CONFIG_PPC_BOOK3E */ From 00e40620a51ebee4ea002ec2efcd64f1960cb964 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 159/247] powerpc/64: Convert the syscall exit path to use RFI_TO_USER/KERNEL commit b8e90cb7bc04a509e821e82ab6ed7a8ef11ba333 upstream. In the syscall exit path we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/entry_64.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index d267d3bcde90e..c33b69d109198 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -251,13 +251,23 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + ld r2,GPR2(r1) + ld r1,GPR1(r1) + mtlr r4 + mtcr r5 + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r8 + RFI_TO_USER + b . /* prevent speculative execution */ + + /* exit to kernel */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ syscall_error: From 48cc95d4e4d6a1265b7f728182d6dc62de849b05 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 160/247] powerpc/64s: Convert slb_miss_common to use RFI_TO_USER/KERNEL commit c7305645eb0c1621351cfc104038831ae87c0053 upstream. In the SLB miss handler we may be returning to user or kernel. We need to add a check early on and save the result in the cr4 register, and then we bifurcate the return path based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Nicholas Piggin [mpe: Backport to 4.4 based on patch from Balbir] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/exceptions-64s.S | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fd68e19b9ef7a..fc72f81411c4e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -655,6 +655,8 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) andi. r10,r12,MSR_RI /* check for unrecoverable exception */ beq- 2f + andi. r10,r12,MSR_PR /* check for user mode (PR != 0) */ + bne 1f /* All done -- return from exception. */ @@ -671,7 +673,23 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) ld r11,PACA_EXSLB+EX_R11(r13) ld r12,PACA_EXSLB+EX_R12(r13) ld r13,PACA_EXSLB+EX_R13(r13) - rfid + RFI_TO_KERNEL + b . /* prevent speculative execution */ + +1: +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_PPR_PACA(PACA_EXSLB, r9) + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_USER b . /* prevent speculative execution */ 2: mfspr r11,SPRN_SRR0 @@ -679,7 +697,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 8: mfspr r11,SPRN_SRR0 From c3b82ebee6e0d92431c92ee80393c023d550c8a1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 161/247] powerpc/64s: Add support for RFI flush of L1-D cache commit aa8a5e0062ac940f7659394f4817c948dc8c0667 upstream. On some CPUs we can prevent the Meltdown vulnerability by flushing the L1-D cache on exit from kernel to user mode, and from hypervisor to guest. This is known to be the case on at least Power7, Power8 and Power9. At this time we do not know the status of the vulnerability on other CPUs such as the 970 (Apple G5), pasemi CPUs (AmigaOne X1000) or Freescale CPUs. As more information comes to light we can enable this, or other mechanisms on those CPUs. The vulnerability occurs when the load of an architecturally inaccessible memory region (eg. userspace load of kernel memory) is speculatively executed to the point where its result can influence the address of a subsequent speculatively executed load. In order for that to happen, the first load must hit in the L1, because before the load is sent to the L2 the permission check is performed. Therefore if no kernel addresses hit in the L1 the vulnerability can not occur. We can ensure that is the case by flushing the L1 whenever we return to userspace. Similarly for hypervisor vs guest. In order to flush the L1-D cache on exit, we add a section of nops at each (h)rfi location that returns to a lower privileged context, and patch that with some sequence. Newer firmwares are able to advertise to us that there is a special nop instruction that flushes the L1-D. If we do not see that advertised, we fall back to doing a displacement flush in software. For guest kernels we support migration between some CPU versions, and different CPUs may use different flush instructions. So that we are prepared to migrate to a machine with a different flush instruction activated, we may have to patch more than one flush instruction at boot if the hypervisor tells us to. In the end this patch is mostly the work of Nicholas Piggin and Michael Ellerman. However a cast of thousands contributed to analysis of the issue, earlier versions of the patch, back ports testing etc. Many thanks to all of them. Tested-by: Jon Masters Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman [Balbir - back ported to stable with changes] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/exception-64s.h | 40 ++++++++--- arch/powerpc/include/asm/feature-fixups.h | 15 ++++ arch/powerpc/include/asm/paca.h | 10 +++ arch/powerpc/include/asm/setup.h | 13 ++++ arch/powerpc/kernel/asm-offsets.c | 4 ++ arch/powerpc/kernel/exceptions-64s.S | 86 +++++++++++++++++++++++ arch/powerpc/kernel/setup_64.c | 79 +++++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 9 +++ arch/powerpc/lib/feature-fixups.c | 42 +++++++++++ 9 files changed, 290 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 6771cbe445941..cab6d2a46c415 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -51,34 +51,58 @@ #define EX_PPR 88 /* SMT thread status register (priority) */ #define EX_CTR 96 -/* Macros for annotating the expected destination of (h)rfid */ +/* + * Macros for annotating the expected destination of (h)rfid + * + * The nop instructions allow us to insert one or more instructions to flush the + * L1-D cache when returning to userspace or a guest. + */ +#define RFI_FLUSH_SLOT \ + RFI_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop #define RFI_TO_KERNEL \ rfid #define RFI_TO_USER \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_USER_OR_KERNEL \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_GUEST \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define HRFI_TO_KERNEL \ hrfid #define HRFI_TO_USER \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_USER_OR_KERNEL \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_GUEST \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_UNKNOWN \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index ddf54f5bbdd1c..7b332342071c2 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -189,4 +189,19 @@ void apply_feature_fixups(void); void setup_feature_keys(void); #endif +#define RFI_FLUSH_FIXUP_SECTION \ +951: \ + .pushsection __rfi_flush_fixup,"a"; \ + .align 2; \ +952: \ + FTR_ENTRY_OFFSET 951b-952b; \ + .popsection; + + +#ifndef __ASSEMBLY__ + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; + +#endif + #endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6a6792bb39fbc..ea43897183fda 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -205,6 +205,16 @@ struct paca_struct { struct sibling_subcore_state *sibling_subcore_state; #endif #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * rfi fallback flush must be in its own cacheline to prevent + * other paca data leaking into the L1d + */ + u64 exrfi[13] __aligned(0x80); + void *rfi_flush_fallback_area; + u64 l1d_flush_congruence; + u64 l1d_flush_sets; +#endif }; #ifdef CONFIG_PPC_BOOK3S diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 654d64c9f3acd..6825a67cc3db3 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -38,6 +38,19 @@ static inline void pseries_big_endian_exceptions(void) {} static inline void pseries_little_endian_exceptions(void) {} #endif /* CONFIG_PPC_PSERIES */ +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { + L1D_FLUSH_NONE = 0x1, + L1D_FLUSH_FALLBACK = 0x2, + L1D_FLUSH_ORI = 0x4, + L1D_FLUSH_MTTRIG = 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c833d88c423d8..64bcbd5804950 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -240,6 +240,10 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp)); DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce)); + DEFINE(PACA_RFI_FLUSH_FALLBACK_AREA, offsetof(struct paca_struct, rfi_flush_fallback_area)); + DEFINE(PACA_EXRFI, offsetof(struct paca_struct, exrfi)); + DEFINE(PACA_L1D_FLUSH_CONGRUENCE, offsetof(struct paca_struct, l1d_flush_congruence)); + DEFINE(PACA_L1D_FLUSH_SETS, offsetof(struct paca_struct, l1d_flush_sets)); #endif DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fc72f81411c4e..96db6c3adebe0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1594,6 +1594,92 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) bl kernel_bad_stack b 1b + .globl rfi_flush_fallback +rfi_flush_fallback: + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + rfid + + .globl hrfi_flush_fallback +hrfi_flush_fallback: + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + hrfid + /* * Called from arch_local_irq_enable when an interrupt needs * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a12be60181bf8..849d086288c6c 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -678,4 +678,83 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +bool rfi_flush; + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (rfi_flush == enable) + return; + + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + l1d_size = ppc64_caches.dsize; + limit = min(safe_stack_limit(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); + memset(l1d_flush_fallback_area, 0, l1d_size * 2); + + for_each_possible_cpu(cpu) { + /* + * The fallback flush is currently coded for 8-way + * associativity. Different associativity is possible, but it + * will be treated as 8-way and may not evict the lines as + * effectively. + * + * 128 byte lines are mandatory. + */ + u64 c = l1d_size / 8; + + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; + paca[cpu].l1d_flush_congruence = c; + paca[cpu].l1d_flush_sets = c / 128; + } +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: Using fallback displacement flush\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: Using ori type flush\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: Using mttrig type flush\n"); + + enabled_flush_types = types; + + rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ #endif diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7394b770ae1f6..b61fb7902018e 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -132,6 +132,15 @@ SECTIONS /* Read-only data */ RODATA +#ifdef CONFIG_PPC64 + . = ALIGN(8); + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { + __start___rfi_flush_fixup = .; + *(__rfi_flush_fixup) + __stop___rfi_flush_fixup = .; + } +#endif + EXCEPTION_TABLE(0) NOTES :kernel :notes diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 043415f0bdb16..e86bfa111f3c9 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -23,6 +23,7 @@ #include #include #include +#include struct fixup_entry { unsigned long mask; @@ -115,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[3], *dest; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup), + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = 0x60000000; /* nop */ + instrs[1] = 0x60000000; /* nop */ + instrs[2] = 0x60000000; /* nop */ + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = 0x48000010; + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction(dest, instrs[0]); + patch_instruction(dest + 1, instrs[1]); + patch_instruction(dest + 2, instrs[2]); + } + + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; From 0ef9f8289edf1d335fb2bd3c162521528823b585 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 162/247] powerpc/64s: Support disabling RFI flush with no_rfi_flush and nopti commit bc9c9304a45480797e13a8e1df96ffcf44fb62fe upstream. Because there may be some performance overhead of the RFI flush, add kernel command line options to disable it. We add a sensibly named 'no_rfi_flush' option, but we also hijack the x86 option 'nopti'. The RFI flush is not the same as KPTI, but if we see 'nopti' we can guess that the user is trying to avoid any overhead of Meltdown mitigations, and it means we don't have to educate every one about a different command line option. Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/setup_64.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 849d086288c6c..e728bb437fbdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -682,8 +682,29 @@ early_initcall(disable_hardlockup_detector); #ifdef CONFIG_PPC_BOOK3S_64 static enum l1d_flush_type enabled_flush_types; static void *l1d_flush_fallback_area; +static bool no_rfi_flush; bool rfi_flush; +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + static void do_nothing(void *unused) { /* @@ -754,7 +775,8 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - rfi_flush_enable(enable); + if (!no_rfi_flush) + rfi_flush_enable(enable); } #endif /* CONFIG_PPC_BOOK3S_64 */ #endif From 7db0fff62f52c3f23c39262b9e037d8b43dfc88d Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 163/247] powerpc/pseries: Query hypervisor for RFI flush settings commit 8989d56878a7735dfdb234707a2fee6faf631085 upstream. A new hypervisor call is available which tells the guest settings related to the RFI flush. Use it to query the appropriate flush instruction(s), and whether the flush is required. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/setup.c | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 97aa3f332f240..1845fc6119120 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -450,6 +450,39 @@ static void __init find_and_init_phbs(void) of_pci_check_probe_only(); } +static void pseries_setup_rfi_flush(void) +{ + struct h_cpu_char_result result; + enum l1d_flush_type types; + bool enable; + long rc; + + /* Enable by default */ + enable = true; + + rc = plpar_get_cpu_characteristics(&result); + if (rc == H_SUCCESS) { + types = L1D_FLUSH_NONE; + + if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2) + types |= L1D_FLUSH_MTTRIG; + if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30) + types |= L1D_FLUSH_ORI; + + /* Use fallback if nothing set in hcall */ + if (types == L1D_FLUSH_NONE) + types = L1D_FLUSH_FALLBACK; + + if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) + enable = false; + } else { + /* Default to fallback if case hcall is not available */ + types = L1D_FLUSH_FALLBACK; + } + + setup_rfi_flush(types, enable); +} + static void __init pSeries_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -467,6 +500,8 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + pseries_setup_rfi_flush(); + /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); From 6aec12e1869e31839f317c02f81a92c393222f71 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: [PATCH 164/247] powerpc/powernv: Check device-tree for RFI flush settings commit 6e032b350cd1fdb830f18f8320ef0e13b4e24094 upstream. New device-tree properties are available which tell the hypervisor settings related to the RFI flush. Use them to determine the appropriate flush instruction to use, and whether the flush is required. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/setup.c | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index b33faa0015cca..6f8b4c19373af 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -35,13 +35,63 @@ #include #include #include +#include +#include #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ + struct device_node *np, *fw_features; + enum l1d_flush_type type; + int enable; + + /* Default to fallback in case fw-features are not available */ + type = L1D_FLUSH_FALLBACK; + enable = 1; + + np = of_find_node_by_name(NULL, "ibm,opal"); + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + + if (fw_features) { + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_MTTRIG; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_ORI; + + of_node_put(np); + + /* Enable unless firmware says NOT to */ + enable = 2; + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + of_node_put(fw_features); + } + + setup_rfi_flush(type, enable > 0); +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + pnv_setup_rfi_flush(); + /* Initialize SMP */ pnv_smp_init(); From 1f0c936f431d98611fff5ef7082380f087da1578 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Jan 2018 21:20:05 +1100 Subject: [PATCH 165/247] powerpc/64s: Wire up cpu_show_meltdown() commit fd6e440f20b1a4304553775fc55938848ff617c9 upstream. The recent commit 87590ce6e373 ("sysfs/cpu: Add vulnerability folder") added a generic folder and set of files for reporting information on CPU vulnerabilities. One of those was for meltdown: /sys/devices/system/cpu/vulnerabilities/meltdown This commit wires up that file for 64-bit Book3S powerpc. For now we default to "Vulnerable" unless the RFI flush is enabled. That may not actually be true on all hardware, further patches will refine the reporting based on the CPU/platform etc. But for now we default to being pessimists. Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6eda5abbd719e..0a6bb48854e3e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -128,6 +128,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE + select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64 select GENERIC_TIME_VSYSCALL_OLD select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e728bb437fbdb..79136dc981be5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -778,5 +778,13 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) if (!no_rfi_flush) rfi_flush_enable(enable); } + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (rfi_flush) + return sprintf(buf, "Mitigation: RFI Flush\n"); + + return sprintf(buf, "Vulnerable\n"); +} #endif /* CONFIG_PPC_BOOK3S_64 */ #endif From 9fed3978c39b27b27eeb676301b7ba960a74170c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Jan 2018 22:17:18 +1100 Subject: [PATCH 166/247] powerpc/64s: Allow control of RFI flush via debugfs commit 236003e6b5443c45c18e613d2b0d776a9f87540e upstream. Expose the state of the RFI flush (enabled/disabled) via debugfs, and allow it to be enabled/disabled at runtime. eg: $ cat /sys/kernel/debug/powerpc/rfi_flush 1 $ echo 0 > /sys/kernel/debug/powerpc/rfi_flush $ cat /sys/kernel/debug/powerpc/rfi_flush 0 Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/setup_64.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 79136dc981be5..7c30a91c1f868 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -779,6 +780,35 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) rfi_flush_enable(enable); } +#ifdef CONFIG_DEBUG_FS +static int rfi_flush_set(void *data, u64 val) +{ + if (val == 1) + rfi_flush_enable(true); + else if (val == 0) + rfi_flush_enable(false); + else + return -EINVAL; + + return 0; +} + +static int rfi_flush_get(void *data, u64 *val) +{ + *val = rfi_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); + +static __init int rfi_flush_debugfs_init(void) +{ + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); + return 0; +} +device_initcall(rfi_flush_debugfs_init); +#endif + ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { if (rfi_flush) From 781a2d683110864fc136d85df7a2fb3fc145e5e8 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Wed, 10 Jan 2018 17:41:10 +0100 Subject: [PATCH 167/247] auxdisplay: img-ascii-lcd: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit 09c479f7f1fbfaf848e5813996793966cd50be81 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/auxdisplay/img-ascii-lcd.o see include/linux/module.h for more information This adds the license as "GPL", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/auxdisplay/img-ascii-lcd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c index 83f1439e57fd8..6e8eaa7fe7a6f 100644 --- a/drivers/auxdisplay/img-ascii-lcd.c +++ b/drivers/auxdisplay/img-ascii-lcd.c @@ -442,3 +442,7 @@ static struct platform_driver img_ascii_lcd_driver = { .remove = img_ascii_lcd_remove, }; module_platform_driver(img_ascii_lcd_driver); + +MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display"); +MODULE_AUTHOR("Paul Burton "); +MODULE_LICENSE("GPL"); From 0ee4f5e7bbffc07d98cb7626446175700ef5fcce Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 20 Nov 2017 12:58:03 -0800 Subject: [PATCH 168/247] pinctrl: pxa: pxa2xx: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit 0b9335cbd38e3bd2025bcc23b5758df4ac035f75 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/pinctrl/pxa/pinctrl-pxa2xx.o see include/linux/module.h for more information This adds the license as "GPL v2", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pxa/pinctrl-pxa2xx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c index 866aa3ce1ac95..6cf0006d4c8df 100644 --- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c +++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c @@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev) return 0; } EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit); + +MODULE_AUTHOR("Robert Jarzmik "); +MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver"); +MODULE_LICENSE("GPL v2"); From 374c84de94af798cad81b5e09b81e901966a4eb0 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Sun, 19 Nov 2017 23:45:49 -0800 Subject: [PATCH 169/247] ASoC: pcm512x: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit 0cab20cec0b663b7be8e2be5998d5a4113647f86 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in sound/soc/codecs/snd-soc-pcm512x-spi.o see include/linux/module.h for more information This adds the license as "GPL v2", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/pcm512x-spi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c index 712ed6598c482..ebdf9bd5a64c5 100644 --- a/sound/soc/codecs/pcm512x-spi.c +++ b/sound/soc/codecs/pcm512x-spi.c @@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = { }; module_spi_driver(pcm512x_spi_driver); + +MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI"); +MODULE_AUTHOR("Mark Brown "); +MODULE_LICENSE("GPL v2"); From 0a61cd6caed72ecac8cbb637f6498e17a33d73b1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 29 Jan 2018 18:16:55 -0800 Subject: [PATCH 170/247] kaiser: fix intel_bts perf crashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vince reported perf_fuzzer quickly locks up on 4.15-rc7 with PTI; Robert reported Bad RIP with KPTI and Intel BTS also on 4.15-rc7: honggfuzz -f /tmp/somedirectorywithatleastonefile \ --linux_perf_bts_edge -s -- /bin/true (honggfuzz from https://github.com/google/honggfuzz) crashed with BUG: unable to handle kernel paging request at ffff9d3215100000 (then narrowed it down to perf record --per-thread -e intel_bts//u -- /bin/ls). The intel_bts driver does not use the 'normal' BTS buffer which is exposed through kaiser_add_mapping(), but instead uses the memory allocated for the perf AUX buffer. This obviously comes apart when using PTI, because then the kernel mapping, which includes that AUX buffer memory, disappears while switched to user page tables. Easily fixed in old-Kaiser backports, by applying kaiser_add_mapping() to those pages; perhaps not so easy for upstream, where 4.15-rc8 commit 99a9dc98ba52 ("x86,perf: Disable intel_bts when PTI") disables for now. Slightly reorganized surrounding code in bts_buffer_setup_aux(), so it can better match bts_buffer_free_aux(): free_aux with an #ifdef to avoid the loop when PTI is off, but setup_aux needs to loop anyway (and kaiser_add_mapping() is cheap when PTI config is off or "pti=off"). Reported-by: Vince Weaver Reported-by: Robert Święcki Analyzed-by: Peter Zijlstra Analyzed-by: Stephane Eranian Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Vince Weaver Cc: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/intel/bts.c | 44 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 982c9e31daca4..21298c173b0e0 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,23 @@ static size_t buf_size(struct page *page) return 1 << (PAGE_SHIFT + page_private(page)); } +static void bts_buffer_free_aux(void *data) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct bts_buffer *buf = data; + int nbuf; + + for (nbuf = 0; nbuf < buf->nr_bufs; nbuf++) { + struct page *page = buf->buf[nbuf].page; + void *kaddr = page_address(page); + size_t page_size = buf_size(page); + + kaiser_remove_mapping((unsigned long)kaddr, page_size); + } +#endif + kfree(data); +} + static void * bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) { @@ -113,29 +131,33 @@ bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) buf->real_size = size - size % BTS_RECORD_SIZE; for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { - unsigned int __nr_pages; + void *kaddr = pages[pg]; + size_t page_size; + + page = virt_to_page(kaddr); + page_size = buf_size(page); + + if (kaiser_add_mapping((unsigned long)kaddr, + page_size, __PAGE_KERNEL) < 0) { + buf->nr_bufs = nbuf; + bts_buffer_free_aux(buf); + return NULL; + } - page = virt_to_page(pages[pg]); - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; buf->buf[nbuf].page = page; buf->buf[nbuf].offset = offset; buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; + buf->buf[nbuf].size = page_size - buf->buf[nbuf].displacement; pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; buf->buf[nbuf].size -= pad; - pg += __nr_pages; - offset += __nr_pages << PAGE_SHIFT; + pg += page_size >> PAGE_SHIFT; + offset += page_size; } return buf; } -static void bts_buffer_free_aux(void *data) -{ - kfree(data); -} - static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) { return buf->buf[idx].offset + buf->buf[idx].displacement; From ae1fc8de51b10dba40cbd54959b5ba0a311c0861 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 29 Jan 2018 18:17:26 -0800 Subject: [PATCH 171/247] x86/pti: Make unpoison of pgd for trusted boot work for real commit 445b69e3b75e42362a5bdc13c8b8f61599e2228a upstream The inital fix for trusted boot and PTI potentially misses the pgd clearing if pud_alloc() sets a PGD. It probably works in *practice* because for two adjacent calls to map_tboot_page() that share a PGD entry, the first will clear NX, *then* allocate and set the PGD (without NX clear). The second call will *not* allocate but will clear the NX bit. Defer the NX clearing to a point after it is known that all top-level allocations have occurred. Add a comment to clarify why. [ tglx: Massaged changelog ] [ hughd notes: I have not tested tboot, but this looks to me as necessary and as safe in old-Kaiser backports as it is upstream; I'm not submitting the commit-to-be-fixed 262b6b30087, since it was undone by 445b69e3b75e, and makes conflict trouble because of 5-level's p4d versus 4-level's pgd.] Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Andrea Arcangeli Cc: Jon Masters Cc: Tim Chen Cc: gnomes@lxorguk.ukuu.org.uk Cc: peterz@infradead.org Cc: ning.sun@intel.com Cc: tboot-devel@lists.sourceforge.net Cc: andi@firstfloor.org Cc: luto@kernel.org Cc: law@redhat.com Cc: pbonzini@redhat.com Cc: torvalds@linux-foundation.org Cc: gregkh@linux-foundation.org Cc: dwmw@amazon.co.uk Cc: nickc@redhat.com Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com Cc: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/tboot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 8402907825b02..21454e254a4cc 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -134,6 +134,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in pud_alloc(). + */ + pgd->pgd &= ~_PAGE_NX; + return 0; } From 400d3c8b0c7f6c942cd9bf201da1f32da9bdcc9f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 29 Jan 2018 18:17:58 -0800 Subject: [PATCH 172/247] kaiser: allocate pgd with order 0 when pti=off The 4.9.77 version of "x86/pti/efi: broken conversion from efi to kernel page table" looked nicer than the 4.4.112 version, but was suboptimal on machines booted with "pti=off" (or on AMD machines): it allocated pgd with an order 1 page whatever the setting of kaiser_enabled. Fix that by moving the definition of PGD_ALLOCATION_ORDER from asm/pgalloc.h to asm/pgtable.h, which already defines kaiser_enabled. Fixes: 1b92c48a2eeb ("x86/pti/efi: broken conversion from efi to kernel page table") Reviewed-by: Pavel Tatashin Cc: Steven Sistare Cc: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pgalloc.h | 11 ----------- arch/x86/include/asm/pgtable.h | 6 ++++++ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 1178a51b77f31..b6d425999f99d 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -27,17 +27,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {} */ extern gfp_t __userpte_alloc_gfp; -#ifdef CONFIG_PAGE_TABLE_ISOLATION -/* - * Instead of one PGD, we acquire two PGDs. Being order-1, it is - * both 8k in size and 8k-aligned. That lets us just flip bit 12 - * in a pointer to swap between the two 4k halves. - */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif - /* * Allocate and free page tables. */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2536f90cd30c5..5af0401ccff2c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -20,9 +20,15 @@ #ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; +/* + * Instead of one PGD, we acquire two PGDs. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ #else #define kaiser_enabled 0 #endif +#define PGD_ALLOCATION_ORDER kaiser_enabled void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); From ffcf167d348ee5e8ddbe2c4116b71ebfff2258e0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 11 Jan 2018 18:57:26 +0100 Subject: [PATCH 173/247] serial: core: mark port as initialized after successful IRQ change commit 44117a1d1732c513875d5a163f10d9adbe866c08 upstream. setserial changes the IRQ via uart_set_info(). It invokes uart_shutdown() which free the current used IRQ and clear TTY_PORT_INITIALIZED. It will then update the IRQ number and invoke uart_startup() before returning to the caller leaving TTY_PORT_INITIALIZED cleared. The next open will crash with | list_add double add: new=ffffffff839fcc98, prev=ffffffff839fcc98, next=ffffffff839fcc98. since the close from the IOCTL won't free the IRQ (and clean the list) due to the TTY_PORT_INITIALIZED check in uart_shutdown(). There is same pattern in uart_do_autoconfig() and I *think* it also needs to set TTY_PORT_INITIALIZED there. Is there a reason why uart_startup() does not set the flag by itself after the IRQ has been acquired (since it is cleared in uart_shutdown)? Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index f2303f390345e..23973a8124fcc 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -965,6 +965,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, } } else { retval = uart_startup(tty, state, 1); + if (retval == 0) + tty_port_set_initialized(port, true); if (retval > 0) retval = 0; } From 7d3d60ef2256f5fcfd4929963dd67f58028ebeee Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 31 Jan 2018 16:29:30 +0200 Subject: [PATCH 174/247] ip6mr: fix stale iterator [ Upstream commit 4adfa79fc254efb7b0eb3cd58f62c2c3f805f1ba ] When we dump the ip6mr mfc entries via proc, we initialize an iterator with the table to dump but we don't clear the cache pointer which might be initialized from a prior read on the same descriptor that ended. This can result in lock imbalance (an unnecessary unlock) leading to other crashes and hangs. Clear the cache pointer like ipmr does to fix the issue. Thanks for the reliable reproducer. Here's syzbot's trace: WARNING: bad unlock balance detected! 4.15.0-rc3+ #128 Not tainted syzkaller971460/3195 is trying to release lock (mrt_lock) at: [<000000006898068d>] ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553 but there are no more locks to release! other info that might help us debug this: 1 lock held by syzkaller971460/3195: #0: (&p->lock){+.+.}, at: [<00000000744a6565>] seq_read+0xd5/0x13d0 fs/seq_file.c:165 stack backtrace: CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_unlock_imbalance_bug+0x12f/0x140 kernel/locking/lockdep.c:3561 __lock_release kernel/locking/lockdep.c:3775 [inline] lock_release+0x5f9/0xda0 kernel/locking/lockdep.c:4023 __raw_read_unlock include/linux/rwlock_api_smp.h:225 [inline] _raw_read_unlock+0x1a/0x30 kernel/locking/spinlock.c:255 ipmr_mfc_seq_stop+0xe1/0x130 net/ipv6/ip6mr.c:553 traverse+0x3bc/0xa00 fs/seq_file.c:135 seq_read+0x96a/0x13d0 fs/seq_file.c:189 proc_reg_read+0xef/0x170 fs/proc/inode.c:217 do_loop_readv_writev fs/read_write.c:673 [inline] do_iter_read+0x3db/0x5b0 fs/read_write.c:897 compat_readv+0x1bf/0x270 fs/read_write.c:1140 do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189 C_SYSC_preadv fs/read_write.c:1209 [inline] compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125 RIP: 0023:0xf7f73c79 RSP: 002b:00000000e574a15c EFLAGS: 00000292 ORIG_RAX: 000000000000014d RAX: ffffffffffffffda RBX: 000000000000000f RCX: 0000000020a3afb0 RDX: 0000000000000001 RSI: 0000000000000067 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 BUG: sleeping function called from invalid context at lib/usercopy.c:25 in_atomic(): 1, irqs_disabled(): 0, pid: 3195, name: syzkaller971460 INFO: lockdep is turned off. CPU: 1 PID: 3195 Comm: syzkaller971460 Not tainted 4.15.0-rc3+ #128 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6060 __might_sleep+0x95/0x190 kernel/sched/core.c:6013 __might_fault+0xab/0x1d0 mm/memory.c:4525 _copy_to_user+0x2c/0xc0 lib/usercopy.c:25 copy_to_user include/linux/uaccess.h:155 [inline] seq_read+0xcb4/0x13d0 fs/seq_file.c:279 proc_reg_read+0xef/0x170 fs/proc/inode.c:217 do_loop_readv_writev fs/read_write.c:673 [inline] do_iter_read+0x3db/0x5b0 fs/read_write.c:897 compat_readv+0x1bf/0x270 fs/read_write.c:1140 do_compat_preadv64+0xdc/0x100 fs/read_write.c:1189 C_SYSC_preadv fs/read_write.c:1209 [inline] compat_SyS_preadv+0x3b/0x50 fs/read_write.c:1203 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x51/0x60 arch/x86/entry/entry_64_compat.S:125 RIP: 0023:0xf7f73c79 RSP: 002b:00000000e574a15c EFLAGS: 00000292 ORIG_RAX: 000000000000014d RAX: ffffffffffffffda RBX: 000000000000000f RCX: 0000000020a3afb0 RDX: 0000000000000001 RSI: 0000000000000067 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 WARNING: CPU: 1 PID: 3195 at lib/usercopy.c:26 _copy_to_user+0xb5/0xc0 lib/usercopy.c:26 Reported-by: syzbot Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6mr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 117405dd07a3c..a30e7e925c9b7 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -495,6 +495,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) return ERR_PTR(-ENOENT); it->mrt = mrt; + it->cache = NULL; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } From ce43c07fcef8cf5cdbc7a45db7a6b4a1a7473905 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 1 Feb 2018 10:26:57 -0800 Subject: [PATCH 175/247] net: igmp: add a missing rcu locking section [ Upstream commit e7aadb27a5415e8125834b84a74477bfbee4eff5 ] Newly added igmpv3_get_srcaddr() needs to be called under rcu lock. Timer callbacks do not ensure this locking. ============================= WARNING: suspicious RCU usage 4.15.0+ #200 Not tainted ----------------------------- ./include/linux/inetdevice.h:216 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by syzkaller616973/4074: #0: (&mm->mmap_sem){++++}, at: [<00000000bfce669e>] __do_page_fault+0x32d/0xc90 arch/x86/mm/fault.c:1355 #1: ((&im->timer)){+.-.}, at: [<00000000619d2f71>] lockdep_copy_map include/linux/lockdep.h:178 [inline] #1: ((&im->timer)){+.-.}, at: [<00000000619d2f71>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1316 #2: (&(&im->lock)->rlock){+.-.}, at: [<000000005f833c5c>] spin_lock_bh include/linux/spinlock.h:315 [inline] #2: (&(&im->lock)->rlock){+.-.}, at: [<000000005f833c5c>] igmpv3_send_report+0x98/0x5b0 net/ipv4/igmp.c:600 stack backtrace: CPU: 0 PID: 4074 Comm: syzkaller616973 Not tainted 4.15.0+ #200 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4592 __in_dev_get_rcu include/linux/inetdevice.h:216 [inline] igmpv3_get_srcaddr net/ipv4/igmp.c:329 [inline] igmpv3_newpack+0xeef/0x12e0 net/ipv4/igmp.c:389 add_grhead.isra.27+0x235/0x300 net/ipv4/igmp.c:432 add_grec+0xbd3/0x1170 net/ipv4/igmp.c:565 igmpv3_send_report+0xd5/0x5b0 net/ipv4/igmp.c:605 igmp_send_report+0xc43/0x1050 net/ipv4/igmp.c:722 igmp_timer_expire+0x322/0x5c0 net/ipv4/igmp.c:831 call_timer_fn+0x228/0x820 kernel/time/timer.c:1326 expire_timers kernel/time/timer.c:1363 [inline] __run_timers+0x7ee/0xb70 kernel/time/timer.c:1666 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:541 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:938 Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/igmp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9c7a4cea1628b..7f5fe07d0b13a 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -386,7 +386,11 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; + + rcu_read_lock(); pip->saddr = igmpv3_get_srcaddr(dev, &fl4); + rcu_read_unlock(); + pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); From 97fe899816a68e3970b877e766bcb8d66013076b Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 29 Jan 2018 17:53:42 +0800 Subject: [PATCH 176/247] qlcnic: fix deadlock bug [ Upstream commit 233ac3891607f501f08879134d623b303838f478 ] The following soft lockup was caught. This is a deadlock caused by recusive locking. Process kworker/u40:1:28016 was holding spin lock "mbx->queue_lock" in qlcnic_83xx_mailbox_worker(), while a softirq came in and ask the same spin lock in qlcnic_83xx_enqueue_mbx_cmd(). This lock should be hold by disable bh.. [161846.962125] NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [kworker/u40:1:28016] [161846.962367] Modules linked in: tun ocfs2 xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn xenfs xen_privcmd autofs4 ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs bnx2fc fcoe libfcoe libfc sunrpc 8021q mrp garp bridge stp llc bonding dm_round_robin dm_multipath iTCO_wdt iTCO_vendor_support pcspkr sb_edac edac_core i2c_i801 shpchp lpc_ich mfd_core ioatdma ipmi_devintf ipmi_si ipmi_msghandler sg ext4 jbd2 mbcache2 sr_mod cdrom sd_mod igb i2c_algo_bit i2c_core ahci libahci megaraid_sas ixgbe dca ptp pps_core vxlan udp_tunnel ip6_udp_tunnel qla2xxx scsi_transport_fc qlcnic crc32c_intel be2iscsi bnx2i cnic uio cxgb4i cxgb4 cxgb3i libcxgbi ipv6 cxgb3 mdio libiscsi_tcp qla4xxx iscsi_boot_sysfs libiscsi scsi_transport_iscsi dm_mirror dm_region_hash dm_log dm_mod [161846.962454] [161846.962460] CPU: 1 PID: 28016 Comm: kworker/u40:1 Not tainted 4.1.12-94.5.9.el6uek.x86_64 #2 [161846.962463] Hardware name: Oracle Corporation SUN SERVER X4-2L /ASSY,MB,X4-2L , BIOS 26050100 09/19/2017 [161846.962489] Workqueue: qlcnic_mailbox qlcnic_83xx_mailbox_worker [qlcnic] [161846.962493] task: ffff8801f2e34600 ti: ffff88004ca5c000 task.ti: ffff88004ca5c000 [161846.962496] RIP: e030:[] [] xen_hypercall_sched_op+0xa/0x20 [161846.962506] RSP: e02b:ffff880202e43388 EFLAGS: 00000206 [161846.962509] RAX: 0000000000000000 RBX: ffff8801f6996b70 RCX: ffffffff810013aa [161846.962511] RDX: ffff880202e433cc RSI: ffff880202e433b0 RDI: 0000000000000003 [161846.962513] RBP: ffff880202e433d0 R08: 0000000000000000 R09: ffff8801fe893200 [161846.962516] R10: ffff8801fe400538 R11: 0000000000000206 R12: ffff880202e4b000 [161846.962518] R13: 0000000000000050 R14: 0000000000000001 R15: 000000000000020d [161846.962528] FS: 0000000000000000(0000) GS:ffff880202e40000(0000) knlGS:ffff880202e40000 [161846.962531] CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033 [161846.962533] CR2: 0000000002612640 CR3: 00000001bb796000 CR4: 0000000000042660 [161846.962536] Stack: [161846.962538] ffff880202e43608 0000000000000000 ffffffff813f0442 ffff880202e433b0 [161846.962543] 0000000000000000 ffff880202e433cc ffffffff00000001 0000000000000000 [161846.962547] 00000009813f03d6 ffff880202e433e0 ffffffff813f0460 ffff880202e43440 [161846.962552] Call Trace: [161846.962555] [161846.962565] [] ? xen_poll_irq_timeout+0x42/0x50 [161846.962570] [] xen_poll_irq+0x10/0x20 [161846.962578] [] xen_lock_spinning+0xe2/0x110 [161846.962583] [] __raw_callee_save_xen_lock_spinning+0x11/0x20 [161846.962592] [] ? _raw_spin_lock+0x57/0x80 [161846.962609] [] qlcnic_83xx_enqueue_mbx_cmd+0x7c/0xe0 [qlcnic] [161846.962623] [] qlcnic_83xx_issue_cmd+0x58/0x210 [qlcnic] [161846.962636] [] qlcnic_83xx_sre_macaddr_change+0x162/0x1d0 [qlcnic] [161846.962649] [] qlcnic_83xx_change_l2_filter+0x2b/0x30 [qlcnic] [161846.962657] [] ? __skb_flow_dissect+0x18b/0x650 [161846.962670] [] qlcnic_send_filter+0x205/0x250 [qlcnic] [161846.962682] [] qlcnic_xmit_frame+0x547/0x7b0 [qlcnic] [161846.962691] [] xmit_one+0x82/0x1a0 [161846.962696] [] dev_hard_start_xmit+0x50/0xa0 [161846.962701] [] sch_direct_xmit+0x112/0x220 [161846.962706] [] __dev_queue_xmit+0x1df/0x5e0 [161846.962710] [] dev_queue_xmit_sk+0x13/0x20 [161846.962721] [] bond_dev_queue_xmit+0x35/0x80 [bonding] [161846.962729] [] __bond_start_xmit+0x1cb/0x210 [bonding] [161846.962736] [] bond_start_xmit+0x31/0x60 [bonding] [161846.962740] [] xmit_one+0x82/0x1a0 [161846.962745] [] dev_hard_start_xmit+0x50/0xa0 [161846.962749] [] __dev_queue_xmit+0x4ee/0x5e0 [161846.962754] [] dev_queue_xmit_sk+0x13/0x20 [161846.962760] [] vlan_dev_hard_start_xmit+0xb2/0x150 [8021q] [161846.962764] [] xmit_one+0x82/0x1a0 [161846.962769] [] dev_hard_start_xmit+0x50/0xa0 [161846.962773] [] __dev_queue_xmit+0x4ee/0x5e0 [161846.962777] [] dev_queue_xmit_sk+0x13/0x20 [161846.962789] [] br_dev_queue_push_xmit+0x54/0xa0 [bridge] [161846.962797] [] br_forward_finish+0x2f/0x90 [bridge] [161846.962807] [] ? ttwu_do_wakeup+0x1d/0x100 [161846.962811] [] ? __alloc_skb+0x8b/0x1f0 [161846.962818] [] __br_forward+0x8d/0x120 [bridge] [161846.962822] [] ? __kmalloc_reserve+0x3b/0xa0 [161846.962829] [] ? update_rq_runnable_avg+0xee/0x230 [161846.962836] [] br_forward+0x96/0xb0 [bridge] [161846.962845] [] br_handle_frame_finish+0x1ae/0x420 [bridge] [161846.962853] [] br_handle_frame+0x17f/0x260 [bridge] [161846.962862] [] ? br_handle_frame_finish+0x420/0x420 [bridge] [161846.962867] [] __netif_receive_skb_core+0x1f7/0x870 [161846.962872] [] __netif_receive_skb+0x22/0x70 [161846.962877] [] netif_receive_skb_internal+0x23/0x90 [161846.962884] [] ? xenvif_idx_release+0xea/0x100 [xen_netback] [161846.962889] [] ? _raw_spin_unlock_irqrestore+0x20/0x50 [161846.962893] [] netif_receive_skb_sk+0x24/0x90 [161846.962899] [] xenvif_tx_submit+0x2ca/0x3f0 [xen_netback] [161846.962906] [] xenvif_tx_action+0x9c/0xd0 [xen_netback] [161846.962915] [] xenvif_poll+0x35/0x70 [xen_netback] [161846.962920] [] napi_poll+0xcb/0x1e0 [161846.962925] [] net_rx_action+0x90/0x1c0 [161846.962931] [] __do_softirq+0x10a/0x350 [161846.962938] [] irq_exit+0x125/0x130 [161846.962943] [] xen_evtchn_do_upcall+0x39/0x50 [161846.962950] [] xen_do_hypervisor_callback+0x1e/0x40 [161846.962952] [161846.962959] [] ? _raw_spin_lock+0x4a/0x80 [161846.962964] [] ? _raw_spin_lock_irqsave+0x1e/0xa0 [161846.962978] [] ? qlcnic_83xx_mailbox_worker+0xb9/0x2a0 [qlcnic] [161846.962991] [] ? process_one_work+0x151/0x4b0 [161846.962995] [] ? check_events+0x12/0x20 [161846.963001] [] ? worker_thread+0x120/0x480 [161846.963005] [] ? __schedule+0x30b/0x890 [161846.963010] [] ? process_one_work+0x4b0/0x4b0 [161846.963015] [] ? process_one_work+0x4b0/0x4b0 [161846.963021] [] ? kthread+0xce/0xf0 [161846.963025] [] ? kthread_freezable_should_stop+0x70/0x70 [161846.963031] [] ? ret_from_fork+0x42/0x70 [161846.963035] [] ? kthread_freezable_should_stop+0x70/0x70 [161846.963037] Code: cc 51 41 53 b8 1c 00 00 00 0f 05 41 5b 59 c3 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 51 41 53 b8 1d 00 00 00 0f 05 <41> 5b 59 c3 cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc Signed-off-by: Junxiao Bi Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- .../ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index bdbcd2b088a01..c3c28f0960e50 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -3849,7 +3849,7 @@ static void qlcnic_83xx_flush_mbx_queue(struct qlcnic_adapter *adapter) struct list_head *head = &mbx->cmd_q; struct qlcnic_cmd_args *cmd = NULL; - spin_lock(&mbx->queue_lock); + spin_lock_bh(&mbx->queue_lock); while (!list_empty(head)) { cmd = list_entry(head->next, struct qlcnic_cmd_args, list); @@ -3860,7 +3860,7 @@ static void qlcnic_83xx_flush_mbx_queue(struct qlcnic_adapter *adapter) qlcnic_83xx_notify_cmd_completion(adapter, cmd); } - spin_unlock(&mbx->queue_lock); + spin_unlock_bh(&mbx->queue_lock); } static int qlcnic_83xx_check_mbx_status(struct qlcnic_adapter *adapter) @@ -3896,12 +3896,12 @@ static void qlcnic_83xx_dequeue_mbx_cmd(struct qlcnic_adapter *adapter, { struct qlcnic_mailbox *mbx = adapter->ahw->mailbox; - spin_lock(&mbx->queue_lock); + spin_lock_bh(&mbx->queue_lock); list_del(&cmd->list); mbx->num_cmds--; - spin_unlock(&mbx->queue_lock); + spin_unlock_bh(&mbx->queue_lock); qlcnic_83xx_notify_cmd_completion(adapter, cmd); } @@ -3966,7 +3966,7 @@ static int qlcnic_83xx_enqueue_mbx_cmd(struct qlcnic_adapter *adapter, init_completion(&cmd->completion); cmd->rsp_opcode = QLC_83XX_MBX_RESPONSE_UNKNOWN; - spin_lock(&mbx->queue_lock); + spin_lock_bh(&mbx->queue_lock); list_add_tail(&cmd->list, &mbx->cmd_q); mbx->num_cmds++; @@ -3974,7 +3974,7 @@ static int qlcnic_83xx_enqueue_mbx_cmd(struct qlcnic_adapter *adapter, *timeout = cmd->total_cmds * QLC_83XX_MBX_TIMEOUT; queue_work(mbx->work_q, &mbx->work); - spin_unlock(&mbx->queue_lock); + spin_unlock_bh(&mbx->queue_lock); return 0; } @@ -4070,15 +4070,15 @@ static void qlcnic_83xx_mailbox_worker(struct work_struct *work) mbx->rsp_status = QLC_83XX_MBX_RESPONSE_WAIT; spin_unlock_irqrestore(&mbx->aen_lock, flags); - spin_lock(&mbx->queue_lock); + spin_lock_bh(&mbx->queue_lock); if (list_empty(head)) { - spin_unlock(&mbx->queue_lock); + spin_unlock_bh(&mbx->queue_lock); return; } cmd = list_entry(head->next, struct qlcnic_cmd_args, list); - spin_unlock(&mbx->queue_lock); + spin_unlock_bh(&mbx->queue_lock); mbx_ops->encode_cmd(adapter, cmd); mbx_ops->nofity_fw(adapter, QLC_83XX_MBX_REQUEST); From 9f2f873d5a1c3f6b22b18db0e4c9f3a30164b8f0 Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Tue, 30 Jan 2018 14:12:55 +0100 Subject: [PATCH 177/247] qmi_wwan: Add support for Quectel EP06 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c0b91a56a2e57a5a370655b25d677ae0ebf8a2d0 ] The Quectel EP06 is a Cat. 6 LTE modem. It uses the same interface as the EC20/EC25 for QMI, and requires the same "set DTR"-quirk to work. Signed-off-by: Kristian Evensen Acked-by: Bjørn Mork Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index db65d9ad44881..e1e5e84384573 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -944,6 +944,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0306, 4)}, /* Quectel EP06 Mini PCIe */ /* 4. Gobi 1000 devices */ {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */ From 5db5cabbf09dd793bf42e65cea9abe66fad62c13 Mon Sep 17 00:00:00 2001 From: Chunhao Lin Date: Wed, 31 Jan 2018 01:32:36 +0800 Subject: [PATCH 178/247] r8169: fix RTL8168EP take too long to complete driver initialization. [ Upstream commit 086ca23d03c0d2f4088f472386778d293e15c5f6 ] Driver check the wrong register bit in rtl_ocp_tx_cond() that keep driver waiting until timeout. Fix this by waiting for the right register bit. Signed-off-by: Chunhao Lin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/realtek/r8169.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 298b74ebc1e90..18e68c91e6513 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -1387,7 +1387,7 @@ DECLARE_RTL_COND(rtl_ocp_tx_cond) { void __iomem *ioaddr = tp->mmio_addr; - return RTL_R8(IBISR0) & 0x02; + return RTL_R8(IBISR0) & 0x20; } static void rtl8168ep_stop_cmac(struct rtl8169_private *tp) @@ -1395,7 +1395,7 @@ static void rtl8168ep_stop_cmac(struct rtl8169_private *tp) void __iomem *ioaddr = tp->mmio_addr; RTL_W8(IBCR2, RTL_R8(IBCR2) & ~0x01); - rtl_msleep_loop_wait_low(tp, &rtl_ocp_tx_cond, 50, 2000); + rtl_msleep_loop_wait_high(tp, &rtl_ocp_tx_cond, 50, 2000); RTL_W8(IBISR0, RTL_R8(IBISR0) | 0x20); RTL_W8(IBCR0, RTL_R8(IBCR0) & ~0x01); } From ee46a8614204e136e7ec6426e9c696bd21901405 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 26 Jan 2018 16:40:41 +0800 Subject: [PATCH 179/247] tcp: release sk_frag.page in tcp_disconnect [ Upstream commit 9b42d55a66d388e4dd5550107df051a9637564fc ] socket can be disconnected and gets transformed back to a listening socket, if sk_frag.page is not released, which will be cloned into a new socket by sk_clone_lock, but the reference count of this page is increased, lead to a use after free or double free issue Signed-off-by: Li RongQing Cc: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7efa6b062049a..0d1a767db1bb0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2316,6 +2316,12 @@ int tcp_disconnect(struct sock *sk, int flags) WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + sk->sk_error_report(sk); return err; } From 73adb3b74efd2809735d2063b8fdaaf762bdd071 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 25 Jan 2018 22:03:52 +0800 Subject: [PATCH 180/247] vhost_net: stop device during reset owner [ Upstream commit 4cd879515d686849eec5f718aeac62a70b067d82 ] We don't stop device before reset owner, this means we could try to serve any virtqueue kick before reset dev->worker. This will result a warn since the work was pending at llist during owner resetting. Fix this by stopping device during owner reset. Reported-by: syzbot+eb17c6162478cc50632c@syzkaller.appspotmail.com Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server") Signed-off-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 96a0661011fdf..e5b7652234fcf 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1078,6 +1078,7 @@ static long vhost_net_reset_owner(struct vhost_net *n) } vhost_net_stop(n, &tx_sock, &rx_sock); vhost_net_flush(n); + vhost_dev_stop(&n->dev); vhost_dev_reset_owner(&n->dev, umem); vhost_net_vq_reset(n); done: From b980f718f52561b58072e0e675320fd21d71b94a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 31 Jan 2018 15:43:05 -0500 Subject: [PATCH 181/247] tcp_bbr: fix pacing_gain to always be unity when using lt_bw [ Upstream commit 3aff3b4b986e51bcf4ab249e5d48d39596e0df6a ] This commit fixes the pacing_gain to remain at BBR_UNIT (1.0) when using lt_bw and returning from the PROBE_RTT state to PROBE_BW. Previously, when using lt_bw, upon exiting PROBE_RTT and entering PROBE_BW the bbr_reset_probe_bw_mode() code could sometimes randomly end up with a cycle_idx of 0 and hence have bbr_advance_cycle_phase() set a pacing gain above 1.0. In such cases this would result in a pacing rate that is 1.25x higher than intended, potentially resulting in a high loss rate for a little while until we stop using the lt_bw a bit later. This commit is a stable candidate for kernels back as far as 4.9. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Reported-by: Beyers Cronje Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_bbr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index e86a34fd54847..8ec60532be2b6 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -452,7 +452,8 @@ static void bbr_advance_cycle_phase(struct sock *sk) bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); bbr->cycle_mstamp = tp->delivered_mstamp; - bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; + bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : + bbr_pacing_gain[bbr->cycle_idx]; } /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ @@ -461,8 +462,7 @@ static void bbr_update_cycle_phase(struct sock *sk, { struct bbr *bbr = inet_csk_ca(sk); - if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && - bbr_is_next_cycle_phase(sk, rs)) + if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) bbr_advance_cycle_phase(sk); } From fa46d1437fabe9da3a3c084cef6ac310ecadb9f8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 2 Feb 2018 16:02:22 +0100 Subject: [PATCH 182/247] cls_u32: add missing RCU annotation. [ Upstream commit 058a6c033488494a6b1477b05fe8e1a16e344462 ] In a couple of points of the control path, n->ht_down is currently accessed without the required RCU annotation. The accesses are safe, but sparse complaints. Since we already held the rtnl lock, let use rtnl_dereference(). Fixes: a1b7c5fd7fe9 ("net: sched: add cls_u32 offload hooks for netdevs") Fixes: de5df63228fc ("net: sched: cls_u32 changes to knode must appear atomic to readers") Signed-off-by: Paolo Abeni Acked-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/cls_u32.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ae83c3aec3082..da574a16e7b36 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -496,6 +496,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags) { + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; struct tc_to_netdev offload; @@ -520,7 +521,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, offload.cls_u32->knode.sel = &n->sel; offload.cls_u32->knode.exts = &n->exts; if (n->ht_down) - offload.cls_u32->knode.link_handle = n->ht_down->handle; + offload.cls_u32->knode.link_handle = ht->handle; err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &offload); @@ -788,8 +789,9 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, struct tc_u_knode *n) { - struct tc_u_knode *new; + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tc_u32_sel *s = &n->sel; + struct tc_u_knode *new; new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); @@ -807,11 +809,11 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, new->fshift = n->fshift; new->res = n->res; new->flags = n->flags; - RCU_INIT_POINTER(new->ht_down, n->ht_down); + RCU_INIT_POINTER(new->ht_down, ht); /* bump reference count as long as we hold pointer to structure */ - if (new->ht_down) - new->ht_down->refcnt++; + if (ht) + ht->refcnt++; #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update From 5771415d24bf66f199a4a16d900d2c3afaaa6423 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Jan 2018 23:15:27 -0800 Subject: [PATCH 183/247] ipv6: Fix SO_REUSEPORT UDP socket with implicit sk_ipv6only [ Upstream commit 7ece54a60ee2ba7a386308cae73c790bd580589c ] If a sk_v6_rcv_saddr is !IPV6_ADDR_ANY and !IPV6_ADDR_MAPPED, it implicitly implies it is an ipv6only socket. However, in inet6_bind(), this addr_type checking and setting sk->sk_ipv6only to 1 are only done after sk->sk_prot->get_port(sk, snum) has been completed successfully. This inconsistency between sk_v6_rcv_saddr and sk_ipv6only confuses the 'get_port()'. In particular, when binding SO_REUSEPORT UDP sockets, udp_reuseport_add_sock(sk,...) is called. udp_reuseport_add_sock() checks "ipv6_only_sock(sk2) == ipv6_only_sock(sk)" before adding sk to sk2->sk_reuseport_cb. In this case, ipv6_only_sock(sk2) could be 1 while ipv6_only_sock(sk) is still 0 here. The end result is, reuseport_alloc(sk) is called instead of adding sk to the existing sk2->sk_reuseport_cb. It can be reproduced by binding two SO_REUSEPORT UDP sockets on an IPv6 address (!ANY and !MAPPED). Only one of the socket will receive packet. The fix is to set the implicit sk_ipv6only before calling get_port(). The original sk_ipv6only has to be saved such that it can be restored in case get_port() failed. The situation is similar to the inet_reset_saddr(sk) after get_port() has failed. Thanks to Calvin Owens who created an easy reproduction which leads to a fix. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/af_inet6.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5cad76f875362..4213790149959 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -274,6 +274,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); __be32 v4addr = 0; unsigned short snum; + bool saved_ipv6only; int addr_type = 0; int err = 0; @@ -378,19 +379,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; + saved_ipv6only = sk->sk_ipv6only; + if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED) + sk->sk_ipv6only = 1; + /* Make sure we are allowed to bind here. */ if ((snum || !inet->bind_address_no_port) && sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } - if (addr_type != IPV6_ADDR_ANY) { + if (addr_type != IPV6_ADDR_ANY) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; - if (addr_type != IPV6_ADDR_MAPPED) - sk->sk_ipv6only = 1; - } if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); From b671f40419bb0d59c5af69c17af5a86bc467a273 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Feb 2018 10:27:27 -0800 Subject: [PATCH 184/247] soreuseport: fix mem leak in reuseport_add_sock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4db428a7c9ab07e08783e0fcdc4ca0f555da0567 ] reuseport_add_sock() needs to deal with attaching a socket having its own sk_reuseport_cb, after a prior setsockopt(SO_ATTACH_REUSEPORT_?BPF) Without this fix, not only a WARN_ONCE() was issued, but we were also leaking memory. Thanks to sysbot and Eric Biggers for providing us nice C repros. ------------[ cut here ]------------ socket already in reuseport group WARNING: CPU: 0 PID: 3496 at net/core/sock_reuseport.c:119   reuseport_add_sock+0x742/0x9b0 net/core/sock_reuseport.c:117 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 3496 Comm: syzkaller869503 Not tainted 4.15.0-rc6+ #245 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS   Google 01/01/2011 Call Trace:   __dump_stack lib/dump_stack.c:17 [inline]   dump_stack+0x194/0x257 lib/dump_stack.c:53   panic+0x1e4/0x41c kernel/panic.c:183   __warn+0x1dc/0x200 kernel/panic.c:547   report_bug+0x211/0x2d0 lib/bug.c:184   fixup_bug.part.11+0x37/0x80 arch/x86/kernel/traps.c:178   fixup_bug arch/x86/kernel/traps.c:247 [inline]   do_error_trap+0x2d7/0x3e0 arch/x86/kernel/traps.c:296   do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315   invalid_op+0x22/0x40 arch/x86/entry/entry_64.S:1079 Fixes: ef456144da8e ("soreuseport: define reuseport groups") Signed-off-by: Eric Dumazet Reported-by: syzbot+c0ea2226f77a42936bf7@syzkaller.appspotmail.com Acked-by: Craig Gallek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/sock_reuseport.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 77f396b679ce7..5dce4291f0ed0 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -93,6 +93,16 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) return more_reuse; } +static void reuseport_free_rcu(struct rcu_head *head) +{ + struct sock_reuseport *reuse; + + reuse = container_of(head, struct sock_reuseport, rcu); + if (reuse->prog) + bpf_prog_destroy(reuse->prog); + kfree(reuse); +} + /** * reuseport_add_sock - Add a socket to the reuseport group of another. * @sk: New socket to add to the group. @@ -101,7 +111,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) */ int reuseport_add_sock(struct sock *sk, struct sock *sk2) { - struct sock_reuseport *reuse; + struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { int err = reuseport_alloc(sk2); @@ -112,10 +122,13 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk2->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock)), - "socket already in reuseport group"); + lockdep_is_held(&reuseport_lock)); + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (old_reuse && old_reuse->num_socks != 1) { + spin_unlock_bh(&reuseport_lock); + return -EBUSY; + } if (reuse->num_socks == reuse->max_socks) { reuse = reuseport_grow(reuse); @@ -133,19 +146,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2) spin_unlock_bh(&reuseport_lock); + if (old_reuse) + call_rcu(&old_reuse->rcu, reuseport_free_rcu); return 0; } -static void reuseport_free_rcu(struct rcu_head *head) -{ - struct sock_reuseport *reuse; - - reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); - kfree(reuse); -} - void reuseport_detach_sock(struct sock *sk) { struct sock_reuseport *reuse; From 0a9b2dec6c12e30895ab7478000420ff2e811dce Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 28 Sep 2017 16:58:26 -0500 Subject: [PATCH 185/247] x86/asm: Fix inline asm call constraints for GCC 4.4 commit 520a13c530aeb5f63e011d668c42db1af19ed349 upstream. The kernel test bot (run by Xiaolong Ye) reported that the following commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") is causing double faults in a kernel compiled with GCC 4.4. Linus subsequently diagnosed the crash pattern and the buggy commit and found that the issue is with this code: register unsigned int __asm_call_sp asm("esp"); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) Even on a 64-bit kernel, it's using ESP instead of RSP. That causes GCC to produce the following bogus code: ffffffff8147461d: 89 e0 mov %esp,%eax ffffffff8147461f: 4c 89 f7 mov %r14,%rdi ffffffff81474622: 4c 89 fe mov %r15,%rsi ffffffff81474625: ba 20 00 00 00 mov $0x20,%edx ffffffff8147462a: 89 c4 mov %eax,%esp ffffffff8147462c: e8 bf 52 05 00 callq ffffffff814c98f0 Despite the absurdity of it backing up and restoring the stack pointer for no reason, the bug is actually the fact that it's only backing up and restoring the lower 32 bits of the stack pointer. The upper 32 bits are getting cleared out, corrupting the stack pointer. So change the '__asm_call_sp' register variable to be associated with the actual full-size stack pointer. This also requires changing the __ASM_SEL() macro to be based on the actual compiled arch size, rather than the CONFIG value, because CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso). Otherwise Clang fails to build the kernel because it complains about the use of a 64-bit register (RSP) in a 32-bit file. Reported-and-Bisected-and-Tested-by: kernel test robot Diagnosed-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: LKP Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble Signed-off-by: Ingo Molnar Cc: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 00523524edbfe..7bb29a416b771 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -11,10 +11,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif From dd7b14c3e05ef49bc2062ccc18a703604063fe1d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Dec 2016 17:44:13 +0100 Subject: [PATCH 186/247] x86/microcode/AMD: Do not load when running on a hypervisor commit a15a753539eca8ba243d576f02e7ca9c4b7d7042 upstream. Doing so is completely void of sense for multiple reasons so prevent it. Set dis_ucode_ldr to true and thus disable the microcode loader by default to address xen pv guests which execute the AP path but not the BSP path. By having it turned off by default, the APs won't run into the loader either. Also, check CPUID(1).ECX[31] which hypervisors set. Well almost, not the xen pv one. That one gets the aforementioned "fix". Also, improve the detection method by caching the final decision whether to continue loading in dis_ucode_ldr and do it once on the BSP. The APs then simply test that value. Signed-off-by: Borislav Petkov Tested-by: Juergen Gross Tested-by: Boris Ostrovsky Acked-by: Juergen Gross Link: http://lkml.kernel.org/r/20161218164414.9649-4-bp@alien8.de Signed-off-by: Thomas Gleixner Cc: Rolf Neugebauer Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/core.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5ce5155f0695a..dc0b9f8763adc 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,7 +43,7 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr; +static bool dis_ucode_ldr = true; /* * Synchronization. @@ -73,6 +73,7 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { static const char *__dis_opt_str = "dis_ucode_ldr"; + u32 a, b, c, d; #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); @@ -85,8 +86,23 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (cmdline_find_option_bool(cmdline, option)) - *res = true; + if (!have_cpuid_p()) + return *res; + + a = 1; + c = 0; + native_cpuid(&a, &b, &c, &d); + + /* + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not + * completely accurate as xen pv guests don't see that CPUID bit set but + * that's good enough as they don't land on the BSP path anyway. + */ + if (c & BIT(31)) + return *res; + + if (cmdline_find_option_bool(cmdline, option) <= 0) + *res = false; return *res; } @@ -118,9 +134,6 @@ void __init load_ucode_bsp(void) if (check_loader_disabled_bsp()) return; - if (!have_cpuid_p()) - return; - vendor = x86_cpuid_vendor(); family = x86_cpuid_family(); @@ -154,9 +167,6 @@ void load_ucode_ap(void) if (check_loader_disabled_ap()) return; - if (!have_cpuid_p()) - return; - vendor = x86_cpuid_vendor(); family = x86_cpuid_family(); From 113d22965c78a79210d4da2d455dc9bcff5e2fb6 Mon Sep 17 00:00:00 2001 From: Jesse Chan Date: Mon, 20 Nov 2017 15:56:28 -0500 Subject: [PATCH 187/247] media: soc_camera: soc_scale_crop: add missing MODULE_DESCRIPTION/AUTHOR/LICENSE commit 5331aec1bf9c9da557668174e0a4bfcee39f1121 upstream. This change resolves a new compile-time warning when built as a loadable module: WARNING: modpost: missing MODULE_LICENSE() in drivers/media/platform/soc_camera/soc_scale_crop.o see include/linux/module.h for more information This adds the license as "GPL", which matches the header of the file. MODULE_DESCRIPTION and MODULE_AUTHOR are also added. Signed-off-by: Jesse Chan Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/soc_camera/soc_scale_crop.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/media/platform/soc_camera/soc_scale_crop.c b/drivers/media/platform/soc_camera/soc_scale_crop.c index f77252d6ccd3a..d29c24854c2c0 100644 --- a/drivers/media/platform/soc_camera/soc_scale_crop.c +++ b/drivers/media/platform/soc_camera/soc_scale_crop.c @@ -418,3 +418,7 @@ void soc_camera_calc_client_output(struct soc_camera_device *icd, mf->height = soc_camera_shift_scale(rect->height, shift, scale_v); } EXPORT_SYMBOL(soc_camera_calc_client_output); + +MODULE_DESCRIPTION("soc-camera scaling-cropping functions"); +MODULE_AUTHOR("Guennadi Liakhovetski "); +MODULE_LICENSE("GPL"); From 0a01ecbd23a9547b60b1a1d7e83f60704b176925 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 May 2017 11:27:52 +0200 Subject: [PATCH 188/247] b43: Add missing MODULE_FIRMWARE() commit 3c89a72ad80c64bdbd5ff851ee9c328a191f7e01 upstream. Some firmware entries were forgotten to be added via MODULE_FIRMWARE(), which may result in the non-functional state when the driver is loaded in initrd. Link: http://bugzilla.opensuse.org/show_bug.cgi?id=1037344 Fixes: 15be8e89cdd9 ("b43: add more bcma cores") Signed-off-by: Takashi Iwai Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/broadcom/b43/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c index 6e5d9095b1956..a635fc6b1722b 100644 --- a/drivers/net/wireless/broadcom/b43/main.c +++ b/drivers/net/wireless/broadcom/b43/main.c @@ -71,8 +71,18 @@ MODULE_FIRMWARE("b43/ucode11.fw"); MODULE_FIRMWARE("b43/ucode13.fw"); MODULE_FIRMWARE("b43/ucode14.fw"); MODULE_FIRMWARE("b43/ucode15.fw"); +MODULE_FIRMWARE("b43/ucode16_lp.fw"); MODULE_FIRMWARE("b43/ucode16_mimo.fw"); +MODULE_FIRMWARE("b43/ucode24_lcn.fw"); +MODULE_FIRMWARE("b43/ucode25_lcn.fw"); +MODULE_FIRMWARE("b43/ucode25_mimo.fw"); +MODULE_FIRMWARE("b43/ucode26_mimo.fw"); +MODULE_FIRMWARE("b43/ucode29_mimo.fw"); +MODULE_FIRMWARE("b43/ucode33_lcn40.fw"); +MODULE_FIRMWARE("b43/ucode30_mimo.fw"); MODULE_FIRMWARE("b43/ucode5.fw"); +MODULE_FIRMWARE("b43/ucode40.fw"); +MODULE_FIRMWARE("b43/ucode42.fw"); MODULE_FIRMWARE("b43/ucode9.fw"); static int modparam_bad_frames_preempt; From 9692602ab850eec484d8cc5a740803d34f00016c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jun 2017 14:48:18 +0100 Subject: [PATCH 189/247] KEYS: encrypted: fix buffer overread in valid_master_desc() commit 794b4bc292f5d31739d89c0202c54e7dc9bc3add upstream. With the 'encrypted' key type it was possible for userspace to provide a data blob ending with a master key description shorter than expected, e.g. 'keyctl add encrypted desc "new x" @s'. When validating such a master key description, validate_master_desc() could read beyond the end of the buffer. Fix this by using strncmp() instead of memcmp(). [Also clean up the code to deduplicate some logic.] Cc: Mimi Zohar Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: James Morris Signed-off-by: Jin Qian Signed-off-by: Greg Kroah-Hartman --- security/keys/encrypted-keys/encrypted.c | 31 ++++++++++++------------ 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index a871159bf03cc..ead2fd60244d1 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -141,23 +141,22 @@ static int valid_ecryptfs_desc(const char *ecryptfs_desc) */ static int valid_master_desc(const char *new_desc, const char *orig_desc) { - if (!memcmp(new_desc, KEY_TRUSTED_PREFIX, KEY_TRUSTED_PREFIX_LEN)) { - if (strlen(new_desc) == KEY_TRUSTED_PREFIX_LEN) - goto out; - if (orig_desc) - if (memcmp(new_desc, orig_desc, KEY_TRUSTED_PREFIX_LEN)) - goto out; - } else if (!memcmp(new_desc, KEY_USER_PREFIX, KEY_USER_PREFIX_LEN)) { - if (strlen(new_desc) == KEY_USER_PREFIX_LEN) - goto out; - if (orig_desc) - if (memcmp(new_desc, orig_desc, KEY_USER_PREFIX_LEN)) - goto out; - } else - goto out; + int prefix_len; + + if (!strncmp(new_desc, KEY_TRUSTED_PREFIX, KEY_TRUSTED_PREFIX_LEN)) + prefix_len = KEY_TRUSTED_PREFIX_LEN; + else if (!strncmp(new_desc, KEY_USER_PREFIX, KEY_USER_PREFIX_LEN)) + prefix_len = KEY_USER_PREFIX_LEN; + else + return -EINVAL; + + if (!new_desc[prefix_len]) + return -EINVAL; + + if (orig_desc && strncmp(new_desc, orig_desc, prefix_len)) + return -EINVAL; + return 0; -out: - return -EINVAL; } /* From 734e687d1d7bcddf2548eeb5a4935a186f452b69 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 22 Jan 2018 17:09:34 -0500 Subject: [PATCH 190/247] x86/retpoline: Remove the esp/rsp thunk (cherry picked from commit 1df37383a8aeabb9b418698f0bcdffea01f4b1b2) It doesn't make sense to have an indirect call thunk with esp/rsp as retpoline code won't work correctly with the stack pointer register. Removing it will help compiler writers to catch error in case such a thunk call is emitted incorrectly. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Suggested-by: Jeff Law Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Tom Lendacky Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1516658974-27852-1-git-send-email-longman@redhat.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 1 - arch/x86/lib/retpoline.S | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b15aa4083dfd4..5a25ada75aeb6 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -37,5 +37,4 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) -INDIRECT_THUNK(sp) #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index dfb2ba91b670d..c909961e678a5 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -36,7 +36,6 @@ GENERATE_THUNK(_ASM_DX) GENERATE_THUNK(_ASM_SI) GENERATE_THUNK(_ASM_DI) GENERATE_THUNK(_ASM_BP) -GENERATE_THUNK(_ASM_SP) #ifdef CONFIG_64BIT GENERATE_THUNK(r8) GENERATE_THUNK(r9) From fea3c9a54012227b30e73f5fb3d132c103c7aa84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jan 2018 10:58:13 +0100 Subject: [PATCH 191/247] KVM: x86: Make indirect calls in emulator speculation safe (cherry picked from commit 1a29b5b7f347a1a9230c1e0af5b37e3e571588ab) Replace the indirect calls with CALL_NOSPEC. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: David Woodhouse Cc: Linus Torvalds Cc: rga@amazon.de Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Jason Baron Cc: Paolo Bonzini Cc: Dan Williams Cc: Arjan Van De Ven Cc: Tim Chen Link: https://lkml.kernel.org/r/20180125095843.595615683@infradead.org [dwmw2: Use ASM_CALL_CONSTRAINT like upstream, now we have it] Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/emulate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6f5a3b0763410..c8d573822e60e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "x86.h" #include "tss.h" @@ -1012,8 +1013,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; - asm("push %[flags]; popf; call *%[fastop]" - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + asm("push %[flags]; popf; " CALL_NOSPEC + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); return rc; } @@ -5306,15 +5307,14 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" + asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) + [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); From ec86a1dad0c00757ceb49e86b4c10dd7fbc380cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jan 2018 10:58:14 +0100 Subject: [PATCH 192/247] KVM: VMX: Make indirect call speculation safe (cherry picked from commit c940a3fb1e2e9b7d03228ab28f375fb5a47ff699) Replace indirect call with CALL_NOSPEC. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: David Woodhouse Cc: Linus Torvalds Cc: rga@amazon.de Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Jason Baron Cc: Paolo Bonzini Cc: Dan Williams Cc: Arjan Van De Ven Cc: Tim Chen Link: https://lkml.kernel.org/r/20180125095843.645776917@infradead.org Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 178a344f55f85..fcdcc88026779 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8676,14 +8676,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) #endif "pushf\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" - "call *%[entry]\n\t" + CALL_NOSPEC : #ifdef CONFIG_X86_64 [sp]"=&r"(tmp), #endif "+r"(__sp) : - [entry]"r"(entry), + THUNK_TARGET(entry), [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); From a1745ad92f50e95b6c2d101cd9b77253969ddccc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 25 Jan 2018 15:50:28 -0800 Subject: [PATCH 193/247] module/retpoline: Warn about missing retpoline in module (cherry picked from commit caf7501a1b4ec964190f31f9c3f163de252273b8) There's a risk that a kernel which has full retpoline mitigations becomes vulnerable when a module gets loaded that hasn't been compiled with the right compiler or the right option. To enable detection of that mismatch at module load time, add a module info string "retpoline" at build time when the module was compiled with retpoline support. This only covers compiled C source, but assembler source or prebuilt object files are not checked. If a retpoline enabled kernel detects a non retpoline protected module at load time, print a warning and report it in the sysfs vulnerability file. [ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: jeyu@kernel.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 17 ++++++++++++++++- include/linux/module.h | 9 +++++++++ kernel/module.c | 11 +++++++++++ scripts/mod/modpost.c | 9 +++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8cacf62ec458c..4cea7d49b3634 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -92,6 +93,19 @@ static const char *spectre_v2_strings[] = { #define pr_fmt(fmt) "Spectre V2 mitigation: " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static bool spectre_v2_bad_module; + +#ifdef RETPOLINE +bool retpoline_module_ok(bool has_retpoline) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + + pr_err("System may be vunerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; +} +#endif static void __init spec2_print_if_insecure(const char *reason) { @@ -277,6 +291,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); + return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], + spectre_v2_bad_module ? " - vulnerable module loaded" : ""); } #endif diff --git a/include/linux/module.h b/include/linux/module.h index 0c3207d26ac0c..d2224a09b4b51 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -791,6 +791,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr, static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef RETPOLINE +extern bool retpoline_module_ok(bool has_retpoline); +#else +static inline bool retpoline_module_ok(bool has_retpoline) +{ + return true; +} +#endif + #ifdef CONFIG_MODULE_SIG static inline bool module_sig_ok(struct module *module) { diff --git a/kernel/module.c b/kernel/module.c index 0e54d5bf0097f..07bfb9971f2fb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2817,6 +2817,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) } #endif /* CONFIG_LIVEPATCH */ +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -2969,6 +2978,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); } + check_modinfo_retpoline(mod, info); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); pr_warn("%s: module is from the staging directory, the quality " diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 845eb9b800f39..238db4ffd30c0 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2130,6 +2130,14 @@ static void add_intree_flag(struct buffer *b, int is_intree) buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); } +/* Cannot check for assembler */ +static void add_retpoline(struct buffer *b) +{ + buf_printf(b, "\n#ifdef RETPOLINE\n"); + buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n"); + buf_printf(b, "#endif\n"); +} + static void add_staging_flag(struct buffer *b, const char *name) { static const char *staging_dir = "drivers/staging"; @@ -2474,6 +2482,7 @@ int main(int argc, char **argv) add_header(&buf, mod); add_intree_flag(&buf, !external_module); + add_retpoline(&buf); add_staging_flag(&buf, mod->name); err |= add_versions(&buf, mod); add_depends(&buf, mod, modules); From d3eba7744075dc55f367364b5ed2055b5e3d5687 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:09 +0000 Subject: [PATCH 194/247] x86/cpufeatures: Add CPUID_7_EDX CPUID leaf (cherry picked from commit 95ca0ee8636059ea2800dfbac9ecac6212d6b38f) This is a pure feature bits leaf. There are two AVX512 feature bits in it already which were handled as scattered bits, and three more from this leaf are going to be added for speculation control features. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-2-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 7 +++++-- arch/x86/include/asm/cpufeatures.h | 10 ++++++---- arch/x86/include/asm/disabled-features.h | 3 ++- arch/x86/include/asm/required-features.h | 3 ++- arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/cpu/scattered.c | 2 -- 6 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9ea67a04ff4f4..8c101579f535c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -28,6 +28,7 @@ enum cpuid_leafs CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, + CPUID_7_EDX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -78,8 +79,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + BUILD_BUG_ON_ZERO(NCAPINTS != 19)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -100,8 +102,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + BUILD_BUG_ON_ZERO(NCAPINTS != 19)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8537a21acd8b4..9d4a422d55980 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -12,7 +12,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NCAPINTS 19 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -197,9 +197,7 @@ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ @@ -295,6 +293,10 @@ #define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ #define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 21c5ac15657b2..1f8cca459c6c8 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -59,6 +59,7 @@ #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) #define DISABLED_MASK17 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) +#define DISABLED_MASK18 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fac9a5c0abe94..6847d85400a8b 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -100,6 +100,7 @@ #define REQUIRED_MASK15 0 #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) +#define REQUIRED_MASK18 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d198ae02f2b7d..426727318ea23 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -737,6 +737,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; + c->x86_capability[CPUID_7_EDX] = edx; } /* Extended state features: level 0x0000000d */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index b0dd9aec183df..afbb52532791f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,8 +31,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, From 40532f65cccc5056b50cf1ab07a9a41445b24aa8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:10 +0000 Subject: [PATCH 195/247] x86/cpufeatures: Add Intel feature bits for Speculation Control (cherry picked from commit fc67dd70adb711a45d2ef34e12d1a8be75edde61) Add three feature bits exposed by new microcode on Intel CPUs for speculation control. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-3-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 9d4a422d55980..1f038882bf324 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -296,6 +296,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ /* * BUG word(s) From c26a6bea26b356fb3539cdf5b2e348a5e528aa7b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:11 +0000 Subject: [PATCH 196/247] x86/cpufeatures: Add AMD feature bits for Speculation Control (cherry picked from commit 5d10cbc91d9eb5537998b65608441b592eec65e7) AMD exposes the PRED_CMD/SPEC_CTRL MSRs slightly differently to Intel. See http://lkml.kernel.org/r/2b3e25cc-286d-8bd0-aeaf-9ac4aae39de8@amd.com Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: Tom Lendacky Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-4-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1f038882bf324..c4d03e70086be 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -258,6 +258,9 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ +#define X86_FEATURE_AMD_PRED_CMD (13*32+12) /* Prediction Command MSR (AMD) */ +#define X86_FEATURE_AMD_SPEC_CTRL (13*32+14) /* Speculation Control MSR only (AMD) */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors (AMD) */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ From af57d43c908fe9b06dc555b400eca68562262eca Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:12 +0000 Subject: [PATCH 197/247] x86/msr: Add definitions for new speculation control MSRs (cherry picked from commit 1e340c60d0dd3ae07b5bedc16a0469c14b9f3410) Add MSR and bit definitions for SPEC_CTRL, PRED_CMD and ARCH_CAPABILITIES. See Intel's 336996-Speculative-Execution-Side-Channel-Mitigations.pdf Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-5-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b11c4c072df80..c768bc1550a15 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -37,6 +37,13 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ + +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ + #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd @@ -50,6 +57,11 @@ #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) #define MSR_MTRRcap 0x000000fe + +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ + #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e From a8799fd14d9f7f385a5a5c86cde247caf4bb0320 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:13 +0000 Subject: [PATCH 198/247] x86/pti: Do not enable PTI on CPUs which are not vulnerable to Meltdown (cherry picked from commit fec9434a12f38d3aeafeb75711b71d8a1fdef621) Also, for CPUs which don't speculate at all, don't report that they're vulnerable to the Spectre variants either. Leave the cpu_no_meltdown[] match table with just X86_VENDOR_AMD in it for now, even though that could be done with a simple comparison, on the assumption that we'll have more to add. Based on suggestions from Dave Hansen and Alan Cox. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Borislav Petkov Acked-by: Dave Hansen Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-6-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 426727318ea23..cfa026f323cff 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -838,6 +840,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } +static const __initdata struct x86_cpu_id cpu_no_speculation[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_CENTAUR, 5 }, + { X86_VENDOR_INTEL, 5 }, + { X86_VENDOR_NSC, 5 }, + { X86_VENDOR_ANY, 4 }, + {} +}; + +static const __initdata struct x86_cpu_id cpu_no_meltdown[] = { + { X86_VENDOR_AMD }, + {} +}; + +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = 0; + + if (x86_match_cpu(cpu_no_meltdown)) + return false; + + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + /* Rogue Data Cache Load? No! */ + if (ia32_cap & ARCH_CAP_RDCL_NO) + return false; + + return true; +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -884,11 +921,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - if (c->x86_vendor != X86_VENDOR_AMD) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - - setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + if (!x86_match_cpu(cpu_no_speculation)) { + if (cpu_vulnerable_to_meltdown(c)) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + } fpu__init_system(c); From 6c5e49150a51b0210710246b61a972300a274dcd Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:14 +0000 Subject: [PATCH 199/247] x86/cpufeature: Blacklist SPEC_CTRL/PRED_CMD on early Spectre v2 microcodes (cherry picked from commit a5b2966364538a0e68c9fa29bc0a3a1651799035) This doesn't refuse to load the affected microcodes; it just refuses to use the Spectre v2 mitigation features if they're detected, by clearing the appropriate feature bits. The AMD CPUID bits are handled here too, because hypervisors *may* have been exposing those bits even on Intel chips, for fine-grained control of what's available. It is non-trivial to use x86_match_cpu() for this table because that doesn't handle steppings. And the approach taken in commit bd9240a18 almost made me lose my lunch. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-7-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-family.h | 7 ++- arch/x86/kernel/cpu/intel.c | 66 +++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 34a46dc076d36..75b748a1deb89 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -12,6 +12,7 @@ */ #define INTEL_FAM6_CORE_YONAH 0x0E + #define INTEL_FAM6_CORE2_MEROM 0x0F #define INTEL_FAM6_CORE2_MEROM_L 0x16 #define INTEL_FAM6_CORE2_PENRYN 0x17 @@ -21,6 +22,7 @@ #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ #define INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_FAM6_NEHALEM_EX 0x2E + #define INTEL_FAM6_WESTMERE 0x25 #define INTEL_FAM6_WESTMERE_EP 0x2C #define INTEL_FAM6_WESTMERE_EX 0x2F @@ -36,9 +38,9 @@ #define INTEL_FAM6_HASWELL_GT3E 0x46 #define INTEL_FAM6_BROADWELL_CORE 0x3D -#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_BROADWELL_GT3E 0x47 #define INTEL_FAM6_BROADWELL_X 0x4F +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 #define INTEL_FAM6_SKYLAKE_MOBILE 0x4E #define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E @@ -57,9 +59,10 @@ #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ +#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A /* Xeon Phi */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fcd484d2bb034..4d23d787a2174 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -61,6 +61,59 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +/* + * Early microcode releases for the Spectre v2 mitigation were broken. + * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { + u8 model; + u8 stepping; + u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 }, + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 }, + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + /* Updated in the 20180108 release; blacklist until we know otherwise */ + { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 }, + /* Observed in the wild */ + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { + if (c->x86_model == spectre_bad_microcodes[i].model && + c->x86_mask == spectre_bad_microcodes[i].stepping) + return (c->microcode <= spectre_bad_microcodes[i].microcode); + } + return false; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -87,6 +140,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); } + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_STIBP) || + cpu_has(c, X86_FEATURE_AMD_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_AMD_PRED_CMD) || + cpu_has(c, X86_FEATURE_AMD_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling SPEC_CTRL\n"); + clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_STIBP); + clear_cpu_cap(c, X86_FEATURE_AMD_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_AMD_PRED_CMD); + clear_cpu_cap(c, X86_FEATURE_AMD_STIBP); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * From 31fd9eda7f69e0be0d0410682fc0d4cd76fe3699 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 25 Jan 2018 16:14:15 +0000 Subject: [PATCH 200/247] x86/speculation: Add basic IBPB (Indirect Branch Prediction Barrier) support (cherry picked from commit 20ffa1caecca4db8f79fe665acdeaa5af815a24d) Expose indirect_branch_prediction_barrier() for use in subsequent patches. [ tglx: Add IBPB status to spectre_v2 sysfs file ] Co-developed-by: KarimAllah Ahmed Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Cc: gnomes@lxorguk.ukuu.org.uk Cc: ak@linux.intel.com Cc: ashok.raj@intel.com Cc: dave.hansen@intel.com Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1516896855-7642-8-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/include/asm/nospec-branch.h | 13 +++++++++++++ arch/x86/kernel/cpu/bugs.c | 10 +++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c4d03e70086be..390154576dad8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -202,6 +202,8 @@ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ +#define X86_FEATURE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled*/ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4ad41087ce0e7..34e384c7208f4 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -218,5 +218,18 @@ static inline void vmexit_fill_RSB(void) #endif } +static inline void indirect_branch_prediction_barrier(void) +{ + asm volatile(ALTERNATIVE("", + "movl %[msr], %%ecx\n\t" + "movl %[val], %%eax\n\t" + "movl $0, %%edx\n\t" + "wrmsr", + X86_FEATURE_IBPB) + : : [msr] "i" (MSR_IA32_PRED_CMD), + [val] "i" (PRED_CMD_IBPB) + : "eax", "ecx", "edx", "memory"); +} + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4cea7d49b3634..1c4b39de49bbe 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -262,6 +262,13 @@ static void __init spectre_v2_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Filling RSB on context switch\n"); } + + /* Initialize Indirect Branch Prediction Barrier if supported */ + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) || + boot_cpu_has(X86_FEATURE_AMD_PRED_CMD)) { + setup_force_cpu_cap(X86_FEATURE_IBPB); + pr_info("Enabling Indirect Branch Prediction Barrier\n"); + } } #undef pr_fmt @@ -291,7 +298,8 @@ ssize_t cpu_show_spectre_v2(struct device *dev, if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_IBPB) ? ", IPBP" : "", spectre_v2_bad_module ? " - vulnerable module loaded" : ""); } #endif From 18bc71dff630283333ccea760efb398dbe282a72 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Jan 2018 13:11:37 +0100 Subject: [PATCH 201/247] x86/nospec: Fix header guards names (cherry picked from commit 7a32fc51ca938e67974cbb9db31e1a43f98345a9) ... to adhere to the _ASM_X86_ naming scheme. No functional change. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-3-bp@alien8.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 34e384c7208f4..865192a2cc319 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NOSPEC_BRANCH_H__ -#define __NOSPEC_BRANCH_H__ +#ifndef _ASM_X86_NOSPEC_BRANCH_H_ +#define _ASM_X86_NOSPEC_BRANCH_H_ #include #include @@ -232,4 +232,4 @@ static inline void indirect_branch_prediction_barrier(void) } #endif /* __ASSEMBLY__ */ -#endif /* __NOSPEC_BRANCH_H__ */ +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ From 557cbfa2221110761d6c26085c6df43d48425598 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 26 Jan 2018 13:11:39 +0100 Subject: [PATCH 202/247] x86/bugs: Drop one "mitigation" from dmesg (cherry picked from commit 55fa19d3e51f33d9cd4056d25836d93abf9438db) Make [ 0.031118] Spectre V2 mitigation: Mitigation: Full generic retpoline into [ 0.031118] Spectre V2: Mitigation: Full generic retpoline to reduce the mitigation mitigations strings. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-5-bp@alien8.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1c4b39de49bbe..674ad4666f85e 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -90,7 +90,7 @@ static const char *spectre_v2_strings[] = { }; #undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; static bool spectre_v2_bad_module; From 98911226d51ea50dd8cb33cfb5a90be39872401b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 Jan 2018 15:45:14 +0100 Subject: [PATCH 203/247] x86/cpu/bugs: Make retpoline module warning conditional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit e383095c7fe8d218e00ec0f83e4b95ed4e627b02) If sysfs is disabled and RETPOLINE not defined: arch/x86/kernel/cpu/bugs.c:97:13: warning: ‘spectre_v2_bad_module’ defined but not used [-Wunused-variable] static bool spectre_v2_bad_module; Hide it. Fixes: caf7501a1b4e ("module/retpoline: Warn about missing retpoline in module") Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: David Woodhouse Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 674ad4666f85e..efe55c58a9215 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -93,9 +93,10 @@ static const char *spectre_v2_strings[] = { #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; -static bool spectre_v2_bad_module; #ifdef RETPOLINE +static bool spectre_v2_bad_module; + bool retpoline_module_ok(bool has_retpoline) { if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) @@ -105,6 +106,13 @@ bool retpoline_module_ok(bool has_retpoline) spectre_v2_bad_module = true; return false; } + +static inline const char *spectre_v2_module_string(void) +{ + return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } #endif static void __init spec2_print_if_insecure(const char *reason) @@ -299,7 +307,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Not affected\n"); return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_IBPB) ? ", IPBP" : "", - spectre_v2_bad_module ? " - vulnerable module loaded" : ""); + boot_cpu_has(X86_FEATURE_IBPB) ? ", IBPB" : "", + spectre_v2_module_string()); } #endif From 77b3b3ee238664df176ed44dd4658e578186c6d3 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 27 Jan 2018 16:24:32 +0000 Subject: [PATCH 204/247] x86/cpufeatures: Clean up Spectre v2 related CPUID flags (cherry picked from commit 2961298efe1ea1b6fc0d7ee8b76018fa6c0bcef2) We want to expose the hardware features simply in /proc/cpuinfo as "ibrs", "ibpb" and "stibp". Since AMD has separate CPUID bits for those, use them as the user-visible bits. When the Intel SPEC_CTRL bit is set which indicates both IBRS and IBPB capability, set those (AMD) bits accordingly. Likewise if the Intel STIBP bit is set, set the AMD STIBP that's used for the generic hardware capability. Hide the rest from /proc/cpuinfo by putting "" in the comments. Including RETPOLINE and RETPOLINE_AMD which shouldn't be visible there. There are patches to make the sysfs vulnerabilities information non-readable by non-root, and the same should apply to all information about which mitigations are actually in use. Those *shouldn't* appear in /proc/cpuinfo. The feature bit for whether IBPB is actually used, which is needed for ALTERNATIVEs, is renamed to X86_FEATURE_USE_IBPB. Originally-by: Borislav Petkov Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1517070274-12128-2-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 18 ++++++++-------- arch/x86/include/asm/nospec-branch.h | 2 +- arch/x86/kernel/cpu/bugs.c | 7 +++---- arch/x86/kernel/cpu/intel.c | 31 +++++++++++++++++++--------- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 390154576dad8..8eb23f5cf7f4e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -194,15 +194,15 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ -#define X86_FEATURE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled*/ +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -260,9 +260,9 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ -#define X86_FEATURE_AMD_PRED_CMD (13*32+12) /* Prediction Command MSR (AMD) */ -#define X86_FEATURE_AMD_SPEC_CTRL (13*32+14) /* Speculation Control MSR only (AMD) */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors (AMD) */ +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -301,8 +301,8 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 865192a2cc319..19ecb5446b302 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -225,7 +225,7 @@ static inline void indirect_branch_prediction_barrier(void) "movl %[val], %%eax\n\t" "movl $0, %%edx\n\t" "wrmsr", - X86_FEATURE_IBPB) + X86_FEATURE_USE_IBPB) : : [msr] "i" (MSR_IA32_PRED_CMD), [val] "i" (PRED_CMD_IBPB) : "eax", "ecx", "edx", "memory"); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index efe55c58a9215..3a06718257bf9 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -272,9 +272,8 @@ static void __init spectre_v2_select_mitigation(void) } /* Initialize Indirect Branch Prediction Barrier if supported */ - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) || - boot_cpu_has(X86_FEATURE_AMD_PRED_CMD)) { - setup_force_cpu_cap(X86_FEATURE_IBPB); + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); pr_info("Enabling Indirect Branch Prediction Barrier\n"); } } @@ -307,7 +306,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Not affected\n"); return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", spectre_v2_module_string()); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 4d23d787a2174..2e257f87c5275 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -140,17 +140,28 @@ static void early_init_intel(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); } - if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || - cpu_has(c, X86_FEATURE_STIBP) || - cpu_has(c, X86_FEATURE_AMD_SPEC_CTRL) || - cpu_has(c, X86_FEATURE_AMD_PRED_CMD) || - cpu_has(c, X86_FEATURE_AMD_STIBP)) && bad_spectre_microcode(c)) { - pr_warn("Intel Spectre v2 broken microcode detected; disabling SPEC_CTRL\n"); - clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); + + /* Now if any of them are set, check the blacklist and clear the lot */ + if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); + clear_cpu_cap(c, X86_FEATURE_IBRS); + clear_cpu_cap(c, X86_FEATURE_IBPB); clear_cpu_cap(c, X86_FEATURE_STIBP); - clear_cpu_cap(c, X86_FEATURE_AMD_SPEC_CTRL); - clear_cpu_cap(c, X86_FEATURE_AMD_PRED_CMD); - clear_cpu_cap(c, X86_FEATURE_AMD_STIBP); + clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); + clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP); } /* From 77d1424d2fb84ba2d3ce60a0ec4c3913e2e59abd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Jan 2018 16:24:33 +0000 Subject: [PATCH 205/247] x86/retpoline: Simplify vmexit_fill_RSB() (cherry picked from commit 1dde7415e99933bb7293d6b2843752cbdb43ec11) Simplify it to call an asm-function instead of pasting 41 insn bytes at every call site. Also, add alignment to the macro as suggested here: https://support.google.com/faqs/answer/7625886 [dwmw2: Clean up comments, let it clobber %ebx and just tell the compiler] Signed-off-by: Borislav Petkov Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: dave.hansen@intel.com Cc: karahmed@amazon.de Cc: arjan@linux.intel.com Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: bp@alien8.de Cc: pbonzini@redhat.com Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Link: https://lkml.kernel.org/r/1517070274-12128-3-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 3 +- arch/x86/entry/entry_64.S | 3 +- arch/x86/include/asm/asm-prototypes.h | 3 ++ arch/x86/include/asm/nospec-branch.h | 70 +++------------------------ arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 56 +++++++++++++++++++++ 6 files changed, 71 insertions(+), 65 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a76dc738ec614..f5434b4670c11 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -237,7 +237,8 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + /* Clobbers %ebx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e729e15285841..d8bc3e001df28 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -435,7 +435,8 @@ ENTRY(__switch_to_asm) * exist, overwrite the RSB with entries which capture * speculative execution to prevent attack. */ - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + /* Clobbers %rbx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW #endif /* restore callee-saved registers */ diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 5a25ada75aeb6..166654218329e 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -37,4 +37,7 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) +asmlinkage void __fill_rsb(void); +asmlinkage void __clear_rsb(void); + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 19ecb5446b302..df4ececa6ebfb 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,50 +7,6 @@ #include #include -/* - * Fill the CPU return stack buffer. - * - * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; lfence; jmp' loop to capture speculative execution. - * - * This is required in various cases for retpoline and IBRS-based - * mitigations for the Spectre variant 2 vulnerability. Sometimes to - * eliminate potentially bogus entries from the RSB, and sometimes - * purely to ensure that it doesn't get empty, which on some CPUs would - * allow predictions from other (unwanted!) sources to be used. - * - * We define a CPP macro such that it can be used from both .S files and - * inline assembly. It's possible to do a .macro and then include that - * from C via asm(".include ") but let's not go there. - */ - -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ -#define RSB_FILL_LOOPS 16 /* To avoid underflow */ - -/* - * Google experimented with loop-unrolling and this turned out to be - * the optimal version — two calls, each with their own speculation - * trap should their return address end up getting used, in a loop. - */ -#define __FILL_RETURN_BUFFER(reg, nr, sp) \ - mov $(nr/2), reg; \ -771: \ - call 772f; \ -773: /* speculation trap */ \ - pause; \ - lfence; \ - jmp 773b; \ -772: \ - call 774f; \ -775: /* speculation trap */ \ - pause; \ - lfence; \ - jmp 775b; \ -774: \ - dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; - #ifdef __ASSEMBLY__ /* @@ -121,17 +77,10 @@ #endif .endm - /* - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP - * monstrosity above, manually. - */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +/* This clobbers the BX register */ +.macro FILL_RETURN_BUFFER nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr -.Lskip_rsb_\@: + ALTERNATIVE "", "call __clear_rsb", \ftr #endif .endm @@ -206,15 +155,10 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops; - - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE("jmp 910f", - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), - X86_FEATURE_RETPOLINE) - "910:" - : "=r" (loops), ASM_CALL_CONSTRAINT - : : "memory" ); + alternative_input("", + "call __fill_rsb", + X86_FEATURE_RETPOLINE, + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); #endif } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 6bf1898ddf490..4ad7c4dd311c9 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_RETPOLINE) += retpoline.o +OBJECT_FILES_NON_STANDARD_retpoline.o :=y obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index c909961e678a5..480edc3a5e030 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,6 +7,7 @@ #include #include #include +#include .macro THUNK reg .section .text.__x86.indirect_thunk @@ -46,3 +47,58 @@ GENERATE_THUNK(r13) GENERATE_THUNK(r14) GENERATE_THUNK(r15) #endif + +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; lfence; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * Google experimented with loop-unrolling and this turned out to be + * the optimal version - two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +.macro STUFF_RSB nr:req sp:req + mov $(\nr / 2), %_ASM_BX + .align 16 +771: + call 772f +773: /* speculation trap */ + pause + lfence + jmp 773b + .align 16 +772: + call 774f +775: /* speculation trap */ + pause + lfence + jmp 775b + .align 16 +774: + dec %_ASM_BX + jnz 771b + add $((BITS_PER_LONG/8) * \nr), \sp +.endm + +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +ENTRY(__fill_rsb) + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP + ret +END(__fill_rsb) +EXPORT_SYMBOL_GPL(__fill_rsb) + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ + +ENTRY(__clear_rsb) + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP + ret +END(__clear_rsb) +EXPORT_SYMBOL_GPL(__clear_rsb) From 9eedeb72c4127726b2f3d62b4c7a163f67d721ab Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Tue, 30 Jan 2018 14:13:50 +0800 Subject: [PATCH 206/247] x86/spectre: Check CONFIG_RETPOLINE in command line parser (cherry picked from commit 9471eee9186a46893726e22ebb54cade3f9bc043) The spectre_v2 option 'auto' does not check whether CONFIG_RETPOLINE is enabled. As a consequence it fails to emit the appropriate warning and sets feature flags which have no effect at all. Add the missing IS_ENABLED() check. Fixes: da285121560e ("x86/spectre: Add boot time option to select Spectre v2 mitigation") Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: Tomohiro Cc: dave.hansen@intel.com Cc: bp@alien8.de Cc: arjan@linux.intel.com Cc: dwmw@amazon.co.uk Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/f5892721-7528-3647-08fb-f8d10e65ad87@cn.fujitsu.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3a06718257bf9..51624c6f2a3ba 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -212,10 +212,10 @@ static void __init spectre_v2_select_mitigation(void) return; case SPECTRE_V2_CMD_FORCE: - /* FALLTRHU */ case SPECTRE_V2_CMD_AUTO: - goto retpoline_auto; - + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; case SPECTRE_V2_CMD_RETPOLINE_AMD: if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_amd; From d7f8d17406d62f0c8b20a9100d34d0e203557fe1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 28 Jan 2018 10:38:49 -0800 Subject: [PATCH 207/247] x86/entry/64: Remove the SYSCALL64 fast path (cherry picked from commit 21d375b6b34ff511a507de27bf316b3dde6938d9) The SYCALLL64 fast path was a nice, if small, optimization back in the good old days when syscalls were actually reasonably fast. Now there is PTI to slow everything down, and indirect branches are verboten, making everything messier. The retpoline code in the fast path is particularly nasty. Just get rid of the fast path. The slow path is barely slower. [ tglx: Split out the 'push all extra regs' part ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Borislav Petkov Cc: Linus Torvalds Cc: Kernel Hardening Link: https://lkml.kernel.org/r/462dff8d4d64dfbfc851fbf3130641809d980ecd.1517164461.git.luto@kernel.org Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 123 +----------------------------------- arch/x86/entry/syscall_64.c | 7 +- 2 files changed, 3 insertions(+), 127 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d8bc3e001df28..2c154c980543b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -179,94 +179,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - /* - * If we need to do entry work or if we guess we'll need to do - * exit work, go straight to the slow path. - */ - movq PER_CPU_VAR(current_task), %r11 - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) - jnz entry_SYSCALL64_slow_path - -entry_SYSCALL_64_fastpath: - /* - * Easy case: enable interrupts and issue the syscall. If the syscall - * needs pt_regs, we'll call a stub that disables interrupts again - * and jumps to the slow path. - */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax -#else - andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10, %rcx - - /* - * This call instruction is handled specially in stub_ptregs_64. - * It might end up jumping to the slow path. If it jumps, RAX - * and all argument registers are clobbered. - */ -#ifdef CONFIG_RETPOLINE - movq sys_call_table(, %rax, 8), %rax - call __x86_indirect_thunk_rax -#else - call *sys_call_table(, %rax, 8) -#endif -.Lentry_SYSCALL_64_after_fastpath_call: - - movq %rax, RAX(%rsp) -1: - - /* - * If we get here, then we know that pt_regs is clean for SYSRET64. - * If we see that no exit work is required (which we are required - * to check with IRQs off), then we can go straight to SYSRET64. - */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - movq PER_CPU_VAR(current_task), %r11 - testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) - jnz 1f - - LOCKDEP_SYS_EXIT - TRACE_IRQS_ON /* user mode is traced as IRQs on */ - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - /* - * This opens a window where we have a user CR3, but are - * running in the kernel. This makes using the CS - * register useless for telling whether or not we need to - * switch CR3 in NMIs. Normal interrupts are OK because - * they are off here. - */ - SWITCH_USER_CR3 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 - -1: - /* - * The fast path looked good when we started, but something changed - * along the way and we need to switch to the slow path. Calling - * raise(3) will trigger this, for example. IRQs are off. - */ - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ - jmp return_from_SYSCALL_64 - -entry_SYSCALL64_slow_path: /* IRQs are off. */ SAVE_EXTRA_REGS movq %rsp, %rdi call do_syscall_64 /* returns with IRQs disabled */ -return_from_SYSCALL_64: RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -339,6 +256,7 @@ return_from_SYSCALL_64: syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 + /* * This opens a window where we have a user CR3, but are * running in the kernel. This makes using the CS @@ -363,45 +281,6 @@ opportunistic_sysret_failed: jmp restore_c_regs_and_iret END(entry_SYSCALL_64) -ENTRY(stub_ptregs_64) - /* - * Syscalls marked as needing ptregs land here. - * If we are on the fast path, we need to save the extra regs, - * which we achieve by trying again on the slow path. If we are on - * the slow path, the extra regs are already saved. - * - * RAX stores a pointer to the C function implementing the syscall. - * IRQs are on. - */ - cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) - jne 1f - - /* - * Called from fast path -- disable IRQs again, pop return address - * and jump to slow path - */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - popq %rax - jmp entry_SYSCALL64_slow_path - -1: - JMP_NOSPEC %rax /* Called from C */ -END(stub_ptregs_64) - -.macro ptregs_stub func -ENTRY(ptregs_\func) - leaq \func(%rip), %rax - jmp stub_ptregs_64 -END(ptregs_\func) -.endm - -/* Instantiate ptregs_stub for each ptregs-using syscall */ -#define __SYSCALL_64_QUAL_(sym) -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) -#include - /* * %rdi: prev task * %rsi: next task diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 9dbc5abb6162f..6705edda4ac3e 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,14 +6,11 @@ #include #include -#define __SYSCALL_64_QUAL_(sym) sym -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym - -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), +#define __SYSCALL_64(nr, sym, qual) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); From 572e509178112895f6162ead5d4bf39d2b981729 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 28 Jan 2018 10:38:49 -0800 Subject: [PATCH 208/247] x86/entry/64: Push extra regs right away (cherry picked from commit d1f7732009e0549eedf8ea1db948dc37be77fd46) With the fast path removed there is no point in splitting the push of the normal and the extra register set. Just push the extra regs right away. [ tglx: Split out from 'x86/entry/64: Remove the SYSCALL64 fast path' ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Borislav Petkov Cc: Linus Torvalds Cc: Kernel Hardening Link: https://lkml.kernel.org/r/462dff8d4d64dfbfc851fbf3130641809d980ecd.1517164461.git.luto@kernel.org Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2c154c980543b..db5009ce065aa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -177,10 +177,14 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r9 /* pt_regs->r9 */ pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ /* IRQs are off. */ - SAVE_EXTRA_REGS movq %rsp, %rdi call do_syscall_64 /* returns with IRQs disabled */ From f03d00ba0b478963fc96c975d3b27fc1b9bc3a43 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 28 Jan 2018 10:38:50 -0800 Subject: [PATCH 209/247] x86/asm: Move 'status' from thread_struct to thread_info (cherry picked from commit 37a8f7c38339b22b69876d6f5a0ab851565284e3) The TS_COMPAT bit is very hot and is accessed from code paths that mostly also touch thread_info::flags. Move it into struct thread_info to improve cache locality. The only reason it was in thread_struct is that there was a brief period during which arch-specific fields were not allowed in struct thread_info. Linus suggested further changing: ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); to: if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED))) ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); on the theory that frequently dirtying the cacheline even in pure 64-bit code that never needs to modify status hurts performance. That could be a reasonable followup patch, but I suspect it matters less on top of this patch. Suggested-by: Linus Torvalds Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Kernel Hardening Link: https://lkml.kernel.org/r/03148bcc1b217100e6e8ecf6a5468c45cf4304b6.1517164461.git.luto@kernel.org Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/common.c | 4 ++-- arch/x86/include/asm/processor.h | 2 -- arch/x86/include/asm/syscall.h | 6 +++--- arch/x86/include/asm/thread_info.h | 3 ++- arch/x86/kernel/process_64.c | 4 ++-- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 2 +- 7 files changed, 11 insertions(+), 12 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bdd9cc59d20f7..bd1d1027384f9 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -201,7 +201,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) * special case only applies after poking regs and before the * very next return to user mode. */ - current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); #endif user_enter_irqoff(); @@ -299,7 +299,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION - current->thread.status |= TS_COMPAT; + ti->status |= TS_COMPAT; #endif if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 353f038ec6458..cb866ae1bc5d5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -391,8 +391,6 @@ struct thread_struct { unsigned short gsindex; #endif - u32 status; /* thread synchronous flags */ - #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index e3c95e8e61c5f..03eedc21246d5 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task, * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. */ - if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task->thread.status & TS_COMPAT) + if (task->thread_info.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task, const unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task->thread.status & TS_COMPAT) + if (task->thread_info.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index bdf9c4c915722..89978b9c667a6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -54,6 +54,7 @@ struct task_struct; struct thread_info { unsigned long flags; /* low level flags */ + u32 status; /* thread synchronous flags */ }; #define INIT_THREAD_INFO(tsk) \ @@ -213,7 +214,7 @@ static inline int arch_within_stack_frames(const void * const stack, #define in_ia32_syscall() true #else #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \ - current->thread.status & TS_COMPAT) + current_thread_info()->status & TS_COMPAT) #endif /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0887d2ae37979..dffe81d3c2615 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -538,7 +538,7 @@ void set_personality_ia32(bool x32) current->personality &= ~READ_IMPLIES_EXEC; /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ - current->thread.status &= ~TS_COMPAT; + current_thread_info()->status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); @@ -546,7 +546,7 @@ void set_personality_ia32(bool x32) current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ - current->thread.status |= TS_COMPAT; + current_thread_info()->status |= TS_COMPAT; } } EXPORT_SYMBOL_GPL(set_personality_ia32); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0e63c0267f99c..e497d374412af 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - child->thread.status |= TS_I386_REGS_POKED; + child->thread_info.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 763af1d0de64d..b1a5d252d482a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -785,7 +785,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * than the tracee. */ #ifdef CONFIG_IA32_EMULATION - if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI From 899ab2cf91389aa3e27c341a9858dc22985dae2f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 29 Jan 2018 17:02:16 -0800 Subject: [PATCH 210/247] Documentation: Document array_index_nospec (cherry picked from commit f84a56f73dddaeac1dba8045b007f742f61cd2da) Document the rationale and usage of the new array_index_nospec() helper. Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Cc: linux-arch@vger.kernel.org Cc: Jonathan Corbet Cc: Peter Zijlstra Cc: gregkh@linuxfoundation.org Cc: kernel-hardening@lists.openwall.com Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727413645.33451.15878817161436755393.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- Documentation/speculation.txt | 90 +++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 Documentation/speculation.txt diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt new file mode 100644 index 0000000000000..e9e6cbae28416 --- /dev/null +++ b/Documentation/speculation.txt @@ -0,0 +1,90 @@ +This document explains potential effects of speculation, and how undesirable +effects can be mitigated portably using common APIs. + +=========== +Speculation +=========== + +To improve performance and minimize average latencies, many contemporary CPUs +employ speculative execution techniques such as branch prediction, performing +work which may be discarded at a later stage. + +Typically speculative execution cannot be observed from architectural state, +such as the contents of registers. However, in some cases it is possible to +observe its impact on microarchitectural state, such as the presence or +absence of data in caches. Such state may form side-channels which can be +observed to extract secret information. + +For example, in the presence of branch prediction, it is possible for bounds +checks to be ignored by code which is speculatively executed. Consider the +following code: + + int load_array(int *array, unsigned int index) + { + if (index >= MAX_ARRAY_ELEMS) + return 0; + else + return array[index]; + } + +Which, on arm64, may be compiled to an assembly sequence such as: + + CMP , #MAX_ARRAY_ELEMS + B.LT less + MOV , #0 + RET + less: + LDR , [, ] + RET + +It is possible that a CPU mis-predicts the conditional branch, and +speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This +value will subsequently be discarded, but the speculated load may affect +microarchitectural state which can be subsequently measured. + +More complex sequences involving multiple dependent memory accesses may +result in sensitive information being leaked. Consider the following +code, building on the prior example: + + int load_dependent_arrays(int *arr1, int *arr2, int index) + { + int val1, val2, + + val1 = load_array(arr1, index); + val2 = load_array(arr2, val1); + + return val2; + } + +Under speculation, the first call to load_array() may return the value +of an out-of-bounds address, while the second call will influence +microarchitectural state dependent on this value. This may provide an +arbitrary read primitive. + +==================================== +Mitigating speculation side-channels +==================================== + +The kernel provides a generic API to ensure that bounds checks are +respected even under speculation. Architectures which are affected by +speculation-based side-channels are expected to implement these +primitives. + +The array_index_nospec() helper in can be used to +prevent information from being leaked via side-channels. + +A call to array_index_nospec(index, size) returns a sanitized index +value that is bounded to [0, size) even under cpu speculation +conditions. + +This can be used to protect the earlier load_array() example: + + int load_array(int *array, unsigned int index) + { + if (index >= MAX_ARRAY_ELEMS) + return 0; + else { + index = array_index_nospec(index, MAX_ARRAY_ELEMS); + return array[index]; + } + } From 579ef9ea20d67a80a0bc5d093259ff6e97087d59 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:22 -0800 Subject: [PATCH 211/247] array_index_nospec: Sanitize speculative array de-references (cherry picked from commit f3804203306e098dae9ca51540fcd5eb700d7f40) array_index_nospec() is proposed as a generic mechanism to mitigate against Spectre-variant-1 attacks, i.e. an attack that bypasses boundary checks via speculative execution. The array_index_nospec() implementation is expected to be safe for current generation CPUs across multiple architectures (ARM, x86). Based on an original implementation by Linus Torvalds, tweaked to remove speculative flows by Alexei Starovoitov, and tweaked again by Linus to introduce an x86 assembly implementation for the mask generation. Co-developed-by: Linus Torvalds Co-developed-by: Alexei Starovoitov Suggested-by: Cyril Novikov Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727414229.33451.18411580953862676575.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- include/linux/nospec.h | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 include/linux/nospec.h diff --git a/include/linux/nospec.h b/include/linux/nospec.h new file mode 100644 index 0000000000000..b99bced39ac2f --- /dev/null +++ b/include/linux/nospec.h @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Linus Torvalds. All rights reserved. +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#ifndef _LINUX_NOSPEC_H +#define _LINUX_NOSPEC_H + +/** + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * When @index is out of bounds (@index >= @size), the sign bit will be + * set. Extend the sign bit to all bits and invert, giving a result of + * zero for an out of bounds index, or ~0 if within bounds [0, @size). + */ +#ifndef array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + /* + * Warn developers about inappropriate array_index_nospec() usage. + * + * Even if the CPU speculates past the WARN_ONCE branch, the + * sign bit of @index is taken into account when generating the + * mask. + * + * This warning is compiled out when the compiler can infer that + * @index and @size are less than LONG_MAX. + */ + if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX, + "array_index_nospec() limited to range of [0, LONG_MAX]\n")) + return 0; + + /* + * Always calculate and emit the mask even if the compiler + * thinks the mask is not needed. The compiler does not take + * into account the value of @index under speculation. + */ + OPTIMIZER_HIDE_VAR(index); + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); +} +#endif + +/* + * array_index_nospec - sanitize an array index after a bounds check + * + * For a code sequence like: + * + * if (index < size) { + * index = array_index_nospec(index, size); + * val = array[index]; + * } + * + * ...if the CPU speculates past the bounds check then + * array_index_nospec() will clamp the index within the range of [0, + * size). + */ +#define array_index_nospec(index, size) \ +({ \ + typeof(index) _i = (index); \ + typeof(size) _s = (size); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ + \ + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ + \ + _i &= _mask; \ + _i; \ +}) +#endif /* _LINUX_NOSPEC_H */ From 8c33e2d23a6821cb7d608011c3d2f54accf4212c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:28 -0800 Subject: [PATCH 212/247] x86: Implement array_index_mask_nospec (cherry picked from commit babdde2698d482b6c0de1eab4f697cf5856c5859) array_index_nospec() uses a mask to sanitize user controllable array indexes, i.e. generate a 0 mask if 'index' >= 'size', and a ~0 mask otherwise. While the default array_index_mask_nospec() handles the carry-bit from the (index - size) result in software. The x86 array_index_mask_nospec() does the same, but the carry-bit is handled in the processor CF flag without conditional instructions in the control flow. Suggested-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727414808.33451.1873237130672785331.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/barrier.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index bfb28caf97b1b..ca22173e42dc3 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -23,6 +23,30 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif +/** + * array_index_mask_nospec() - generate a mask that is ~0UL when the + * bounds check succeeds and 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * Returns: + * 0 - (index < size) + */ +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"r"(size),"r" (index) + :"cc"); + return mask; +} + +/* Override the default implementation from linux/nospec.h. */ +#define array_index_mask_nospec array_index_mask_nospec + #ifdef CONFIG_X86_PPRO_FENCE #define dma_rmb() rmb() #else From 1f03d140e2f509ae27f21d229930e707436a07ac Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:33 -0800 Subject: [PATCH 213/247] x86: Introduce barrier_nospec (cherry picked from commit b3d7ad85b80bbc404635dca80f5b129f6242bc7a) Rename the open coded form of this instruction sequence from rdtsc_ordered() into a generic barrier primitive, barrier_nospec(). One of the mitigations for Spectre variant1 vulnerabilities is to fence speculative execution after successfully validating a bounds check. I.e. force the result of a bounds check to resolve in the instruction pipeline to ensure speculative execution honors that result before potentially operating on out-of-bounds data. No functional changes. Suggested-by: Linus Torvalds Suggested-by: Andi Kleen Suggested-by: Ingo Molnar Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727415361.33451.9049453007262764675.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/barrier.h | 4 ++++ arch/x86/include/asm/msr.h | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index ca22173e42dc3..857590390397a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -47,6 +47,10 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, /* Override the default implementation from linux/nospec.h. */ #define array_index_mask_nospec array_index_mask_nospec +/* Prevent speculative execution past this barrier. */ +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ + "lfence", X86_FEATURE_LFENCE_RDTSC) + #ifdef CONFIG_X86_PPRO_FENCE #define dma_rmb() rmb() #else diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index b5fee97813cdf..ed35b915b5c97 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -188,8 +188,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) * that some other imaginary CPU is updating continuously with a * time stamp. */ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); + barrier_nospec(); return rdtsc(); } From e06d7bfb223e6babd7a7f8586e9c25b87272b841 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:39 -0800 Subject: [PATCH 214/247] x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec (cherry picked from commit b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd) For __get_user() paths, do not allow the kernel to speculate on the value of a user controlled pointer. In addition to the 'stac' instruction for Supervisor Mode Access Protection (SMAP), a barrier_nospec() causes the access_ok() result to resolve in the pipeline before the CPU might take any speculative action on the pointer value. Given the cost of 'stac' the speculation barrier is placed after 'stac' to hopefully overlap the cost of disabling SMAP with the cost of flushing the instruction pipeline. Since __get_user is a major kernel interface that deals with user controlled pointers, the __uaccess_begin_nospec() mechanism will prevent speculative execution past an access_ok() permission check. While speculative execution past access_ok() is not enough to lead to a kernel memory leak, it is a necessary precondition. To be clear, __uaccess_begin_nospec() is addressing a class of potential problems near __get_user() usages. Note, that while the barrier_nospec() in __uaccess_begin_nospec() is used to protect __get_user(), pointer masking similar to array_index_nospec() will be used for get_user() since it incorporates a bounds check near the usage. uaccess_try_nospec provides the same mechanism for get_user_try. No functional changes. Suggested-by: Linus Torvalds Suggested-by: Andi Kleen Suggested-by: Ingo Molnar Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727415922.33451.5796614273104346583.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/uaccess.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index dead0f3921f33..43148457d2556 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -123,6 +123,11 @@ extern int __get_user_bad(void); #define __uaccess_begin() stac() #define __uaccess_end() clac() +#define __uaccess_begin_nospec() \ +({ \ + stac(); \ + barrier_nospec(); \ +}) /* * This is a type: either unsigned long, if the argument fits into @@ -474,6 +479,10 @@ struct __large_struct { unsigned long buf[100]; }; __uaccess_begin(); \ barrier(); +#define uaccess_try_nospec do { \ + current->thread.uaccess_err = 0; \ + __uaccess_begin_nospec(); \ + #define uaccess_catch(err) \ __uaccess_end(); \ (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \ From ae75f83e79e4b2de7981d34ddcceb54178098bea Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:44 -0800 Subject: [PATCH 215/247] x86/usercopy: Replace open coded stac/clac with __uaccess_{begin, end} (cherry picked from commit b5c4ae4f35325d520b230bab6eb3310613b72ac1) In preparation for converting some __uaccess_begin() instances to __uacess_begin_nospec(), make sure all 'from user' uaccess paths are using the _begin(), _end() helpers rather than open-coded stac() and clac(). No functional changes. Suggested-by: Ingo Molnar Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727416438.33451.17309465232057176966.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/usercopy_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 3bc7baf2a711f..9b5fa0f6964e6 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -570,12 +570,12 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { - stac(); + __uaccess_begin(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else n = __copy_user_intel(to, from, n); - clac(); + __uaccess_end(); return n; } EXPORT_SYMBOL(__copy_to_user_ll); @@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { - stac(); + __uaccess_begin(); #ifdef CONFIG_X86_INTEL_USERCOPY if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_intel_nocache(to, from, n); @@ -636,7 +636,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr #else __copy_user(to, from, n); #endif - clac(); + __uaccess_end(); return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); From 065eae4be83d8d2bea325c11610ebaf836e1cebb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:49 -0800 Subject: [PATCH 216/247] x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec (cherry picked from commit 304ec1b050310548db33063e567123fae8fd0301) Quoting Linus: I do think that it would be a good idea to very expressly document the fact that it's not that the user access itself is unsafe. I do agree that things like "get_user()" want to be protected, but not because of any direct bugs or problems with get_user() and friends, but simply because get_user() is an excellent source of a pointer that is obviously controlled from a potentially attacking user space. So it's a prime candidate for then finding _subsequent_ accesses that can then be used to perturb the cache. __uaccess_begin_nospec() covers __get_user() and copy_from_iter() where the limit check is far away from the user pointer de-reference. In those cases a barrier_nospec() prevents speculation with a potential pointer to privileged memory. uaccess_try_nospec covers get_user_try. Suggested-by: Linus Torvalds Suggested-by: Andi Kleen Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727416953.33451.10508284228526170604.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/uaccess.h | 6 +++--- arch/x86/include/asm/uaccess_32.h | 12 ++++++------ arch/x86/include/asm/uaccess_64.h | 12 ++++++------ arch/x86/lib/usercopy_32.c | 4 ++-- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 43148457d2556..a8d85a687cf46 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -437,7 +437,7 @@ do { \ ({ \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ - __uaccess_begin(); \ + __uaccess_begin_nospec(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ @@ -547,7 +547,7 @@ struct __large_struct { unsigned long buf[100]; }; * get_user_ex(...); * } get_user_catch(err) */ -#define get_user_try uaccess_try +#define get_user_try uaccess_try_nospec #define get_user_catch(err) uaccess_catch(err) #define get_user_ex(x, ptr) do { \ @@ -582,7 +582,7 @@ extern void __cmpxchg_wrong_size(void) __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ - __uaccess_begin(); \ + __uaccess_begin_nospec(); \ switch (size) { \ case 1: \ { \ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 7d3bdd1ed6977..d6d245088dd56 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -102,17 +102,17 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) switch (n) { case 1: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_size(*(u8 *)to, from, 1, ret, 1); __uaccess_end(); return ret; case 2: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_size(*(u16 *)to, from, 2, ret, 2); __uaccess_end(); return ret; case 4: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_size(*(u32 *)to, from, 4, ret, 4); __uaccess_end(); return ret; @@ -130,17 +130,17 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to, switch (n) { case 1: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_size(*(u8 *)to, from, 1, ret, 1); __uaccess_end(); return ret; case 2: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_size(*(u16 *)to, from, 2, ret, 2); __uaccess_end(); return ret; case 4: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_size(*(u32 *)to, from, 4, ret, 4); __uaccess_end(); return ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 673059a109fee..6e5cc08134bab 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -59,31 +59,31 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) return copy_user_generic(dst, (__force void *)src, size); switch (size) { case 1: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_asm(*(u8 *)dst, (u8 __user *)src, ret, "b", "b", "=q", 1); __uaccess_end(); return ret; case 2: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_asm(*(u16 *)dst, (u16 __user *)src, ret, "w", "w", "=r", 2); __uaccess_end(); return ret; case 4: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_asm(*(u32 *)dst, (u32 __user *)src, ret, "l", "k", "=r", 4); __uaccess_end(); return ret; case 8: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 8); __uaccess_end(); return ret; case 10: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 10); if (likely(!ret)) @@ -93,7 +93,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) __uaccess_end(); return ret; case 16: - __uaccess_begin(); + __uaccess_begin_nospec(); __get_user_asm(*(u64 *)dst, (u64 __user *)src, ret, "q", "", "=r", 16); if (likely(!ret)) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9b5fa0f6964e6..5c06dbffc52f3 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -570,7 +570,7 @@ do { \ unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) { - __uaccess_begin(); + __uaccess_begin_nospec(); if (movsl_is_ok(to, from, n)) __copy_user(to, from, n); else @@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache); unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { - __uaccess_begin(); + __uaccess_begin_nospec(); #ifdef CONFIG_X86_INTEL_USERCOPY if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_intel_nocache(to, from, n); From 398a39311c0b67ff1d886f9861ae5251f8a7cad4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:54 -0800 Subject: [PATCH 217/247] x86/get_user: Use pointer masking to limit speculation (cherry picked from commit c7f631cb07e7da06ac1d231ca178452339e32a94) Quoting Linus: I do think that it would be a good idea to very expressly document the fact that it's not that the user access itself is unsafe. I do agree that things like "get_user()" want to be protected, but not because of any direct bugs or problems with get_user() and friends, but simply because get_user() is an excellent source of a pointer that is obviously controlled from a potentially attacking user space. So it's a prime candidate for then finding _subsequent_ accesses that can then be used to perturb the cache. Unlike the __get_user() case get_user() includes the address limit check near the pointer de-reference. With that locality the speculation can be mitigated with pointer narrowing rather than a barrier, i.e. array_index_nospec(). Where the narrowing is performed by: cmp %limit, %ptr sbb %mask, %mask and %mask, %ptr With respect to speculation the value of %ptr is either less than %limit or NULL. Co-developed-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: Andy Lutomirski Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727417469.33451.11804043010080838495.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/getuser.S | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 37b62d4121481..b12b214713a60 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -39,6 +39,8 @@ ENTRY(__get_user_1) mov PER_CPU_VAR(current_task), %_ASM_DX cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 1: movzbl (%_ASM_AX),%edx xor %eax,%eax @@ -53,6 +55,8 @@ ENTRY(__get_user_2) mov PER_CPU_VAR(current_task), %_ASM_DX cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 2: movzwl -1(%_ASM_AX),%edx xor %eax,%eax @@ -67,6 +71,8 @@ ENTRY(__get_user_4) mov PER_CPU_VAR(current_task), %_ASM_DX cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 3: movl -3(%_ASM_AX),%edx xor %eax,%eax @@ -82,6 +88,8 @@ ENTRY(__get_user_8) mov PER_CPU_VAR(current_task), %_ASM_DX cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 4: movq -7(%_ASM_AX),%rdx xor %eax,%eax @@ -93,6 +101,8 @@ ENTRY(__get_user_8) mov PER_CPU_VAR(current_task), %_ASM_DX cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user_8 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 4: movl -7(%_ASM_AX),%edx 5: movl -3(%_ASM_AX),%ecx From c3193fd49f6f0a9b378379f8fd3a95f01172d477 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:02:59 -0800 Subject: [PATCH 218/247] x86/syscall: Sanitize syscall table de-references under speculation (cherry picked from commit 2fbd7af5af8665d18bcefae3e9700be07e22b681) The syscall table base is a user controlled function pointer in kernel space. Use array_index_nospec() to prevent any out of bounds speculation. While retpoline prevents speculating into a userspace directed target it does not stop the pointer de-reference, the concern is leaking memory relative to the syscall table base, by observing instruction cache behavior. Reported-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Andy Lutomirski Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727417984.33451.1216731042505722161.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bd1d1027384f9..b0cd306dc527e 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -277,7 +278,8 @@ __visible void do_syscall_64(struct pt_regs *regs) * regs->orig_ax, which changes the behavior of some syscalls. */ if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { - regs->ax = sys_call_table[nr & __SYSCALL_MASK]( + nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls); + regs->ax = sys_call_table[nr]( regs->di, regs->si, regs->dx, regs->r10, regs->r8, regs->r9); } @@ -313,6 +315,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) } if (likely(nr < IA32_NR_syscalls)) { + nr = array_index_nospec(nr, IA32_NR_syscalls); /* * It's possible that a 32-bit syscall implementation * takes a 64-bit parameter but nonetheless assumes that From c26ceec69576cb61157d2487812fb2776e125260 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:03:05 -0800 Subject: [PATCH 219/247] vfs, fdtable: Prevent bounds-check bypass via speculative execution (cherry picked from commit 56c30ba7b348b90484969054d561f711ba196507) 'fd' is a user controlled value that is used as a data dependency to read from the 'fdt->fd' array. In order to avoid potential leaks of kernel memory values, block speculative execution of the instruction stream that could issue reads based on an invalid 'file *' returned from __fcheck_files. Co-developed-by: Elena Reshetova Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727418500.33451.17392199002892248656.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- include/linux/fdtable.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 6e84b2cae6ad6..442b54a14cbc0 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -81,8 +82,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i { struct fdtable *fdt = rcu_dereference_raw(files->fdt); - if (fd < fdt->max_fds) + if (fd < fdt->max_fds) { + fd = array_index_nospec(fd, fdt->max_fds); return rcu_dereference_raw(fdt->fd[fd]); + } return NULL; } From 0781a50a30d37230147954a3dd15bbfcebfc2398 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:03:15 -0800 Subject: [PATCH 220/247] nl80211: Sanitize array index in parse_txq_params (cherry picked from commit 259d8c1e984318497c84eef547bbb6b1d9f4eb05) Wireless drivers rely on parse_txq_params to validate that txq_params->ac is less than NL80211_NUM_ACS by the time the low-level driver's ->conf_tx() handler is called. Use a new helper, array_index_nospec(), to sanitize txq_params->ac with respect to speculation. I.e. ensure that any speculation into ->conf_tx() handlers is done with a value of txq_params->ac that is within the bounds of [0, NL80211_NUM_ACS). Reported-by: Christian Lamparter Reported-by: Elena Reshetova Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Acked-by: Johannes Berg Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: linux-wireless@vger.kernel.org Cc: torvalds@linux-foundation.org Cc: "David S. Miller" Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727419584.33451.7700736761686184303.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c626f679e1c87..91722e97cdd57 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -2014,20 +2015,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = { static int parse_txq_params(struct nlattr *tb[], struct ieee80211_txq_params *txq_params) { + u8 ac; + if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] || !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] || !tb[NL80211_TXQ_ATTR_AIFS]) return -EINVAL; - txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]); + ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]); txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]); txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]); txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]); txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]); - if (txq_params->ac >= NL80211_NUM_ACS) + if (ac >= NL80211_NUM_ACS) return -EINVAL; - + txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS); return 0; } From 359fde6bd0ec91c284641965e5c5c45a1fab6d51 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 Jan 2018 17:03:21 -0800 Subject: [PATCH 221/247] x86/spectre: Report get_user mitigation for spectre_v1 (cherry picked from commit edfbae53dab8348fca778531be9f4855d2ca0360) Reflect the presence of get_user(), __get_user(), and 'syscall' protections in sysfs. The expectation is that new and better tooling will allow the kernel to grow more usages of array_index_nospec(), for now, only claim mitigation for __user pointer de-references. Reported-by: Jiri Slaby Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727420158.33451.11658324346540434635.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 51624c6f2a3ba..d4658e02ff93f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -296,7 +296,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); } ssize_t cpu_show_spectre_v2(struct device *dev, From f67e05d1506a24f329589c27952d776f977161b7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 30 Jan 2018 19:32:18 +0000 Subject: [PATCH 222/247] x86/spectre: Fix spelling mistake: "vunerable"-> "vulnerable" (cherry picked from commit e698dcdfcda41efd0984de539767b4cddd235f1e) Trivial fix to spelling mistake in pr_err error message text. Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: kernel-janitors@vger.kernel.org Cc: Andy Lutomirski Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180130193218.9271-1-colin.king@canonical.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d4658e02ff93f..aec7dafc1f138 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -102,7 +102,7 @@ bool retpoline_module_ok(bool has_retpoline) if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) return true; - pr_err("System may be vunerable to spectre v2\n"); + pr_err("System may be vulnerable to spectre v2\n"); spectre_v2_bad_module = true; return false; } From cda6b6074cc6f94ba4cb69a6c8928c1cd19fe291 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 30 Jan 2018 14:30:23 +0000 Subject: [PATCH 223/247] x86/cpuid: Fix up "virtual" IBRS/IBPB/STIBP feature bits on Intel (cherry picked from commit 7fcae1118f5fd44a862aa5c3525248e35ee67c3b) Despite the fact that all the other code there seems to be doing it, just using set_cpu_cap() in early_intel_init() doesn't actually work. For CPUs with PKU support, setup_pku() calls get_cpu_cap() after c->c_init() has set those feature bits. That resets those bits back to what was queried from the hardware. Turning the bits off for bad microcode is easy to fix. That can just use setup_clear_cpu_cap() to force them off for all CPUs. I was less keen on forcing the feature bits *on* that way, just in case of inconsistencies. I appreciate that the kernel is going to get this utterly wrong if CPU features are not consistent, because it has already applied alternatives by the time secondary CPUs are brought up. But at least if setup_force_cpu_cap() isn't being used, we might have a chance of *detecting* the lack of the corresponding bit and either panicking or refusing to bring the offending CPU online. So ensure that the appropriate feature bits are set within get_cpu_cap() regardless of how many extra times it's called. Fixes: 2961298e ("x86/cpufeatures: Clean up Spectre v2 related CPUID flags") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: karahmed@amazon.de Cc: peterz@infradead.org Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1517322623-15261-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++ arch/x86/kernel/cpu/intel.c | 27 ++++++++------------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cfa026f323cff..60e537dac7188 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -718,6 +718,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) } } +static void init_speculation_control(struct cpuinfo_x86 *c) +{ + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + * + * We use the AMD bits in 0x8000_0008 EBX as the generic hardware + * features, which are visible in /proc/cpuinfo and used by the + * kernel. So set those accordingly from the Intel bits. + */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -812,6 +832,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + init_speculation_control(c); } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2e257f87c5275..4097b43cba2da 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -140,28 +140,17 @@ static void early_init_intel(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); } - /* - * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, - * and they also have a different bit for STIBP support. Also, - * a hypervisor might have set the individual AMD bits even on - * Intel CPUs, for finer-grained selection of what's available. - */ - if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { - set_cpu_cap(c, X86_FEATURE_IBRS); - set_cpu_cap(c, X86_FEATURE_IBPB); - } - if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) - set_cpu_cap(c, X86_FEATURE_STIBP); - /* Now if any of them are set, check the blacklist and clear the lot */ - if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_INTEL_STIBP) || + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); - clear_cpu_cap(c, X86_FEATURE_IBRS); - clear_cpu_cap(c, X86_FEATURE_IBPB); - clear_cpu_cap(c, X86_FEATURE_STIBP); - clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); - clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_IBRS); + setup_clear_cpu_cap(X86_FEATURE_IBPB); + setup_clear_cpu_cap(X86_FEATURE_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); } /* From 7552556f65af7db6b2bad828beb4c633cb7d8533 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 30 Jan 2018 22:13:33 -0600 Subject: [PATCH 224/247] x86/paravirt: Remove 'noreplace-paravirt' cmdline option (cherry picked from commit 12c69f1e94c89d40696e83804dd2f0965b5250cd) The 'noreplace-paravirt' option disables paravirt patching, leaving the original pv indirect calls in place. That's highly incompatible with retpolines, unless we want to uglify paravirt even further and convert the paravirt calls to retpolines. As far as I can tell, the option doesn't seem to be useful for much other than introducing surprising corner cases and making the kernel vulnerable to Spectre v2. It was probably a debug option from the early paravirt days. So just remove it. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Andrea Arcangeli Cc: Peter Zijlstra Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: Tim Chen Cc: Rusty Russell Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jason Baron Cc: Paolo Bonzini Cc: Alok Kataria Cc: Arjan Van De Ven Cc: David Woodhouse Cc: Dan Williams Link: https://lkml.kernel.org/r/20180131041333.2x6blhxirc2kclrq@treble Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 2 -- arch/x86/kernel/alternative.c | 14 -------------- 2 files changed, 16 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4c2667aa4634b..466c039c622b2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2805,8 +2805,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space - noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops - noreplace-smp [X86-32,SMP] Don't replace SMP instructions with UP alternatives diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 10d5a3d6affc4..03b6e5c6cf231 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#ifdef CONFIG_PARAVIRT -static int __initdata_or_module noreplace_paravirt = 0; - -static int __init setup_noreplace_paravirt(char *str) -{ - noreplace_paravirt = 1; - return 1; -} -__setup("noreplace-paravirt", setup_noreplace_paravirt); -#endif - #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ @@ -588,9 +577,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; - if (noreplace_paravirt) - return; - for (p = start; p < end; p++) { unsigned int used; From eb99bd6341cb3039d0b9d2b7f89d5f2ff98e9676 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 31 Jan 2018 17:47:03 -0800 Subject: [PATCH 225/247] x86/kvm: Update spectre-v1 mitigation (cherry picked from commit 085331dfc6bbe3501fb936e657331ca943827600) Commit 75f139aaf896 "KVM: x86: Add memory barrier on vmcs field lookup" added a raw 'asm("lfence");' to prevent a bounds check bypass of 'vmcs_field_to_offset_table'. The lfence can be avoided in this path by using the array_index_nospec() helper designed for these types of fixes. Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Andrew Honig Cc: kvm@vger.kernel.org Cc: Jim Mattson Link: https://lkml.kernel.org/r/151744959670.6342.3001723920950249067.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fcdcc88026779..feadff3121314 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -856,21 +857,18 @@ static const unsigned short vmcs_field_to_offset_table[] = { static inline short vmcs_field_to_offset(unsigned long field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); + unsigned short offset; - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + BUILD_BUG_ON(size > SHRT_MAX); + if (field >= size) return -ENOENT; - /* - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a - * generic mechanism. - */ - asm("lfence"); - - if (vmcs_field_to_offset_table[field] == 0) + field = array_index_nospec(field, size); + offset = vmcs_field_to_offset_table[field]; + if (offset == 0) return -ENOENT; - - return vmcs_field_to_offset_table[field]; + return offset; } static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) From fe4333893936daf7c02c3b8afc379bc600b5e286 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 1 Feb 2018 11:27:20 +0000 Subject: [PATCH 226/247] x86/retpoline: Avoid retpolines for built-in __init functions (cherry picked from commit 66f793099a636862a71c59d4a6ba91387b155e0c) There's no point in building init code with retpolines, since it runs before any potentially hostile userspace does. And before the retpoline is actually ALTERNATIVEd into place, for much of it. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: karahmed@amazon.de Cc: peterz@infradead.org Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1517484441-1420-2-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- include/linux/init.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/init.h b/include/linux/init.h index e30104ceb86dc..8e346d1bd8375 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -4,6 +4,13 @@ #include #include +/* Built-in __init functions needn't be compiled with retpoline */ +#if defined(RETPOLINE) && !defined(MODULE) +#define __noretpoline __attribute__((indirect_branch("keep"))) +#else +#define __noretpoline +#endif + /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) * as `initialization' functions. The kernel can take this @@ -39,7 +46,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace __latent_entropy +#define __init __section(.init.text) __cold notrace __latent_entropy __noretpoline #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) From 961cb14c615d13a823963585dafdc19ee9e9ba85 Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Thu, 1 Feb 2018 11:27:21 +0000 Subject: [PATCH 227/247] x86/spectre: Simplify spectre_v2 command line parsing (cherry picked from commit 9005c6834c0ffdfe46afa76656bd9276cca864f6) [dwmw2: Use ARRAY_SIZE] Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1517484441-1420-3-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 86 +++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index aec7dafc1f138..957ad443b7861 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -118,13 +118,13 @@ static inline const char *spectre_v2_module_string(void) { return ""; } static void __init spec2_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s\n", reason); + pr_info("%s selected on command line.\n", reason); } static void __init spec2_print_if_secure(const char *reason) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s\n", reason); + pr_info("%s selected on command line.\n", reason); } static inline bool retp_compiler(void) @@ -139,42 +139,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt) return len == arglen && !strncmp(arg, opt, len); } +static const struct { + const char *option; + enum spectre_v2_mitigation_cmd cmd; + bool secure; +} mitigation_options[] = { + { "off", SPECTRE_V2_CMD_NONE, false }, + { "on", SPECTRE_V2_CMD_FORCE, true }, + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, +}; + static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { char arg[20]; - int ret; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, - sizeof(arg)); - if (ret > 0) { - if (match_option(arg, ret, "off")) { - goto disable; - } else if (match_option(arg, ret, "on")) { - spec2_print_if_secure("force enabled on command line."); - return SPECTRE_V2_CMD_FORCE; - } else if (match_option(arg, ret, "retpoline")) { - spec2_print_if_insecure("retpoline selected on command line."); - return SPECTRE_V2_CMD_RETPOLINE; - } else if (match_option(arg, ret, "retpoline,amd")) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec2_print_if_insecure("AMD retpoline selected on command line."); - return SPECTRE_V2_CMD_RETPOLINE_AMD; - } else if (match_option(arg, ret, "retpoline,generic")) { - spec2_print_if_insecure("generic retpoline selected on command line."); - return SPECTRE_V2_CMD_RETPOLINE_GENERIC; - } else if (match_option(arg, ret, "auto")) { + int ret, i; + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_NONE; + else { + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { + if (!match_option(arg, ret, mitigation_options[i].option)) + continue; + cmd = mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } } - if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if ((cmd == SPECTRE_V2_CMD_RETPOLINE || + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + !IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; -disable: - spec2_print_if_insecure("disabled on command line."); - return SPECTRE_V2_CMD_NONE; + } + + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + + if (mitigation_options[i].secure) + spec2_print_if_secure(mitigation_options[i].option); + else + spec2_print_if_insecure(mitigation_options[i].option); + + return cmd; } /* Check for Skylake-like CPUs (for RSB handling) */ From 4b234a253e52b1ae98449d390cd00f07b96ca7e7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Feb 2018 22:39:23 +0100 Subject: [PATCH 228/247] x86/pti: Mark constant arrays as __initconst (cherry picked from commit 4bf5d56d429cbc96c23d809a08f63cd29e1a702e) I'm seeing build failures from the two newly introduced arrays that are marked 'const' and '__initdata', which are mutually exclusive: arch/x86/kernel/cpu/common.c:882:43: error: 'cpu_no_speculation' causes a section type conflict with 'e820_table_firmware_init' arch/x86/kernel/cpu/common.c:895:43: error: 'cpu_no_meltdown' causes a section type conflict with 'e820_table_firmware_init' The correct annotation is __initconst. Fixes: fec9434a12f3 ("x86/pti: Do not enable PTI on CPUs which are not vulnerable to Meltdown") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Ricardo Neri Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Thomas Garnier Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180202213959.611210-1-arnd@arndb.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 60e537dac7188..08e89ed6aa87b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -861,7 +861,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -static const __initdata struct x86_cpu_id cpu_no_speculation[] = { +static const __initconst struct x86_cpu_id cpu_no_speculation[] = { { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, @@ -874,7 +874,7 @@ static const __initdata struct x86_cpu_id cpu_no_speculation[] = { {} }; -static const __initdata struct x86_cpu_id cpu_no_meltdown[] = { +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { { X86_VENDOR_AMD }, {} }; From 34900390e96663cfe5c23e254baaf49664be8836 Mon Sep 17 00:00:00 2001 From: Darren Kenny Date: Fri, 2 Feb 2018 19:12:20 +0000 Subject: [PATCH 229/247] x86/speculation: Fix typo IBRS_ATT, which should be IBRS_ALL (cherry picked from commit af189c95a371b59f493dbe0f50c0a09724868881) Fixes: 117cc7a908c83 ("x86/retpoline: Fill return stack buffer on vmexit") Signed-off-by: Darren Kenny Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Cc: Tom Lendacky Cc: Andi Kleen Cc: Borislav Petkov Cc: Masami Hiramatsu Cc: Arjan van de Ven Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180202191220.blvgkgutojecxr3b@starbug-vm.ie.oracle.com Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index df4ececa6ebfb..300cc159b4a0a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -150,7 +150,7 @@ extern char __indirect_thunk_end[]; * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both * retpoline and IBRS mitigations for Spectre v2 need this; only on future - * CPUs with IBRS_ATT *might* it be avoided. + * CPUs with IBRS_ALL *might* it be avoided. */ static inline void vmexit_fill_RSB(void) { From 19b1d4bdfe5ce543e54292fb1a45c7300398e6aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:57 +0100 Subject: [PATCH 230/247] KVM: nVMX: kmap() can't fail commit 42cf014d38d8822cce63703a467e00f65d000952 upstream. kmap() can't fail, therefore it will always return a valid pointer. Let's just get rid of the unnecessary checks. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index feadff3121314..0726a66c07ab8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4756,10 +4756,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) return 0; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -9584,11 +9580,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, if (!page) return false; msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); From 1edccf20b9d82d318f0003ad67b8afed299ae93e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Jan 2017 11:58:58 +0100 Subject: [PATCH 231/247] KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail (cherry picked from commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f) vmx_complete_nested_posted_interrupt() can't fail, let's turn it into a void function. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0726a66c07ab8..91d17ab53e2b5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4736,7 +4736,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4747,13 +4747,13 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); @@ -4766,7 +4766,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -10482,7 +10481,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) From b7649e1776706c28d837d423412c1763900e521e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 1 Aug 2017 14:00:40 -0700 Subject: [PATCH 232/247] KVM: nVMX: mark vmcs12 pages dirty on L2 exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit c9f04407f2e0b3fc9ff7913c65fcfcb0a4b61570) The host physical addresses of L1's Virtual APIC Page and Posted Interrupt descriptor are loaded into the VMCS02. The CPU may write to these pages via their host physical address while L2 is running, bypassing address-translation-based dirty tracking (e.g. EPT write protection). Mark them dirty on every exit from L2 to prevent them from getting out of sync with dirty tracking. Also mark the virtual APIC page and the posted interrupt descriptor dirty when KVM is virtualizing posted interrupt processing. Signed-off-by: David Matlack Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 53 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 91d17ab53e2b5..e91deda4864ec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4736,6 +4736,28 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4743,18 +4765,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) void *vapic_page; u16 status; - if (vmx->nested.pi_desc && - vmx->nested.pi_pending) { - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit( - (unsigned long *)vmx->nested.pi_desc->pir, 256); + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + return; - if (max_irr == 256) - return; + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return; + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4766,6 +4785,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } + + nested_mark_vmcs12_pages_dirty(vcpu); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -8022,6 +8043,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); + /* + * The host physical addresses of some pages of guest memory + * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU + * may write to these pages via their host physical address while + * L2 is running, bypassing any address-translation-based dirty + * tracking (e.g. EPT write protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + if (vmx->nested.nested_run_pending) return false; From 46e24dfc2dfe0062a77538698c84f13b24ce9b2c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 27 Nov 2017 17:22:25 -0600 Subject: [PATCH 233/247] KVM: nVMX: Eliminate vmcs02 pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit de3a0021a60635de96aa92713c1a31a96747d72c) The potential performance advantages of a vmcs02 pool have never been realized. To simplify the code, eliminate the pool. Instead, a single vmcs02 is allocated per VCPU when the VCPU enters VMX operation. Cc: stable@vger.kernel.org # prereq for Spectre mitigation Signed-off-by: Jim Mattson Signed-off-by: Mark Kanda Reviewed-by: Ameya More Reviewed-by: David Hildenbrand Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 146 +++++++-------------------------------------- 1 file changed, 23 insertions(+), 123 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e91deda4864ec..44e40952c7fc5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -174,7 +174,6 @@ module_param(ple_window_max, int, S_IRUGO); extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 struct vmcs { u32 revision_id; @@ -208,7 +207,7 @@ struct shared_msr_entry { * stored in guest memory specified by VMPTRLD, but is opaque to the guest, * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the * underlying hardware which will be used to run L2. * This structure is packed to ensure that its layout is identical across * machines (necessary for live migration). @@ -387,13 +386,6 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -420,15 +412,15 @@ struct nested_vmx { */ bool sync_shadow_vmcs; - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; + + struct loaded_vmcs vmcs02; + /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. */ struct page *apic_access_page; struct page *virtual_apic_page; @@ -6677,94 +6669,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } -/* - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. - * We could reuse a single VMCS for all the L2 guests, but we also want the - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this - * allows keeping them loaded on the processor, and in the future will allow - * optimizations where prepare_vmcs02 doesn't need to set all the fields on - * every entry if they never change. - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. - * - * The following functions allocate and free a vmcs02 in this pool. - */ - -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmx->nested.current_vmptr) { - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { - /* Recycle the least recently used VMCS. */ - item = list_last_entry(&vmx->nested.vmcs02_pool, - struct vmcs02_list, list); - item->vmptr = vmx->nested.current_vmptr; - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - /* Create a new VMCS */ - item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); - if (!item) - return NULL; - item->vmcs02.vmcs = alloc_vmcs(); - item->vmcs02.shadow_vmcs = NULL; - if (!item->vmcs02.vmcs) { - kfree(item); - return NULL; - } - loaded_vmcs_init(&item->vmcs02); - item->vmptr = vmx->nested.current_vmptr; - list_add(&(item->list), &(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num++; - return &item->vmcs02; -} - -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmptr) { - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - return; - } -} - -/* - * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs - * must be &vmx->vmcs01. - */ -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item, *n; - - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - /* - * Something will leak if the above WARN triggers. Better than - * a use-after-free. - */ - if (vmx->loaded_vmcs == &item->vmcs02) - continue; - - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - } -} - /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * set the success or error code of an emulated VMX instruction, as specified @@ -7078,6 +6982,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + vmx->nested.vmcs02.vmcs = alloc_vmcs(); + vmx->nested.vmcs02.shadow_vmcs = NULL; + if (!vmx->nested.vmcs02.vmcs) + goto out_vmcs02; + loaded_vmcs_init(&vmx->nested.vmcs02); + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); @@ -7100,9 +7010,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->vmcs01.shadow_vmcs = shadow_vmcs; } - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; @@ -7120,6 +7027,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) free_page((unsigned long)vmx->nested.msr_bitmap); out_msr_bitmap: + free_loaded_vmcs(&vmx->nested.vmcs02); + +out_vmcs02: return -ENOMEM; } @@ -7205,7 +7115,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); - /* Unpin physical memory we referred to in current vmcs02 */ + /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; @@ -7221,7 +7131,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } - nested_free_all_saved_vmcss(vmx); + free_loaded_vmcs(&vmx->nested.vmcs02); } /* Emulate the VMXOFF instruction */ @@ -7255,8 +7165,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - nested_free_vmcs02(vmx, vmptr); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; @@ -8045,10 +7953,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) /* * The host physical addresses of some pages of guest memory - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU - * may write to these pages via their host physical address while - * L2 is running, bypassing any address-translation-based dirty - * tracking (e.g. EPT write protection). + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. @@ -10212,7 +10121,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; - struct loaded_vmcs *vmcs02; bool ia32e; u32 msr_entry_idx; @@ -10352,17 +10260,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; + vmx->loaded_vmcs = &vmx->nested.vmcs02; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; @@ -10877,10 +10781,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); - /* if no vmcs02 cache requested, remove the one we used */ - if (VMCS02_POOL_SIZE == 0) - nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ From ff546f9d83d320bc81b72bd2ccd4ebae45d3d714 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jan 2018 12:16:15 +0100 Subject: [PATCH 234/247] KVM: VMX: introduce alloc_loaded_vmcs (cherry picked from commit f21f165ef922c2146cc5bdc620f542953c41714b) Group together the calls to alloc_vmcs and loaded_vmcs_init. Soon we'll also allocate an MSR bitmap there. Cc: stable@vger.kernel.org # prereq for Spectre mitigation Signed-off-by: Paolo Bonzini Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 44e40952c7fc5..da3cc79755563 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3522,11 +3522,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) return vmcs; } -static struct vmcs *alloc_vmcs(void) -{ - return alloc_vmcs_cpu(raw_smp_processor_id()); -} - static void free_vmcs(struct vmcs *vmcs) { free_pages((unsigned long)vmcs, vmcs_config.order); @@ -3545,6 +3540,22 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } +static struct vmcs *alloc_vmcs(void) +{ + return alloc_vmcs_cpu(raw_smp_processor_id()); +} + +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + loaded_vmcs->vmcs = alloc_vmcs(); + if (!loaded_vmcs->vmcs) + return -ENOMEM; + + loaded_vmcs->shadow_vmcs = NULL; + loaded_vmcs_init(loaded_vmcs); + return 0; +} + static void free_kvm_area(void) { int cpu; @@ -6943,6 +6954,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) struct vmcs *shadow_vmcs; const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + int r; /* The Intel VMX Instruction Reference lists a bunch of bits that * are prerequisite to running VMXON, most notably cr4.VMXE must be @@ -6982,11 +6994,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - vmx->nested.vmcs02.vmcs = alloc_vmcs(); - vmx->nested.vmcs02.shadow_vmcs = NULL; - if (!vmx->nested.vmcs02.vmcs) + r = alloc_loaded_vmcs(&vmx->nested.vmcs02); + if (r < 0) goto out_vmcs02; - loaded_vmcs_init(&vmx->nested.vmcs02); if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = @@ -9107,17 +9117,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->guest_msrs) goto free_pml; - vmx->loaded_vmcs = &vmx->vmcs01; - vmx->loaded_vmcs->vmcs = alloc_vmcs(); - vmx->loaded_vmcs->shadow_vmcs = NULL; - if (!vmx->loaded_vmcs->vmcs) - goto free_msrs; if (!vmm_exclusive) kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); - loaded_vmcs_init(vmx->loaded_vmcs); + err = alloc_loaded_vmcs(&vmx->vmcs01); if (!vmm_exclusive) kvm_cpu_vmxoff(); + if (err < 0) + goto free_msrs; + vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; From 6236b782eba37a028972bdfd654773ff2e283a22 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 16 Jan 2018 16:51:18 +0100 Subject: [PATCH 235/247] KVM: VMX: make MSR bitmaps per-VCPU (cherry picked from commit 904e14fb7cb96401a7dc803ca2863fd5ba32ffe6) Place the MSR bitmap in struct loaded_vmcs, and update it in place every time the x2apic or APICv state can change. This is rare and the loop can handle 64 MSRs per iteration, in a similar fashion as nested_vmx_prepare_msr_bitmap. This prepares for choosing, on a per-VM basis, whether to intercept the SPEC_CTRL and PRED_CMD MSRs. Cc: stable@vger.kernel.org # prereq for Spectre mitigation Suggested-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 314 ++++++++++++++++----------------------------- 1 file changed, 114 insertions(+), 200 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index da3cc79755563..d772e88ae8f30 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -110,6 +110,14 @@ static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +#define MSR_TYPE_RW 3 + +#define MSR_BITMAP_MODE_X2APIC 1 +#define MSR_BITMAP_MODE_X2APIC_APICV 2 +#define MSR_BITMAP_MODE_LM 4 + #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ @@ -191,6 +199,7 @@ struct loaded_vmcs { struct vmcs *shadow_vmcs; int cpu; int launched; + unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; }; @@ -429,8 +438,6 @@ struct nested_vmx { bool pi_pending; u16 posted_intr_nv; - unsigned long *msr_bitmap; - struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -531,6 +538,7 @@ struct vcpu_vmx { unsigned long host_rsp; u8 fail; bool nmi_known_unmasked; + u8 msr_bitmap_mode; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -902,6 +910,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -921,12 +930,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -2520,36 +2523,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) vmx->guest_msrs[from] = tmp; } -static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) -{ - unsigned long *msr_bitmap; - - if (is_guest_mode(vcpu)) - msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; - else if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; - } else { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; - } - } else { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - } - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); -} - /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy @@ -2590,7 +2563,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->save_nmsrs = save_nmsrs; if (cpu_has_vmx_msr_bitmap()) - vmx_set_msr_bitmap(&vmx->vcpu); + vmx_update_msr_bitmap(&vmx->vcpu); } /* @@ -3537,6 +3510,8 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; + if (loaded_vmcs->msr_bitmap) + free_page((unsigned long)loaded_vmcs->msr_bitmap); WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } @@ -3553,7 +3528,18 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs->shadow_vmcs = NULL; loaded_vmcs_init(loaded_vmcs); + + if (cpu_has_vmx_msr_bitmap()) { + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!loaded_vmcs->msr_bitmap) + goto out_vmcs; + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); + } return 0; + +out_vmcs: + free_loaded_vmcs(loaded_vmcs); + return -ENOMEM; } static void free_kvm_area(void) @@ -4562,10 +4548,8 @@ static void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -4599,8 +4583,8 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) +static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -4634,6 +4618,15 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, } } +static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type, bool value) +{ + if (value) + vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + else + vmx_disable_intercept_for_msr(msr_bitmap, msr, type); +} + /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4680,58 +4673,68 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, } } -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) { - if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, - msr, MSR_TYPE_R | MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, - msr, MSR_TYPE_R | MSR_TYPE_W); -} + u8 mode = 0; -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); + if (cpu_has_secondary_exec_ctrls() && + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode |= MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; } + + if (is_long_mode(vcpu)) + mode |= MSR_BITMAP_MODE_LM; + + return mode; } -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, + u8 mode) { - if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); + int msr; + + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; + msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + } + + if (mode & MSR_BITMAP_MODE_X2APIC) { + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + } } } -static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { - if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - } + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + u8 mode = vmx_msr_bitmap_mode(vcpu); + u8 changed = mode ^ vmx->msr_bitmap_mode; + + if (!changed) + return; + + vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_LM)); + + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + + vmx->msr_bitmap_mode = mode; } static bool vmx_get_enable_apicv(void) @@ -4976,7 +4979,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } if (cpu_has_vmx_msr_bitmap()) - vmx_set_msr_bitmap(vcpu); + vmx_update_msr_bitmap(vcpu); } static u32 vmx_exec_control(struct vcpu_vmx *vmx) @@ -5065,7 +5068,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); } if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ @@ -6396,7 +6399,7 @@ static void wakeup_handler(void) static __init int hardware_setup(void) { - int r = -ENOMEM, i, msr; + int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -6411,41 +6414,13 @@ static __init int hardware_setup(void) if (!vmx_io_bitmap_b) goto out; - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) - goto out3; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out4; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out5; - - vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) - goto out6; - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out7; + goto out1; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out8; + goto out2; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6454,12 +6429,9 @@ static __init int hardware_setup(void) memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out9; + goto out3; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6516,47 +6488,8 @@ static __init int hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 48; } - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - - memcpy(vmx_msr_bitmap_legacy_x2apic, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, - vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - vmx_msr_bitmap_longmode, PAGE_SIZE); - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - /* - * enable_apicv && kvm_vcpu_apicv_active() - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr, true); - - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839, true); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808, true); - /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b, true); - /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f, true); - - /* - * (enable_apicv && !kvm_vcpu_apicv_active()) || - * !enable_apicv - */ - /* TPR */ - vmx_disable_intercept_msr_read_x2apic(0x808, false); - vmx_disable_intercept_msr_write_x2apic(0x808, false); - if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, @@ -6602,22 +6535,10 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out9: - free_page((unsigned long)vmx_vmwrite_bitmap); -out8: - free_page((unsigned long)vmx_vmread_bitmap); -out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); -out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_vmwrite_bitmap); out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); + free_page((unsigned long)vmx_vmread_bitmap); out1: free_page((unsigned long)vmx_io_bitmap_b); out: @@ -6628,12 +6549,6 @@ static __init int hardware_setup(void) static __exit void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); @@ -6998,13 +6913,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - if (cpu_has_vmx_msr_bitmap()) { - vmx->nested.msr_bitmap = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx->nested.msr_bitmap) - goto out_msr_bitmap; - } - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; @@ -7034,9 +6942,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: - free_page((unsigned long)vmx->nested.msr_bitmap); - -out_msr_bitmap: free_loaded_vmcs(&vmx->nested.vmcs02); out_vmcs02: @@ -7115,10 +7020,6 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); - if (vmx->nested.msr_bitmap) { - free_page((unsigned long)vmx->nested.msr_bitmap); - vmx->nested.msr_bitmap = NULL; - } if (enable_shadow_vmcs) { vmcs_clear(vmx->vmcs01.shadow_vmcs); free_vmcs(vmx->vmcs01.shadow_vmcs); @@ -8465,7 +8366,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) } vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); - vmx_set_msr_bitmap(vcpu); + vmx_update_msr_bitmap(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) @@ -9085,6 +8986,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + unsigned long *msr_bitmap; int cpu; if (!vmx) @@ -9125,6 +9027,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err < 0) goto free_msrs; + msr_bitmap = vmx->vmcs01.msr_bitmap; + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx->msr_bitmap_mode = 0; + vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -9519,7 +9430,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, int msr; struct page *page; unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) @@ -10034,6 +9945,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); + if (enable_vpid) { /* * There is no direct mapping between vpid02 and vpid12, the @@ -10738,7 +10652,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_DEBUGCTL, 0); if (cpu_has_vmx_msr_bitmap()) - vmx_set_msr_bitmap(vcpu); + vmx_update_msr_bitmap(vcpu); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) From 7013129a4034ea2168a0ccb32d7ddfefe9123333 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Thu, 1 Feb 2018 22:59:43 +0100 Subject: [PATCH 236/247] KVM/x86: Add IBPB support (cherry picked from commit 15d45071523d89b3fb7372e2135fbd72f6af9506) The Indirect Branch Predictor Barrier (IBPB) is an indirect branch control mechanism. It keeps earlier branches from influencing later ones. Unlike IBRS and STIBP, IBPB does not define a new mode of operation. It's a command that ensures predicted branch targets aren't used after the barrier. Although IBRS and IBPB are enumerated by the same CPUID enumeration, IBPB is very different. IBPB helps mitigate against three potential attacks: * Mitigate guests from being attacked by other guests. - This is addressed by issing IBPB when we do a guest switch. * Mitigate attacks from guest/ring3->host/ring3. These would require a IBPB during context switch in host, or after VMEXIT. The host process has two ways to mitigate - Either it can be compiled with retpoline - If its going through context switch, and has set !dumpable then there is a IBPB in that path. (Tim's patch: https://patchwork.kernel.org/patch/10192871) - The case where after a VMEXIT you return back to Qemu might make Qemu attackable from guest when Qemu isn't compiled with retpoline. There are issues reported when doing IBPB on every VMEXIT that resulted in some tsc calibration woes in guest. * Mitigate guest/ring0->host/ring0 attacks. When host kernel is using retpoline it is safe against these attacks. If host kernel isn't using retpoline we might need to do a IBPB flush on every VMEXIT. Even when using retpoline for indirect calls, in certain conditions 'ret' can use the BTB on Skylake-era CPUs. There are other mitigations available like RSB stuffing/clearing. * IBPB is issued only for SVM during svm_free_vcpu(). VMX has a vmclear and SVM doesn't. Follow discussion here: https://lkml.org/lkml/2018/1/15/146 Please refer to the following spec for more details on the enumeration and control. Refer here to get documentation about mitigations. https://software.intel.com/en-us/side-channel-security-support [peterz: rebase and changelog rewrite] [karahmed: - rebase - vmx: expose PRED_CMD if guest has it in CPUID - svm: only pass through IBPB if guest has it in CPUID - vmx: support !cpu_has_vmx_msr_bitmap()] - vmx: support nested] [dwmw2: Expose CPUID bit too (AMD IBPB only for now as we lack IBRS) PRED_CMD is a write-only MSR] Signed-off-by: Ashok Raj Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: David Woodhouse Signed-off-by: KarimAllah Ahmed Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Cc: Andrea Arcangeli Cc: Andi Kleen Cc: kvm@vger.kernel.org Cc: Asit Mallick Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Arjan Van De Ven Cc: Greg KH Cc: Jun Nakajima Cc: Paolo Bonzini Cc: Dan Williams Cc: Tim Chen Link: http://lkml.kernel.org/r/1515720739-43819-6-git-send-email-ashok.raj@intel.com Link: https://lkml.kernel.org/r/1517522386-18410-3-git-send-email-karahmed@amazon.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/cpuid.c | 11 +++++- arch/x86/kvm/cpuid.h | 12 +++++++ arch/x86/kvm/svm.c | 28 ++++++++++++++++ arch/x86/kvm/vmx.c | 79 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 127 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 91af75e373068..6f24483b2b849 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -355,6 +355,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + /* cpuid 0x80000008.ebx */ + const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(IBPB); + /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | @@ -607,7 +611,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (!g_phys_as) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); - entry->ebx = entry->edx = 0; + entry->edx = 0; + /* IBPB isn't necessarily present in hardware cpuid */ + if (boot_cpu_has(X86_FEATURE_IBPB)) + entry->ebx |= F(IBPB); + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); break; } case 0x80000019: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9368fecca3ee8..ec4f9dc54c706 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -160,6 +160,18 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) return best && (best->edx & bit(X86_FEATURE_RDTSCP)); } +static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best && (best->ebx & bit(X86_FEATURE_IBPB))) + return true; + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); +} + + /* * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 24af898fb3a69..6de8d6510dbe4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -248,6 +248,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_CSTAR, .always = true }, { .index = MSR_SYSCALL_MASK, .always = true }, #endif + { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -510,6 +511,7 @@ struct svm_cpu_data { struct kvm_ldttss_desc *tss_desc; struct page *save_area; + struct vmcb *current_vmcb; }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); @@ -1644,11 +1646,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); + /* + * The vmcb page can be recycled, causing a false negative in + * svm_vcpu_load(). So do a full IBPB now. + */ + indirect_branch_prediction_barrier(); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); int i; if (unlikely(cpu != vcpu->cpu)) { @@ -1677,6 +1685,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (static_cpu_has(X86_FEATURE_RDTSCP)) wrmsrl(MSR_TSC_AUX, svm->tsc_aux); + if (sd->current_vmcb != svm->vmcb) { + sd->current_vmcb = svm->vmcb; + indirect_branch_prediction_barrier(); + } avic_vcpu_load(vcpu, cpu); } @@ -3599,6 +3611,22 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr); break; + case MSR_IA32_PRED_CMD: + if (!msr->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) + return 1; + + if (data & ~PRED_CMD_IBPB) + return 1; + + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + if (is_guest_mode(vcpu)) + break; + set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + break; case MSR_STAR: svm->vmcb->save.star = data; break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d772e88ae8f30..b58b7819e8902 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -550,6 +550,7 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif + u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; /* @@ -911,6 +912,8 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1846,6 +1849,29 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +/* + * Check if MSR is intercepted for L01 MSR bitmap. + */ +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { @@ -2255,6 +2281,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); + indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -3056,6 +3083,33 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_PRED_CMD: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibpb(vcpu)) + return 1; + + if (data & ~PRED_CMD_IBPB) + return 1; + + if (!data) + break; + + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_merge_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. + */ + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, + MSR_TYPE_W); + break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) @@ -9431,9 +9485,23 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, struct page *page; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; + /* + * pred_cmd is trying to verify two things: + * + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This + * ensures that we do not accidentally generate an L02 MSR bitmap + * from the L12 MSR bitmap that is too permissive. + * 2. That L1 or L2s have actually used the MSR. This avoids + * unnecessarily merging of the bitmap if the MSR is unused. This + * works properly because we only update the L01 MSR bitmap lazily. + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only + * updated to reflect this when L1 (or its L2s) actually write to + * the MSR. + */ + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); - /* This shortcut is ok because we support only x2APIC MSRs so far. */ - if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && + !pred_cmd) return false; page = nested_get_page(vcpu, vmcs12->msr_bitmap); @@ -9466,6 +9534,13 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, MSR_TYPE_W); } } + + if (pred_cmd) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, + MSR_TYPE_W); + kunmap(page); nested_release_page_clean(page); From 755502f810c646a1d828fbc66364c252430b6064 Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Thu, 1 Feb 2018 22:59:44 +0100 Subject: [PATCH 237/247] KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES (cherry picked from commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd) Intel processors use MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO (bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the contents will come directly from the hardware, but user-space can still override it. [dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional] Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Paolo Bonzini Reviewed-by: Darren Kenny Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Jun Nakajima Cc: kvm@vger.kernel.org Cc: Dave Hansen Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Asit Mallick Cc: Arjan Van De Ven Cc: Greg KH Cc: Dan Williams Cc: Tim Chen Cc: Ashok Raj Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/cpuid.c | 8 +++++++- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 1 + 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6f24483b2b849..9c6493f82094d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -380,6 +380,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + F(ARCH_CAPABILITIES); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -462,12 +466,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_7_EDX); } else { entry->ebx = 0; entry->ecx = 0; + entry->edx = 0; } entry->eax = 0; - entry->edx = 0; break; } case 9: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index ec4f9dc54c706..8719997dd2e45 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -171,6 +171,14 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); } +static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES)); +} + /* * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b58b7819e8902..1bd0caf87c94b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -551,6 +551,8 @@ struct vcpu_vmx { u64 msr_guest_kernel_gs_base; #endif + u64 arch_capabilities; + u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; /* @@ -2979,6 +2981,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: msr_info->data = guest_read_tsc(vcpu); break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has_arch_capabilities(vcpu)) + return 1; + msr_info->data = to_vmx(vcpu)->arch_capabilities; + break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; @@ -3110,6 +3118,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; + case MSR_IA32_ARCH_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + vmx->arch_capabilities = data; + break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) @@ -5196,6 +5209,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e023ef981feb8..94d1573a717b1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,6 +975,7 @@ static u32 msrs_to_save[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_ARCH_CAPABILITIES }; static unsigned num_msrs_to_save; From e5a83419c957edff9290a7e09b951f44af7fa2e2 Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Thu, 1 Feb 2018 22:59:45 +0100 Subject: [PATCH 238/247] KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL (cherry picked from commit d28b387fb74da95d69d2615732f50cceb38e9a4d) [ Based on a patch from Ashok Raj ] Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for guests that will only mitigate Spectre V2 through IBRS+IBPB and will not be using a retpoline+IBPB based approach. To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for guests that do not actually use the MSR, only start saving and restoring when a non-zero is written to it. No attempt is made to handle STIBP here, intentionally. Filtering STIBP may be added in a future patch, which may require trapping all writes if we don't want to pass it through directly to the guest. [dwmw2: Clean up CPUID bits, save/restore manually, handle reset] Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Darren Kenny Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Jim Mattson Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Jun Nakajima Cc: kvm@vger.kernel.org Cc: Dave Hansen Cc: Tim Chen Cc: Andy Lutomirski Cc: Asit Mallick Cc: Arjan Van De Ven Cc: Greg KH Cc: Paolo Bonzini Cc: Dan Williams Cc: Linus Torvalds Cc: Ashok Raj Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/cpuid.c | 8 ++-- arch/x86/kvm/cpuid.h | 11 +++++ arch/x86/kvm/vmx.c | 103 ++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 2 +- 4 files changed, 118 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9c6493f82094d..93f924de06cf1 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(IBPB); + F(IBPB) | F(IBRS); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = - F(ARCH_CAPABILITIES); + F(SPEC_CTRL) | F(ARCH_CAPABILITIES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -618,9 +618,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - /* IBPB isn't necessarily present in hardware cpuid */ + /* IBRS and IBPB aren't necessarily present in hardware cpuid */ if (boot_cpu_has(X86_FEATURE_IBPB)) entry->ebx |= F(IBPB); + if (boot_cpu_has(X86_FEATURE_IBRS)) + entry->ebx |= F(IBRS); entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); break; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8719997dd2e45..d1beb7156704b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -171,6 +171,17 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); } +static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best && (best->ebx & bit(X86_FEATURE_IBRS))) + return true; + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); +} + static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1bd0caf87c94b..d49da86e30992 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -552,6 +552,7 @@ struct vcpu_vmx { #endif u64 arch_capabilities; + u64 spec_ctrl; u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; @@ -1851,6 +1852,29 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +/* + * Check if MSR is intercepted for currently loaded MSR bitmap. + */ +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +{ + unsigned long *msr_bitmap; + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return true; + + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; + + if (msr <= 0x1fff) { + return !!test_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + return !!test_bit(msr, msr_bitmap + 0xc00 / f); + } + + return true; +} + /* * Check if MSR is intercepted for L01 MSR bitmap. */ @@ -2981,6 +3005,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: msr_info->data = guest_read_tsc(vcpu); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibrs(vcpu)) + return 1; + + msr_info->data = to_vmx(vcpu)->spec_ctrl; + break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated && !guest_cpuid_has_arch_capabilities(vcpu)) @@ -3091,6 +3122,36 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibrs(vcpu)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + return 1; + + vmx->spec_ctrl = data; + + if (!data) + break; + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_vmx_merge_msr_bitmap. We should not touch the + * vmcs02.msr_bitmap here since it gets completely overwritten + * in the merging. We update the vmcs01 here for L1 as well + * since it will end up touching the MSR anyway now. + */ + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_RW); + break; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has_ibpb(vcpu)) @@ -5239,6 +5300,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u64 cr0; vmx->rmode.vm86_active = 0; + vmx->spec_ctrl = 0; vmx->soft_vnmi_blocked = 0; @@ -8824,6 +8886,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_arm_hv_timer(vcpu); + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -8942,6 +9013,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. + * + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + + if (vmx->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); @@ -9501,7 +9593,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; /* - * pred_cmd is trying to verify two things: + * pred_cmd & spec_ctrl are trying to verify two things: * * 1. L0 gave a permission to L1 to actually passthrough the MSR. This * ensures that we do not accidentally generate an L02 MSR bitmap @@ -9514,9 +9606,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, * the MSR. */ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD); + bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL); if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && - !pred_cmd) + !pred_cmd && !spec_ctrl) return false; page = nested_get_page(vcpu, vmcs12->msr_bitmap); @@ -9550,6 +9643,12 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, } } + if (spec_ctrl) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, + MSR_TYPE_R | MSR_TYPE_W); + if (pred_cmd) nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94d1573a717b1..75f756eac9797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -975,7 +975,7 @@ static u32 msrs_to_save[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_ARCH_CAPABILITIES + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES }; static unsigned num_msrs_to_save; From fc00dde96099a1a331a683a87b23ccb4626d816a Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Sat, 3 Feb 2018 15:56:23 +0100 Subject: [PATCH 239/247] KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL (cherry picked from commit b2ac58f90540e39324e7a29a7ad471407ae0bf48) [ Based on a patch from Paolo Bonzini ] ... basically doing exactly what we do for VMX: - Passthrough SPEC_CTRL to guests (if enabled in guest CPUID) - Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest actually used it. Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Reviewed-by: Darren Kenny Reviewed-by: Konrad Rzeszutek Wilk Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Jun Nakajima Cc: kvm@vger.kernel.org Cc: Dave Hansen Cc: Tim Chen Cc: Andy Lutomirski Cc: Asit Mallick Cc: Arjan Van De Ven Cc: Greg KH Cc: Paolo Bonzini Cc: Dan Williams Cc: Linus Torvalds Cc: Ashok Raj Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon.de Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/svm.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6de8d6510dbe4..be644afab1bbd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -183,6 +183,8 @@ struct vcpu_svm { u64 gs_base; } host; + u64 spec_ctrl; + u32 *msrpm; ulong nmi_iret_rip; @@ -248,6 +250,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_CSTAR, .always = true }, { .index = MSR_SYSCALL_MASK, .always = true }, #endif + { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, @@ -863,6 +866,25 @@ static bool valid_msr_intercept(u32 index) return false; } +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +{ + u8 bit_write; + unsigned long tmp; + u32 offset; + u32 *msrpm; + + msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: + to_svm(vcpu)->msrpm; + + offset = svm_msrpm_offset(msr); + bit_write = 2 * (msr & 0x0f) + 1; + tmp = msrpm[offset]; + + BUG_ON(offset == MSR_INVALID); + + return !!test_bit(bit_write, &tmp); +} + static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { @@ -1537,6 +1559,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u32 dummy; u32 eax = 1; + svm->spec_ctrl = 0; + if (!init_event) { svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; @@ -3520,6 +3544,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_VM_CR: msr_info->data = svm->nested.vm_cr_msr; break; + case MSR_IA32_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has_ibrs(vcpu)) + return 1; + + msr_info->data = svm->spec_ctrl; + break; case MSR_IA32_UCODE_REV: msr_info->data = 0x01000065; break; @@ -3611,6 +3642,33 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr); break; + case MSR_IA32_SPEC_CTRL: + if (!msr->host_initiated && + !guest_cpuid_has_ibrs(vcpu)) + return 1; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + return 1; + + svm->spec_ctrl = data; + + if (!data) + break; + + /* + * For non-nested: + * When it's written (to non-zero) for the first time, pass + * it through. + * + * For nested: + * The handling of the MSR bitmap for L2 guests is done in + * nested_svm_vmrun_msrpm. + * We update the L1 MSR bit as well since it will end up + * touching the MSR anyway now. + */ + set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && !guest_cpuid_has_ibpb(vcpu)) @@ -4854,6 +4912,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); + /* + * If this vCPU has touched SPEC_CTRL, restore the guest's value if + * it's non-zero. Since vmentry is serialising on affected CPUs, there + * is no need to worry about the conditional branch over the wrmsr + * being speculatively taken. + */ + if (svm->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + asm volatile ( "push %%" _ASM_BP "; \n\t" "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" @@ -4946,6 +5013,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* + * We do not use IBRS in the kernel. If this vCPU has used the + * SPEC_CTRL MSR it may have left it on; save the value and + * turn it off. This is much more efficient than blindly adding + * it to the atomic save/restore list. Especially as the former + * (Saving guest MSRs on vmexit) doesn't even exist in KVM. + * + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + + if (svm->spec_ctrl) + wrmsrl(MSR_IA32_SPEC_CTRL, 0); + /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); From 7c17a1e5852a0c1c67a2ac65569270619fe1d576 Mon Sep 17 00:00:00 2001 From: Robert Baronescu Date: Tue, 10 Oct 2017 13:21:59 +0300 Subject: [PATCH 240/247] crypto: tcrypt - fix S/G table for test_aead_speed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5c6ac1d4f8fbdbed65dbeb8cf149d736409d16a1 upstream. In case buffer length is a multiple of PAGE_SIZE, the S/G table is incorrectly generated. Fix this by handling buflen = k * PAGE_SIZE separately. Signed-off-by: Robert Baronescu Signed-off-by: Herbert Xu Signed-off-by: Horia Geantă Signed-off-by: Greg Kroah-Hartman --- crypto/tcrypt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index e3af318af2db7..2a07341aca462 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -223,11 +223,13 @@ static void sg_init_aead(struct scatterlist *sg, char *xbuf[XBUFSIZE], } sg_init_table(sg, np + 1); - np--; + if (rem) + np--; for (k = 0; k < np; k++) sg_set_buf(&sg[k + 1], xbuf[k], PAGE_SIZE); - sg_set_buf(&sg[k + 1], xbuf[k], rem); + if (rem) + sg_set_buf(&sg[k + 1], xbuf[k], rem); } static void test_aead_speed(const char *algo, int enc, unsigned int secs, From a7de0e9718c3062ed5fad916ceb65f387a29447b Mon Sep 17 00:00:00 2001 From: Julian Scheel Date: Wed, 24 May 2017 12:28:23 +0200 Subject: [PATCH 241/247] ASoC: simple-card: Fix misleading error message commit 7ac45d1635a4cd2e99a4b11903d4a2815ca1b27b upstream. In case cpu could not be found the error message would always refer to /codec/ not being found in DT. Fix this by catching the cpu node not found case explicitly. Signed-off-by: Julian Scheel Signed-off-by: Mark Brown Signed-off-by: thongsyho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/generic/simple-card.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index f608f8d23f3d9..dd88c2cb64703 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -232,13 +232,19 @@ static int asoc_simple_card_dai_link_of(struct device_node *node, snprintf(prop, sizeof(prop), "%scpu", prefix); cpu = of_get_child_by_name(node, prop); + if (!cpu) { + ret = -EINVAL; + dev_err(dev, "%s: Can't find %s DT node\n", __func__, prop); + goto dai_link_of_err; + } + snprintf(prop, sizeof(prop), "%splat", prefix); plat = of_get_child_by_name(node, prop); snprintf(prop, sizeof(prop), "%scodec", prefix); codec = of_get_child_by_name(node, prop); - if (!cpu || !codec) { + if (!codec) { ret = -EINVAL; dev_err(dev, "%s: Can't find %s DT node\n", __func__, prop); goto dai_link_of_err; From 24978c21f7ed183e7bb9745a295d066483b6bd48 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 16 May 2017 01:48:24 +0000 Subject: [PATCH 242/247] ASoC: rsnd: don't call free_irq() on Parent SSI commit 1f8754d4daea5f257370a52a30fcb22798c54516 upstream. If SSI uses shared pin, some SSI will be used as parent SSI. Then, normal SSI's remove and Parent SSI's remove (these are same SSI) will be called when unbind or remove timing. In this case, free_irq() will be called twice. This patch solve this issue. Signed-off-by: Kuninori Morimoto Tested-by: Hiroyuki Yokoyama Reported-by: Hiroyuki Yokoyama Signed-off-by: Mark Brown Signed-off-by: thongsyho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/ssi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index 560cf4b51a994..071fa850fed57 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -699,9 +699,14 @@ static int rsnd_ssi_dma_remove(struct rsnd_mod *mod, struct rsnd_priv *priv) { struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod); + struct rsnd_mod *ssi_parent_mod = rsnd_io_to_mod_ssip(io); struct device *dev = rsnd_priv_to_dev(priv); int irq = ssi->irq; + /* Do nothing for SSI parent mod */ + if (ssi_parent_mod == mod) + return 0; + /* PIO will request IRQ again */ devm_free_irq(dev, irq, mod); From 1cb145c67260edf54f106f031756a6ae780b7a32 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 9 Aug 2017 02:16:20 +0000 Subject: [PATCH 243/247] ASoC: rsnd: avoid duplicate free_irq() commit e0936c3471a8411a5df327641fa3ffe12a2fb07b upstream. commit 1f8754d4daea5f ("ASoC: rsnd: don't call free_irq() on Parent SSI") fixed Parent SSI duplicate free_irq(). But on Renesas Sound, not only Parent SSI but also Multi SSI have same issue. This patch avoid duplicate free_irq() if it was not pure SSI. Fixes: 1f8754d4daea5f ("ASoC: rsnd: don't call free_irq() on Parent SSI") Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: thongsyho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/ssi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index 071fa850fed57..a9a43acce30ec 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -699,12 +699,12 @@ static int rsnd_ssi_dma_remove(struct rsnd_mod *mod, struct rsnd_priv *priv) { struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod); - struct rsnd_mod *ssi_parent_mod = rsnd_io_to_mod_ssip(io); + struct rsnd_mod *pure_ssi_mod = rsnd_io_to_mod_ssi(io); struct device *dev = rsnd_priv_to_dev(priv); int irq = ssi->irq; - /* Do nothing for SSI parent mod */ - if (ssi_parent_mod == mod) + /* Do nothing if non SSI (= SSI parent, multi SSI) mod */ + if (pure_ssi_mod != mod) return 0; /* PIO will request IRQ again */ From 758e22acf4fd9aaf51c7bae93a47401b8f792d56 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Jul 2017 23:46:39 +0300 Subject: [PATCH 244/247] drm: rcar-du: Use the VBK interrupt for vblank events commit cbbb90b0c084d7dfb2ed8e3fecf8df200fbdd2a0 upstream. When implementing support for interlaced modes, the driver switched from reporting vblank events on the vertical blanking (VBK) interrupt to the frame end interrupt (FRM). This incorrectly divided the reported refresh rate by two. Fix it by moving back to the VBK interrupt. Fixes: 906eff7fcada ("drm: rcar-du: Implement support for interlaced modes") Signed-off-by: Laurent Pinchart Reviewed-by: Kieran Bingham Signed-off-by: thongsyho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c index a2ec6d8796a09..848f7f2152eed 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c @@ -551,7 +551,7 @@ static irqreturn_t rcar_du_crtc_irq(int irq, void *arg) status = rcar_du_crtc_read(rcrtc, DSSR); rcar_du_crtc_write(rcrtc, DSRCR, status & DSRCR_MASK); - if (status & DSSR_FRM) { + if (status & DSSR_VBK) { drm_crtc_handle_vblank(&rcrtc->crtc); rcar_du_crtc_finish_page_flip(rcrtc); ret = IRQ_HANDLED; From 230ca8fb951528f298e06a9c15257df5df85ecbd Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 29 Jul 2017 02:31:33 +0300 Subject: [PATCH 245/247] drm: rcar-du: Fix race condition when disabling planes at CRTC stop commit 641307df71fe77d7b38a477067495ede05d47295 upstream. When stopping the CRTC the driver must disable all planes and wait for the change to take effect at the next vblank. Merely calling drm_crtc_wait_one_vblank() is not enough, as the function doesn't include any mechanism to handle the race with vblank interrupts. Replace the drm_crtc_wait_one_vblank() call with a manual mechanism that handles the vblank interrupt race. Signed-off-by: Laurent Pinchart Reviewed-by: Kieran Bingham Signed-off-by: thongsyho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_crtc.c | 53 +++++++++++++++++++++++--- drivers/gpu/drm/rcar-du/rcar_du_crtc.h | 8 ++++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c index 848f7f2152eed..3322b157106dd 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c @@ -392,6 +392,31 @@ static void rcar_du_crtc_start(struct rcar_du_crtc *rcrtc) rcrtc->started = true; } +static void rcar_du_crtc_disable_planes(struct rcar_du_crtc *rcrtc) +{ + struct rcar_du_device *rcdu = rcrtc->group->dev; + struct drm_crtc *crtc = &rcrtc->crtc; + u32 status; + /* Make sure vblank interrupts are enabled. */ + drm_crtc_vblank_get(crtc); + /* + * Disable planes and calculate how many vertical blanking interrupts we + * have to wait for. If a vertical blanking interrupt has been triggered + * but not processed yet, we don't know whether it occurred before or + * after the planes got disabled. We thus have to wait for two vblank + * interrupts in that case. + */ + spin_lock_irq(&rcrtc->vblank_lock); + rcar_du_group_write(rcrtc->group, rcrtc->index % 2 ? DS2PR : DS1PR, 0); + status = rcar_du_crtc_read(rcrtc, DSSR); + rcrtc->vblank_count = status & DSSR_VBK ? 2 : 1; + spin_unlock_irq(&rcrtc->vblank_lock); + if (!wait_event_timeout(rcrtc->vblank_wait, rcrtc->vblank_count == 0, + msecs_to_jiffies(100))) + dev_warn(rcdu->dev, "vertical blanking timeout\n"); + drm_crtc_vblank_put(crtc); +} + static void rcar_du_crtc_stop(struct rcar_du_crtc *rcrtc) { struct drm_crtc *crtc = &rcrtc->crtc; @@ -400,17 +425,16 @@ static void rcar_du_crtc_stop(struct rcar_du_crtc *rcrtc) return; /* Disable all planes and wait for the change to take effect. This is - * required as the DSnPR registers are updated on vblank, and no vblank - * will occur once the CRTC is stopped. Disabling planes when starting - * the CRTC thus wouldn't be enough as it would start scanning out - * immediately from old frame buffers until the next vblank. + * required as the plane enable registers are updated on vblank, and no + * vblank will occur once the CRTC is stopped. Disabling planes when + * starting the CRTC thus wouldn't be enough as it would start scanning + * out immediately from old frame buffers until the next vblank. * * This increases the CRTC stop delay, especially when multiple CRTCs * are stopped in one operation as we now wait for one vblank per CRTC. * Whether this can be improved needs to be researched. */ - rcar_du_group_write(rcrtc->group, rcrtc->index % 2 ? DS2PR : DS1PR, 0); - drm_crtc_wait_one_vblank(crtc); + rcar_du_crtc_disable_planes(rcrtc); /* Disable vertical blanking interrupt reporting. We first need to wait * for page flip completion before stopping the CRTC as userspace @@ -548,9 +572,24 @@ static irqreturn_t rcar_du_crtc_irq(int irq, void *arg) irqreturn_t ret = IRQ_NONE; u32 status; + spin_lock(&rcrtc->vblank_lock); + status = rcar_du_crtc_read(rcrtc, DSSR); rcar_du_crtc_write(rcrtc, DSRCR, status & DSRCR_MASK); + if (status & DSSR_VBK) { + /* + * Wake up the vblank wait if the counter reaches 0. This must + * be protected by the vblank_lock to avoid races in + * rcar_du_crtc_disable_planes(). + */ + if (rcrtc->vblank_count) { + if (--rcrtc->vblank_count == 0) + wake_up(&rcrtc->vblank_wait); + } + } + spin_unlock(&rcrtc->vblank_lock); + if (status & DSSR_VBK) { drm_crtc_handle_vblank(&rcrtc->crtc); rcar_du_crtc_finish_page_flip(rcrtc); @@ -606,6 +645,8 @@ int rcar_du_crtc_create(struct rcar_du_group *rgrp, unsigned int index) } init_waitqueue_head(&rcrtc->flip_wait); + init_waitqueue_head(&rcrtc->vblank_wait); + spin_lock_init(&rcrtc->vblank_lock); rcrtc->group = rgrp; rcrtc->mmio_offset = mmio_offsets[index]; diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.h b/drivers/gpu/drm/rcar-du/rcar_du_crtc.h index 6f08b7e7db067..48bef05b4c625 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.h +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.h @@ -15,6 +15,7 @@ #define __RCAR_DU_CRTC_H__ #include +#include #include #include @@ -33,6 +34,9 @@ struct rcar_du_vsp; * @started: whether the CRTC has been started and is running * @event: event to post when the pending page flip completes * @flip_wait: wait queue used to signal page flip completion + * @vblank_lock: protects vblank_wait and vblank_count + * @vblank_wait: wait queue used to signal vertical blanking + * @vblank_count: number of vertical blanking interrupts to wait for * @outputs: bitmask of the outputs (enum rcar_du_output) driven by this CRTC * @group: CRTC group this CRTC belongs to */ @@ -48,6 +52,10 @@ struct rcar_du_crtc { struct drm_pending_vblank_event *event; wait_queue_head_t flip_wait; + spinlock_t vblank_lock; + wait_queue_head_t vblank_wait; + unsigned int vblank_count; + unsigned int outputs; struct rcar_du_group *group; From 2760f452a71899af860355abe818a8b2bd32b2cb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 12 Oct 2017 13:23:16 +0200 Subject: [PATCH 246/247] x86/microcode: Do the family check first commit 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 upstream with adjustments. On CPUs like AMD's Geode, for example, we shouldn't even try to load microcode because they do not support the modern microcode loading interface. However, we do the family check *after* the other checks whether the loader has been disabled on the command line or whether we're running in a guest. So move the family checks first in order to exit early if we're being loaded on an unsupported family. Reported-and-tested-by: Sven Glodowski Signed-off-by: Borislav Petkov Cc: # 4.11.. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://bugzilla.suse.com/show_bug.cgi?id=1061396 Link: http://lkml.kernel.org/r/20171012112316.977-1-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Rolf Neugebauer Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/core.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index dc0b9f8763adc..0afaf00b029bb 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -86,9 +86,6 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - a = 1; c = 0; native_cpuid(&a, &b, &c, &d); @@ -130,8 +127,9 @@ void __init load_ucode_bsp(void) { int vendor; unsigned int family; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; vendor = x86_cpuid_vendor(); @@ -139,16 +137,27 @@ void __init load_ucode_bsp(void) switch (vendor) { case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_bsp(); + if (family < 6) + return; break; + case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); + if (family < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(family); } static bool check_loader_disabled_ap(void) From 7f3bd8db99746a60bcae1ec4059a4756d19b63c2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 13 Feb 2018 12:36:03 +0100 Subject: [PATCH 247/247] Linux 4.9.81 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9550b6939076d..4d5753f1c37b3 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 80 +SUBLEVEL = 81 EXTRAVERSION = NAME = Roaring Lionus