Skip to content
Permalink
Browse files

10284 Socket CMCI mismatch can lead to boot hang

Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
  • Loading branch information
rmustacc authored and danmcd committed Jan 28, 2019
1 parent 666e8af commit 918e0d92ec24e67f572737a68faf135dc6409d26
@@ -21,6 +21,7 @@

/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
*/

#ifndef _GCPU_H
@@ -181,6 +182,8 @@ typedef struct gcpu_mce_status {
* Flags for gcpu_mca_flags
*/
#define GCPU_MCA_F_UNFAULTING 0x1 /* CPU exiting faulted state */
#define GCPU_MCA_F_CMCI_CAPABLE 0x2 /* CPU supports CMCI */
#define GCPU_MCA_F_CMCI_ENABLE 0x4 /* CPU CMCI enabled */

/*
* State shared by all cpus on a chip
@@ -213,7 +216,8 @@ extern void gcpu_post_mpstartup(cmi_hdl_t);
extern void gcpu_faulted_enter(cmi_hdl_t);
extern void gcpu_faulted_exit(cmi_hdl_t);
extern void gcpu_mca_init(cmi_hdl_t);
extern void gcpu_mca_fini(cmi_hdl_t hdl);
extern void gcpu_mca_fini(cmi_hdl_t);
extern void gcpu_mca_cmci_enable(cmi_hdl_t);
extern cmi_errno_t gcpu_msrinject(cmi_hdl_t, cmi_mca_regs_t *, uint_t, int);
#ifndef __xpv
extern uint64_t gcpu_mca_trap(cmi_hdl_t, struct regs *);
@@ -223,11 +227,6 @@ extern void gcpu_hdl_poke(cmi_hdl_t);
extern void gcpu_xpv_panic_callback(void);
#endif

/*
* CMI global variable
*/
extern int cmi_enable_cmci;

/*
* Local functions
*/
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2018, Joyent, Inc.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -261,6 +262,15 @@ gcpu_post_startup(cmi_hdl_t hdl)
* be run on cpu 0 so we can assure that by starting from here.
*/
gcpu_mca_poll_start(hdl);
#else
/*
* The boot CPU has a bit of a chicken and egg problem for CMCI. Its MCA
* initialization is run before we have initialized the PSM module that
* we would use for enabling CMCI. Therefore, we use this as a chance to
* enable CMCI for the boot CPU. For all other CPUs, this chicken and
* egg problem will have already been solved.
*/
gcpu_mca_cmci_enable(hdl);
#endif
}

@@ -21,6 +21,7 @@

/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -49,6 +50,7 @@
#include <sys/fm/smb/fmsmb.h>
#include <sys/sysevent.h>
#include <sys/ontrap.h>
#include <sys/smp_impldefs.h>

#include "gcpu.h"

@@ -82,6 +84,12 @@ int gcpu_mca_telemetry_retries = 5;
#ifndef __xpv
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;

/*
* This is used to determine whether or not we have registered the CMCI CPU
* setup function. This is protected by cpu_lock.
*/
static boolean_t gcpu_mca_cpu_registered = B_FALSE;
#endif

static gcpu_error_disp_t gcpu_errtypes[] = {
@@ -1031,6 +1039,83 @@ gcpu_errorq_init(size_t datasz)

static uint_t global_nbanks;

#ifndef __xpv
/*ARGSUSED*/
int
gcpu_cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
{
	/*
	 * CPU hotplug callback: toggle CMCI delivery for a CPU that is
	 * coming online or going offline.
	 *
	 * In general, we'd expect that in a multi-socket configuration, either
	 * all CPUs would support CMCI or none of them would. Unfortunately,
	 * that may not be the case in the wild. While we'd rather check the
	 * handle's enablement state here, that itself is a bit complicated. We
	 * don't have a guarantee in a heterogeneous situation that the CPU in
	 * question is using the generic CPU module or not, even though we've
	 * been registered. As such, we allow the interrupt to be registered and
	 * written to the local apic anyway. We won't have a CMCI interrupt
	 * generated anyway because the MCA banks will not be programmed as
	 * such for that CPU by the polling thread.
	 */
	if (what == CPU_ON) {
		psm_cmci_setup(cpuid, B_TRUE);
	} else if (what == CPU_OFF) {
		psm_cmci_setup(cpuid, B_FALSE);
	}

	return (0);
}

/*
 * Enable CMCI for the CPU represented by hdl, if both the hardware and the
 * PSM module support it. Registers the CPU hotplug setup callback exactly
 * once, then asks the PSM to program CMCI on this CPU.
 */
void
gcpu_mca_cmci_enable(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	boolean_t locked;

	/*
	 * If this CPU doesn't support CMCI, don't do anything.
	 */
	if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_CAPABLE) == 0)
		return;

	/*
	 * If we don't have support from the PSM module, then there's nothing we
	 * can do. Note that this changes as we start up the system. The only
	 * case where it may be mistakenly NULL is for the boot CPU. The boot
	 * CPU will have this taken care of for it in gcpu_post_startup(), once
	 * we know for certain whether or not the PSM module supports CMCI.
	 */
	if (psm_cmci_setup == NULL)
		return;

	mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_ENABLE;

	/*
	 * Register the CPU setup callback exactly once.
	 * gcpu_mca_cpu_registered is protected by cpu_lock; some callers
	 * already hold it, so only acquire it here when they do not.
	 */
	locked = MUTEX_HELD(&cpu_lock);
	if (!locked)
		mutex_enter(&cpu_lock);
	if (!gcpu_mca_cpu_registered) {
		register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
		gcpu_mca_cpu_registered = B_TRUE;
	}
	if (!locked)
		mutex_exit(&cpu_lock);

	/*
	 * Call the PSM op to make sure that we initialize things on
	 * this CPU.
	 */
	psm_cmci_setup(cmi_hdl_logical_id(hdl), B_TRUE);
}
#endif /* !__xpv */

void
gcpu_mca_init(cmi_hdl_t hdl)
{
@@ -1257,8 +1342,10 @@ gcpu_mca_init(cmi_hdl_t hdl)
}

#ifndef __xpv
if (cmci_capable)
cmi_enable_cmci = 1;
if (cmci_capable) {
mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_CAPABLE;
gcpu_mca_cmci_enable(hdl);
}
#endif

#ifndef __xpv
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2018, Joyent, Inc.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -94,7 +95,8 @@ gcpu_ntv_mca_poll(cmi_hdl_t hdl, int what)
ASSERT(MUTEX_HELD(&gcpu->gcpu_shared->gcpus_poll_lock));

/* Enable CMCI on the first poll if it is supported */
if (cmi_enable_cmci && (!mca->gcpu_mca_first_poll_cmci_enabled)) {
if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_ENABLE) != 0 &&
(!mca->gcpu_mca_first_poll_cmci_enabled)) {
int i;
uint64_t ctl2;

@@ -168,6 +168,7 @@ static struct psm_ops apix_ops = {

apic_get_pir_ipivect,
apic_send_pir_ipi,
apic_cmci_setup
};

struct psm_ops *psmops = &apix_ops;
@@ -546,27 +547,18 @@ apix_init_intr()
apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
}

/* Enable CMCI interrupt */
if (cmi_enable_cmci) {
mutex_enter(&cmci_cpu_setup_lock);
if (cmci_cpu_setup_registered == 0) {
mutex_enter(&cpu_lock);
register_cpu_setup_func(cmci_cpu_setup, NULL);
mutex_exit(&cpu_lock);
cmci_cpu_setup_registered = 1;
}
mutex_exit(&cmci_cpu_setup_lock);

if (apic_cmci_vect == 0) {
int ipl = 0x2;
apic_cmci_vect = apix_get_ipivect(ipl, -1);
ASSERT(apic_cmci_vect);

(void) add_avintr(NULL, ipl,
(avfunc)cmi_cmci_trap, "apic cmci intr",
apic_cmci_vect, NULL, NULL, NULL, NULL);
}
apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
/*
* Ensure a CMCI interrupt is allocated, regardless of whether it is
* enabled or not.
*/
if (apic_cmci_vect == 0) {
const int ipl = 0x2;
apic_cmci_vect = apix_get_ipivect(ipl, -1);
ASSERT(apic_cmci_vect);

(void) add_avintr(NULL, ipl,
(avfunc)cmi_cmci_trap, "apic cmci intr",
apic_cmci_vect, NULL, NULL, NULL, NULL);
}

apic_reg_ops->apic_write_task_reg(0);
@@ -203,6 +203,7 @@ static struct psm_ops apic_ops = {

apic_get_pir_ipivect,
apic_send_pir_ipi,
apic_cmci_setup,
};

struct psm_ops *psmops = &apic_ops;
@@ -428,31 +429,21 @@ apic_init_intr(void)
apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
}

/* Enable CMCI interrupt */
if (cmi_enable_cmci) {

mutex_enter(&cmci_cpu_setup_lock);
if (cmci_cpu_setup_registered == 0) {
mutex_enter(&cpu_lock);
register_cpu_setup_func(cmci_cpu_setup, NULL);
mutex_exit(&cpu_lock);
cmci_cpu_setup_registered = 1;
}
mutex_exit(&cmci_cpu_setup_lock);

if (apic_cmci_vect == 0) {
int ipl = 0x2;
int irq = apic_get_ipivect(ipl, -1);
/*
* Ensure a CMCI interrupt is allocated, regardless of whether it is
* enabled or not.
*/
if (apic_cmci_vect == 0) {
const int ipl = 0x2;
int irq = apic_get_ipivect(ipl, -1);

ASSERT(irq != -1);
apic_cmci_vect = apic_irq_table[irq]->airq_vector;
ASSERT(apic_cmci_vect);
ASSERT(irq != -1);
apic_cmci_vect = apic_irq_table[irq]->airq_vector;
ASSERT(apic_cmci_vect);

(void) add_avintr(NULL, ipl,
(avfunc)cmi_cmci_trap,
"apic cmci intr", irq, NULL, NULL, NULL, NULL);
}
apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
(void) add_avintr(NULL, ipl,
(avfunc)cmi_cmci_trap,
"apic cmci intr", irq, NULL, NULL, NULL, NULL);
}
}

@@ -122,12 +122,8 @@ int apic_enable_cpcovf_intr = 1;

/* vector at which CMCI interrupts come in */
int apic_cmci_vect;
extern int cmi_enable_cmci;
extern void cmi_cmci_trap(void);

kmutex_t cmci_cpu_setup_lock; /* protects cmci_cpu_setup_registered */
int cmci_cpu_setup_registered;

lock_t apic_mode_switch_lock;

int apic_pir_vect;
@@ -382,30 +378,20 @@ apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
return (0);
}

/*ARGSUSED*/
int
cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
void
apic_cmci_setup(processorid_t cpuid, boolean_t enable)
{
cpuset_t cpu_set;

CPUSET_ONLY(cpu_set, cpuid);

switch (what) {
case CPU_ON:
xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
(xc_func_t)apic_cmci_enable);
break;

case CPU_OFF:
xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
(xc_func_t)apic_cmci_disable);
break;

default:
break;
if (enable) {
xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
(xc_func_t)apic_cmci_enable);
} else {
xc_call(NULL, NULL, NULL, CPUSET2BV(cpu_set),
(xc_func_t)apic_cmci_disable);
}

return (0);
}

static void
@@ -179,6 +179,7 @@ static struct psm_ops uppc_ops = {

(int (*)(void))NULL, /* psm_get_pir_ipivect */
(void (*)(processorid_t))NULL, /* psm_send_pir_ipi */
(void (*)(processorid_t, boolean_t))NULL /* psm_cmci_setup */
};


@@ -659,7 +660,7 @@ uppc_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
if (status == ACPI_PSM_SUCCESS) {
acpi_new_irq_cache_ent(busid, devid, ipin, *pci_irqp,
intr_flagp, &acpipsmlnk);
psm_set_elcr(*pci_irqp, 1); /* set IRQ to PCI mode */
psm_set_elcr(*pci_irqp, 1); /* set IRQ to PCI mode */

UPPC_VERBOSE_IRQ((CE_CONT, "!uppc: [ACPI] "
"new irq %d for device %s, instance #%d\n",
@@ -927,7 +928,7 @@ uppc_translate_irq(dev_info_t *dip, int irqno)
/* FALLTHRU to common case - returning irqno */
} else {
/* non-PCI; assumes ISA-style edge-triggered */
psm_set_elcr(irqno, 0); /* set IRQ to ISA mode */
psm_set_elcr(irqno, 0); /* set IRQ to ISA mode */

UPPC_VERBOSE_IRQ((CE_CONT, "!uppc: non-pci,"
"irqno %d device %s instance %d\n", irqno,
@@ -70,13 +70,6 @@ int cmi_force_generic = 0;
*/
int cmi_panic_on_uncorrectable_error = 1;

#ifndef __xpv
/*
* Set to indicate whether we are able to enable cmci interrupt.
*/
int cmi_enable_cmci = 0;
#endif

/*
* Subdirectory (relative to the module search path) in which we will
* look for cpu modules.

0 comments on commit 918e0d9

Please sign in to comment.
You can’t perform that action at this time.