Skip to content

Commit

Permalink
[syncd] partial warm recovery support (sonic-net#352)
Browse files Browse the repository at this point in the history
* [syncd] delay creating diag shell to after switch is created

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd] provide default SAI warmboot data file in case not specified

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd] move performWarmRestart to syncd_hard_reinit.cpp

- Move function to get access to static local variables.
- Improve the function according to the new SAI interaction requirements.
- Remove 'TODO' and exception since the code is now working.

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd] restore previously changed warm boot condition

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd script] refactor syncd init script

- move start type setting in a helper function

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd script] check and set warm start type

Warm start takes precedence over fast start, because the fast boot option
is a kernel command line option and will therefore always be present
after a fast reboot.

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd] comment out warm start option calculation for now

redis-cli is not available in the syncd docker. Some work, discussion, and a
decision are required if we want to continue with the start option approach.

Signed-off-by: Ying Xie <ying.xie@microsoft.com>

* [syncd] take warm boot hint from database directly

Signed-off-by: Ying Xie <ying.xie@microsoft.com>
  • Loading branch information
yxieca authored and lguohan committed Oct 29, 2018
1 parent 4d9aa18 commit d655d20
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 89 deletions.
52 changes: 29 additions & 23 deletions syncd/scripts/syncd_init_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,31 @@ case "$(cat /proc/cmdline)" in
esac


check_warm_boot()
{
    # Decide whether this start should be treated as a warm boot and publish
    # the answer in the global WARM_BOOT ("true" / "false").
    #
    # FIXME: for now the answer is always "false". If the start option
    # approach is kept, a redis database query has to be added here, along
    # the lines of:
    #
    #   SYSTEM_WARM_START=`/usr/bin/redis-cli -n 4 hget "WARM_RESTART|system" enable`
    #   SERVICE_WARM_START=`/usr/bin/redis-cli -n 4 hget "WARM_RESTART|${SERVICE}" enable`
    #   if [[ x"$SYSTEM_WARM_START" == x"true" ]] || [[ x"$SERVICE_WARM_START" == x"true" ]]; then
    #       WARM_BOOT="true"
    #   else
    #       WARM_BOOT="false"
    #   fi
    #
    # SYSTEM_WARM_START could be empty, so WARM_BOOT must always be given a
    # meaningful value.
    WARM_BOOT=false
}


function set_start_type()
{
    # Append the syncd start type option to CMD_ARGS.
    #
    # Reads:  WARM_BOOT   ("true" selects a warm start)
    #         FAST_REBOOT ("yes" selects a fast start)
    # Writes: CMD_ARGS    (gets " -t warm" or " -t fast" appended, or is left
    #                      untouched when neither applies)
    #
    # The warm check comes first, so warm start wins when both are set.
    # Both variables are expanded inside quotes with an x prefix so that an
    # empty or unset value compares as "not equal" instead of making the
    # test command fail with a syntax error.
    if [ x"$WARM_BOOT" == x"true" ]; then
        CMD_ARGS+=" -t warm"
    elif [ x"$FAST_REBOOT" == x"yes" ]; then
        CMD_ARGS+=" -t fast"
    fi
}


config_syncd_bcm()
{
if [ -f "/etc/sai.d/sai.profile" ]; then
Expand All @@ -45,10 +70,6 @@ config_syncd_bcm()
[ -e /dev/linux-bcm-knet ] || mknod /dev/linux-bcm-knet c 122 0
[ -e /dev/linux-user-bde ] || mknod /dev/linux-user-bde c 126 0
[ -e /dev/linux-kernel-bde ] || mknod /dev/linux-kernel-bde c 127 0

if [ $FAST_REBOOT == "yes" ]; then
CMD_ARGS+=" -t fast"
fi
}

config_syncd_mlnx()
Expand All @@ -66,10 +87,6 @@ config_syncd_mlnx()
# Write MAC address into /tmp/profile file.
cat $HWSKU_DIR/sai.profile > /tmp/sai.profile
echo "DEVICE_MAC_ADDRESS=$ALIGNED_MAC_ADDRESS" >> /tmp/sai.profile

if [ $FAST_REBOOT == "yes" ]; then
CMD_ARGS+=" -t fast"
fi
}

config_syncd_centec()
Expand All @@ -78,10 +95,6 @@ config_syncd_centec()

[ -e /dev/linux_dal ] || mknod /dev/linux_dal c 198 0
[ -e /dev/net/tun ] || ( mkdir -p /dev/net && mknod /dev/net/tun c 10 200 )

if [ $FAST_REBOOT == "yes" ]; then
CMD_ARGS+=" -t fast"
fi
}

config_syncd_cavium()
Expand All @@ -94,10 +107,6 @@ config_syncd_cavium()
until [ $(redis-cli ping | grep -c PONG) -gt 0 ]; do
sleep 1
done

if [ $FAST_REBOOT == "yes" ]; then
CMD_ARGS+=" -t fast"
fi
}

config_syncd_marvell()
Expand All @@ -120,22 +129,17 @@ config_syncd_barefoot()
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/bfn/install/lib/platform/$ONIE_PLATFORM:/opt/bfn/install/lib:/opt/bfn/install/lib/tofinopd/switch
./opt/bfn/install/bin/dma_setup.sh
export LD_PRELOAD=libswitchapi.so:libswitchsai.so:libpd.so:libpdcli.so:libdriver.so:libbfsys.so:libbfutils.so:libbf_switchd_lib.so:libtofinopdfixed_thrift.so:libpdthrift.so

if [ $FAST_REBOOT == "yes" ]; then
CMD_ARGS+=" -t fast"
fi
}

config_syncd_nephos()
{
    # Nephos-specific syncd arguments: pass the SAI profile found in the
    # hardware SKU directory.
    #
    # The start type option is intentionally not added here. config_syncd
    # calls set_start_type after the per-ASIC configuration, and that helper
    # appends " -t warm" or " -t fast". Repeating the FAST_REBOOT check in
    # this function would add " -t fast" a second time and could put it on
    # the command line together with " -t warm".
    CMD_ARGS+=" -p $HWSKU_DIR/sai.profile"
}

config_syncd()
{
check_warm_boot

if [ "$SONIC_ASIC_TYPE" == "broadcom" ]; then
config_syncd_bcm
elif [ "$SONIC_ASIC_TYPE" == "mellanox" ]; then
Expand All @@ -155,6 +159,8 @@ config_syncd()
exit 1
fi

set_start_type

if [ ${ENABLE_SAITHRIFT} == 1 ]; then
CMD_ARGS+=" -r -m $HWSKU_DIR/port_config.ini"
fi
Expand Down
84 changes: 20 additions & 64 deletions syncd/syncd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "swss/tokenize.h"
#include <limits.h>

#include "swss/warm_restart.h"

extern "C" {
#include <sai.h>
}
Expand All @@ -13,6 +15,8 @@ extern "C" {
#include <map>
#include <unordered_map>

#define DEF_SAI_WARM_BOOT_DATA_FILE "/var/warmboot/sai-warmboot.bin"

/**
* @brief Global mutex for thread synchronization
*
Expand Down Expand Up @@ -3059,6 +3063,13 @@ void handleProfileMap(const std::string& profileMapFile)
exit(EXIT_FAILURE);
}

// Provide default value at boot up time and let sai profile value
// Override following values if existing.
// SAI reads these values at start up time. It would be too late to
// set these values later when WARM BOOT is detected.
gProfileMap[SAI_KEY_WARM_BOOT_WRITE_FILE] = DEF_SAI_WARM_BOOT_DATA_FILE;
gProfileMap[SAI_KEY_WARM_BOOT_READ_FILE] = DEF_SAI_WARM_BOOT_DATA_FILE;

std::string line;

while(getline(profile, line))
Expand Down Expand Up @@ -3288,61 +3299,6 @@ void set_sai_api_log_min_prio(const std::string &prioStr)
}
}

void performWarmRestart()
{
SWSS_LOG_ENTER();

/*
* There should be no case when we are doing warm restart and there is no
* switch defined, we will throw at sucha case.
*
* This case could be possible when no switches were created and only api
* was initialized, but we will skip this scenario and address is when we
* will have need for it.
*/

auto entries = g_redisClient->keys(ASIC_STATE_TABLE + std::string(":SAI_OBJECT_TYPE_SWITCH:*"));

if (entries.size() == 0)
{
SWSS_LOG_THROW("on warm restart there is no switches defined in DB, not supported yet, FIXME");
}

if (entries.size() != 1)
{
SWSS_LOG_THROW("multiple switches defined in warm start: %zu, not supported yet, FIXME", entries.size());
}

/*
* Here wa have only one switch defined, let's extract his vid and rid.
*/

/*
* Entry should be in format ASIC_STATE:SAI_OBJECT_TYPE_SWITCH:oid:0xYYYY
*
* Let's extract oid value
*/

std::string key = entries.at(0);

auto start = key.find_first_of(":") + 1;
auto end = key.find(":", start);

std::string strSwitchVid = key.substr(end + 1);

sai_object_id_t switch_vid;

sai_deserialize_object_id(strSwitchVid, switch_vid);

sai_object_id_t switch_rid = translate_vid_to_rid(switch_vid);

/*
* Perform all get operations on existing switch.
*/

switches[switch_vid] = std::make_shared<SaiSwitch>(switch_vid, switch_rid);
}

void onSyncdStart(bool warmStart)
{
SWSS_LOG_ENTER();
Expand Down Expand Up @@ -3377,14 +3333,6 @@ void onSyncdStart(bool warmStart)
performWarmRestart();

SWSS_LOG_NOTICE("skipping hard reinit since WARM start was performed");

// TODO issue here can be that in hard start there was 8 queues then
// user added 2, and we have 10, after warm restart, switch will
// discover 10 queus, and mark them as "non removable" but 2 of them
// can be removed. We would probably need to store all objects after
// hard reinit and treat that as base.

SWSS_LOG_THROW("warm restart is not yet fully supported and needs to be revisited");
return;
}

Expand Down Expand Up @@ -3466,6 +3414,9 @@ int syncd_main(int argc, char **argv)

swss::Logger::linkToDbNative("syncd");

swss::WarmStart::initialize("syncd", "syncd");
swss::WarmStart::checkWarmStart("syncd");

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
sai_metadata_log = &sai_meta_log_syncd;
Expand Down Expand Up @@ -3506,6 +3457,11 @@ int syncd_main(int argc, char **argv)

g_veryFirstRun = isVeryFirstRun();

if (swss::WarmStart::isWarmStart())
{
options.startType = SAI_WARM_BOOT;
}

if (options.startType == SAI_WARM_BOOT)
{
const char *warmBootReadFile = profile_get_value(0, SAI_KEY_WARM_BOOT_READ_FILE);
Expand Down Expand Up @@ -3572,7 +3528,7 @@ int syncd_main(int argc, char **argv)
try
{
SWSS_LOG_NOTICE("before onSyncdStart");
onSyncdStart(false);
onSyncdStart(options.startType == SAI_WARM_BOOT);
SWSS_LOG_NOTICE("after onSyncdStart");

startNotificationsProcessingThread();
Expand Down
4 changes: 4 additions & 0 deletions syncd/syncd.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ void startDiagShell();

void hardReinit();

void performWarmRestart();

sai_object_id_t translate_vid_to_rid(_In_ sai_object_id_t vid);

void redisClearVidToRidMap();
void redisClearRidToVidMap();

Expand Down
86 changes: 84 additions & 2 deletions syncd/syncd_hard_reinit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,6 @@ void processSwitches()
g_translatedV2R[switch_vid] = switch_rid;
g_translatedR2V[switch_rid] = switch_vid;

startDiagShell();

auto sw = switches[switch_vid] = std::make_shared<SaiSwitch>(switch_vid, switch_rid);

/*
Expand All @@ -458,6 +456,8 @@ void processSwitches()

g_sw = sw;

startDiagShell();

/*
* We processed switch. We have switch vid/rid so we can process all
* other attributes of switches that are not mandatory on create and are
Expand Down Expand Up @@ -1203,3 +1203,85 @@ void hardReinit()

checkAllIds();
}

/**
 * @brief Re-attach syncd to the existing switch during a warm restart.
 *
 * Steps performed:
 *  - query g_redisClient for the switch keys under ASIC_STATE_TABLE and
 *    require exactly one of them;
 *  - extract the switch VID from that key and translate it to the RID that
 *    was recorded before the restart;
 *  - call create_switch with SAI_SWITCH_ATTR_INIT_SWITCH set to true and
 *    require that the returned RID equals the recorded one;
 *  - record the VID/RID pair in g_translatedV2R / g_translatedR2V, build the
 *    SaiSwitch object, publish it through g_switch_rid / g_switch_vid / g_sw;
 *  - start the diag shell, only after the switch has been created.
 *
 * Throws (via SWSS_LOG_THROW) when no switch key or more than one switch key
 * is found, when create_switch fails, or when the RID returned by
 * create_switch differs from the recorded RID.
 */
void performWarmRestart()
{
    SWSS_LOG_ENTER();

    /*
     * There should be no case when we are doing warm restart and there is no
     * switch defined, so we throw in such a case.
     *
     * This case could be possible when no switches were created and only the
     * api was initialized, but we skip this scenario and will address it
     * when we have a need for it.
     */

    auto entries = g_redisClient->keys(ASIC_STATE_TABLE + std::string(":SAI_OBJECT_TYPE_SWITCH:*"));

    if (entries.size() == 0)
    {
        SWSS_LOG_THROW("on warm restart there is no switches defined in DB, not supported yet, FIXME");
    }

    if (entries.size() != 1)
    {
        SWSS_LOG_THROW("multiple switches defined in warm start: %zu, not supported yet, FIXME", entries.size());
    }

    /*
     * Here we have only one switch defined, let's extract its vid and rid.
     */

    /*
     * Entry should be in format ASIC_STATE:SAI_OBJECT_TYPE_SWITCH:oid:0xYYYY
     *
     * Let's extract the oid value: everything after the second ':'.
     */

    std::string key = entries.at(0);

    auto start = key.find_first_of(":") + 1;
    auto end = key.find(":", start);

    std::string strSwitchVid = key.substr(end + 1);

    sai_object_id_t switch_vid;

    sai_deserialize_object_id(strSwitchVid, switch_vid);

    sai_object_id_t orig_rid = translate_vid_to_rid(switch_vid);

    /*
     * Create the switch again through SAI, passing only INIT_SWITCH = true.
     */

    sai_object_id_t switch_rid;
    sai_attribute_t switch_attr;
    switch_attr.id = SAI_SWITCH_ATTR_INIT_SWITCH;
    switch_attr.value.booldata = true;
    sai_status_t status = sai_metadata_sai_switch_api->create_switch(&switch_rid, 1, &switch_attr);

    if (status != SAI_STATUS_SUCCESS)
    {
        SWSS_LOG_THROW("failed to create switch RID: %s",
                sai_serialize_status(status).c_str());
    }

    /*
     * The RID recorded for the switch VID before the restart must still be
     * valid, otherwise the stored VID to RID translation cannot be reused.
     */

    if (orig_rid != switch_rid)
    {
        /*
         * Object ids are 64 bit wide. Print them as unsigned long long so
         * the format specifier matches the argument on every platform,
         * including those where unsigned long is only 32 bits.
         */

        SWSS_LOG_THROW("Unexpected RID 0x%llx (expected 0x%llx)",
                static_cast<unsigned long long>(switch_rid),
                static_cast<unsigned long long>(orig_rid));
    }

    g_translatedV2R[switch_vid] = switch_rid;
    g_translatedR2V[switch_rid] = switch_vid;

    /*
     * Perform all get operations on existing switch.
     */

    auto sw = switches[switch_vid] = std::make_shared<SaiSwitch>(switch_vid, switch_rid);

    g_switch_rid = switch_rid;
    g_switch_vid = switch_vid;

    g_sw = sw;

    /*
     * Diag shell is started only after the switch has been created.
     */

    startDiagShell();
}

0 comments on commit d655d20

Please sign in to comment.