Skip to content

Commit

Permalink
ZOOKEEPER-2307: ZooKeeper not starting because acceptedEpoch is less …
Browse files Browse the repository at this point in the history
…than the currentEpoch

Update acceptedEpoch and currentEpoch in file first then in memory.

Author: Mohammad Arshad <arshad@apache.org>

Reviewers: andor@apache.org

Closes apache#1145 from arshadmohammad/ZOOKEEPER-2307-epochUpdate and squashes the following commits:

b05bc1f [Mohammad Arshad] review comment fix
c8d620f [Mohammad Arshad] ZOOKEEPER-2307:ZooKeeper not starting because acceptedEpoch is less than the currentEpoch
  • Loading branch information
arshadmohammad authored and junyoungKimGit committed Feb 7, 2020
1 parent 980df88 commit f5cd37a
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2074,7 +2074,8 @@ private long readLongFromFile(String name) throws IOException {
* @param value the long value to write to the named file
* @throws IOException if the file cannot be written atomically
*/
private void writeLongToFile(String name, final long value) throws IOException {
// visibleForTest
void writeLongToFile(String name, final long value) throws IOException {
File file = new File(logFactory.getSnapDir(), name);
new AtomicFileWritingIdiom(file, new WriterStatement() {
@Override
Expand All @@ -2099,14 +2100,14 @@ public long getAcceptedEpoch() throws IOException {
}

public void setCurrentEpoch(long e) throws IOException {
currentEpoch = e;
writeLongToFile(CURRENT_EPOCH_FILENAME, e);
currentEpoch = e;

}

public void setAcceptedEpoch(long e) throws IOException {
acceptedEpoch = e;
writeLongToFile(ACCEPTED_EPOCH_FILENAME, e);
acceptedEpoch = e;
}

public boolean processReconfig(QuorumVerifier qv, Long suggestedLeaderId, Long zxid, boolean restartLE) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.zookeeper.server.quorum;

import static org.apache.zookeeper.test.ClientBase.CONNECTION_TIMEOUT;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.PortAssignment;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.ClientBase.CountdownWatcher;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Test;

public class EpochWriteFailureTest extends QuorumPeerTestBase {
private static int SERVER_COUNT = 3;
private static int[] clientPorts = new int[SERVER_COUNT];
private static MainThread[] mt = new MainThread[SERVER_COUNT];
private static ZooKeeper zk;

/*
* Test case for https://issues.apache.org/jira/browse/ZOOKEEPER-2307
* Expectation: During leader election when accepted epoch write to file
* fails, it should not complete leader election, also it should not update
* run time values of acceptedEpoch,
*/
@Test(timeout = 120000)
public void testAcceptedEpochWriteFailure() throws Exception {
StringBuilder sb = new StringBuilder();
sb.append("admin.enableServer=false");
sb.append("\n");
String server;
for (int i = 0; i < SERVER_COUNT; i++) {
clientPorts[i] = PortAssignment.unique();
server = "server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":"
+ PortAssignment.unique() + ":participant;127.0.0.1:" + clientPorts[i];
sb.append(server);
sb.append("\n");
}
String currentQuorumCfgSection = sb.toString();
for (int i = 0; i < SERVER_COUNT - 1; i++) {
mt[i] = new MainThread(i, clientPorts[i], currentQuorumCfgSection, false);
mt[i].start();
}

// ensure two servers started
for (int i = 0; i < SERVER_COUNT - 1; i++) {
Assert.assertTrue("waiting for server " + i + " being up",
ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[i], CONNECTION_TIMEOUT));
}

CountdownWatcher watch1 = new CountdownWatcher();
zk = new ZooKeeper("127.0.0.1:" + clientPorts[0], ClientBase.CONNECTION_TIMEOUT,
watch1);
watch1.waitForConnected(ClientBase.CONNECTION_TIMEOUT);

String data = "originalData";
zk.create("/epochIssue", data.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);

//initialize third server
mt[2] = new MainThread(2, clientPorts[2], currentQuorumCfgSection, false) {

@Override
public TestQPMain getTestQPMain() {
return new MockTestQPMain();
}
};

//This server has problem it fails while writing acceptedEpoch.
mt[2].start();

/*
* Verify that problematic server does not start as acceptedEpoch update
* failure is injected and it keeps on trying to join the quorum
*/

Assert.assertFalse("verify server 2 not started",
ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[2], CONNECTION_TIMEOUT / 2));

QuorumPeer quorumPeer = mt[2].getQuorumPeer();

Assert.assertEquals("acceptedEpoch must not have changed", 0,
quorumPeer.getAcceptedEpoch());
Assert.assertEquals("currentEpoch must not have changed", 0,
quorumPeer.getCurrentEpoch());
}

static class CustomQuorumPeer extends QuorumPeer {
CustomQuorumPeer(Map<Long, QuorumServer> quorumPeers, File snapDir, File logDir, int clientPort,
int electionAlg, long myid, int tickTime, int initLimit, int syncLimit,
int connectToLearnerMasterLimit) throws IOException {
super(quorumPeers, snapDir, logDir, clientPort, electionAlg, myid, tickTime, initLimit, syncLimit, connectToLearnerMasterLimit);
}

@Override
protected void writeLongToFile(String name, long value) throws IOException {
// initial epoch writing should be successful
if (0 != value) {
throw new IOException("Input/output error");
}
}
}

private static class MockTestQPMain extends TestQPMain {
@Override
public void runFromConfig(QuorumPeerConfig config)
throws IOException {
quorumPeer = new CustomQuorumPeer(config.getQuorumVerifier().getAllMembers(),
config.getDataDir(), config.getDataLogDir(),
config.getClientPortAddress().getPort(), config.getElectionAlg(),
config.getServerId(), config.getTickTime(), config.getInitLimit(),
config.getSyncLimit(), config.getSyncLimit());
quorumPeer.start();
try {
quorumPeer.join();
} catch (InterruptedException e) {
LOG.warn("Quorum Peer interrupted", e);
}
}
}

@AfterClass
public static void tearDownAfterClass() throws InterruptedException {
for (int i = 0; i < SERVER_COUNT; i++) {
if (mt[i] != null) {
mt[i].shutdown();
}
}
if (zk != null) {
zk.close();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
import org.apache.zookeeper.server.util.ZxidUtils;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.test.TestUtils;
import org.apache.zookeeper.txn.CreateSessionTxn;
import org.apache.zookeeper.txn.CreateTxn;
Expand Down Expand Up @@ -852,7 +853,7 @@ public void converseWithLeader(InputArchive ia, OutputArchive oa, Leader l) thro
assertEquals(Leader.NEWLEADER, qp.getType());
assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid());
assertEquals(1, l.self.getAcceptedEpoch());
assertEquals(1, l.self.getCurrentEpoch());
assertCurrentEpochGotUpdated(1, l.self, ClientBase.CONNECTION_TIMEOUT);

qp = new QuorumPacket(Leader.ACK, qp.getZxid(), null, null);
oa.writeRecord(qp, null);
Expand Down Expand Up @@ -893,7 +894,7 @@ public void converseWithLeader(InputArchive ia, OutputArchive oa, Leader l) thro
assertEquals(Leader.NEWLEADER, qp.getType());
assertEquals(ZxidUtils.makeZxid(1, 0), qp.getZxid());
assertEquals(1, l.self.getAcceptedEpoch());
assertEquals(1, l.self.getCurrentEpoch());
assertCurrentEpochGotUpdated(1, l.self, ClientBase.CONNECTION_TIMEOUT);

qp = new QuorumPacket(Leader.ACK, qp.getZxid(), null, null);
oa.writeRecord(qp, null);
Expand Down Expand Up @@ -1212,4 +1213,22 @@ public void testInitialAcceptedCurrent() throws Exception {
}
}

/*
* Epoch is first written to file then updated in memory. Give some time to
* write the epoch in file and then go for assert.
*/
private void assertCurrentEpochGotUpdated(int expected, QuorumPeer self, long timeout)
throws IOException {
long elapsedTime = 0;
long waitInterval = 10;
while (self.getCurrentEpoch() != expected && elapsedTime < timeout) {
try {
Thread.sleep(waitInterval);
} catch (InterruptedException e) {
fail("CurrentEpoch update failed");
}
elapsedTime = elapsedTime + waitInterval;
}
assertEquals("CurrentEpoch update failed", expected, self.getCurrentEpoch());
}
}

0 comments on commit f5cd37a

Please sign in to comment.