-
Notifications
You must be signed in to change notification settings - Fork 74
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[HWKMETRICS-780] fix temp tables not getting dropped (#928)
* [HWKMETRICS-763] fix bug in scheduler error handling I found the bug today which is in SchedulerImpl.java. At some point a while back I added retry support for when a job fails. You provide a RetryPolicy that determines if and when a job should be retried. This was primarily intended for non-repeating, single-execution jobs. For repeating jobs, i.e., the compression job, the job is supposed to be executed according to its trigger. Suppose the compression job is scheduled to run at 15:00 and repeats every two hours. If the 15:00 execution fails, the scheduler is supposed to retry the job (for all intents and purposes) immediately with the same trigger time of 15:00. When the job completes normally, its trigger will get updated in the database to the next execution time of 17:00. If it is already later than 17:00, then the scheduler will again execute the job right away. There was a bug with the error handling such that the job wasn't getting retried for the 15:00 execution. The trigger was getting advanced and set to 17:00. The temp table being compressed gets dropped at the end of the compression job. Unless the failure was dropping the table, we end up with orphaned temp tables any time the job fails. This commit does away with the RetryPolicy since it is not used. This means that when a job fails, regardless of whether or not it is repeating, it will be retried with the same trigger until it completes normally. * [HWKMETRICS-763] add "local" job to check for expired temp tables Due to the bug in SchedulerImpl.java which was fixed in my previous commit, there are openshift clusters with literally hundreds of expired temp tables. They are expired in the sense that they are older than the data retention and therefore do not contain any live data. This commit adds a "local" job that checks for and drops expired tables. The job is local in the sense that it is local to the hawkular-metrics server. It is not run via the job scheduler.
* [HWKMETRICS-763] use Observable.empty to get rid of filter call
- Loading branch information
Showing
8 changed files
with
197 additions
and
176 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
114 changes: 114 additions & 0 deletions
114
...trics-core-service/src/main/java/org/hawkular/metrics/core/service/TempTablesCleaner.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
/* | ||
* Copyright 2014-2018 Red Hat, Inc. and/or its affiliates | ||
* and other contributors as indicated by the @author tags. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.hawkular.metrics.core.service; | ||
|
||
import java.util.concurrent.TimeUnit; | ||
|
||
import org.hawkular.rx.cassandra.driver.RxSession; | ||
import org.jboss.logging.Logger; | ||
import org.joda.time.Days; | ||
|
||
import com.datastax.driver.core.PreparedStatement; | ||
|
||
import rx.Observable; | ||
import rx.schedulers.Schedulers; | ||
|
||
/** | ||
* @author jsanda | ||
*/ | ||
public class TempTablesCleaner { | ||
|
||
private static Logger logger = Logger.getLogger(TempTablesCleaner.class); | ||
|
||
private RxSession session; | ||
|
||
private PreparedStatement findTables; | ||
|
||
private long ttl; | ||
|
||
private DataAccessImpl dataAccess; | ||
|
||
private volatile boolean finished; | ||
|
||
private static final String DROP_TABLE_CQL = "DROP TABLE IF EXISTS %s"; | ||
|
||
public TempTablesCleaner(RxSession session, DataAccessImpl dataAccess, String keyspace, int ttl) { | ||
this.session = session; | ||
this.dataAccess = dataAccess; | ||
this.ttl = Days.days(ttl).toStandardDuration().getMillis(); | ||
|
||
findTables = session.getSession().prepare( | ||
"SELECT table_name FROM system_schema.tables WHERE keyspace_name = '" + keyspace + "'"); | ||
} | ||
|
||
public void run() { | ||
logger.info("Checking for expired temp tables"); | ||
Observable.interval(1, TimeUnit.DAYS, Schedulers.io()) | ||
.takeUntil(i -> finished) | ||
.flatMap(i -> session.execute(findTables.bind())) | ||
.compose(applyRetryPolicy()) | ||
.flatMap(Observable::from) | ||
.filter(row -> row.getString(0).startsWith(DataAccessImpl.TEMP_TABLE_NAME_PROTOTYPE)) | ||
.map(row -> row.getString(0)) | ||
.filter(this::isTableExpired) | ||
.flatMap(this::dropTable) | ||
.subscribe( | ||
table -> logger.infof("Dropped table %s", table), | ||
t -> logger.warn("Cleaning temp tables failed", t), | ||
() -> logger.infof("Finished cleaning expired temp tables") | ||
); | ||
|
||
} | ||
|
||
public void shutdown() { | ||
finished = true; | ||
} | ||
|
||
private <T> Observable.Transformer<T, T> applyRetryPolicy() { | ||
return tObservable -> tObservable | ||
.retryWhen(observable -> { | ||
Integer maxRetries = Integer.getInteger("hawkular.metrics.temp-table-cleaner.max-retries", 10); | ||
Integer maxDelay = Integer.getInteger("hawkular.metrics.temp-table-cleaner.max-delay", 300); | ||
Observable<Integer> range = Observable.range(1, maxRetries); | ||
Observable<Observable<?>> zipWith = observable.zipWith(range, (t, i) -> { | ||
int delay = Math.min((int) Math.pow(2, i), maxDelay); | ||
logger.debugf(t, "The findTables query failed. Attempting retry # %d seconds", delay); | ||
return Observable.timer(delay, TimeUnit.SECONDS).onBackpressureDrop(); | ||
}); | ||
|
||
return Observable.merge(zipWith); | ||
}); | ||
} | ||
|
||
private boolean isTableExpired(String table) { | ||
Long timestamp = dataAccess.tableToMapKey(table); | ||
return timestamp < (System.currentTimeMillis() - ttl); | ||
} | ||
|
||
private Observable<String> dropTable(String table) { | ||
return session.execute(String.format(DROP_TABLE_CQL, table)) | ||
.map(resultSet -> table) | ||
.onErrorResumeNext(t -> { | ||
// If there is an error, we do not retry because it is possible that the table has already been | ||
// dropped. We will instead wait until findTables runs again and retry dropping the table then | ||
// if dropping it did indeed fail for some reason. | ||
logger.infof(t, "Failed to drop %s", table); | ||
return Observable.empty(); | ||
}); | ||
} | ||
|
||
} |
33 changes: 0 additions & 33 deletions
33
job-scheduler/src/main/java/org/hawkular/metrics/scheduler/api/RetryPolicy.java
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.