Skip to content

Commit

Permalink
Run in infinite loop and notify via email
Browse files Browse the repository at this point in the history
Update this script so it can notify someone by email when tasks have been killed (or when there are general authentication problems).

Also, run the checks in an infinite loop so the script can run under supervisord, which lets us daemonize it.
  • Loading branch information
hcoyote committed Feb 18, 2015
1 parent f819945 commit f50bf34
Showing 1 changed file with 64 additions and 42 deletions.
106 changes: 64 additions & 42 deletions bin/mapred_find_stuck_tasks
@@ -1,16 +1,4 @@
#!/bin/bash
#
# This script attempts to find stuck MRv1 jobs running in an HA
# Hadoop cluster using job recovery, looking for jobs with tasks throwing
# "Error launching task" errors.
#
# Details on this issue can be found at https://issues.cloudera.org/browse/DISTRO-485
#
# This script should not be needed if you're running at least
#
# CDH 4.7
# CDH 5.0.1
# CDH 5.1.0

# Determine if we're kerberos enabled and attempt to find a working
# ticket before we start using hadoop commands, otherwise this just
Expand All @@ -31,6 +19,10 @@ fi
domain=$(facter domain 2>/dev/null)
cluster=$(hadoop fs -cat /CLUSTERNAME 2> /dev/null | grep ^cluster: | cut -f2 -d' ' || echo 'unknown')

# script defaults
sleep=900
notify_to=root

show_help() {
cat <<EOF
Expand All @@ -48,6 +40,8 @@ show_help() {
-s check for kerberos key first
Other
-t sleep period between checks: default $sleep
-n send email notification : default $notify_to
-v verbose
-? help
-h help
Expand All @@ -56,7 +50,7 @@ EOF


OPTIND=1
while getopts "ksvc:d:" opt ; do
while getopts "ksvc:d:n:t:" opt ; do
case "$opt" in
c) cluster=$OPTARG
;;
Expand All @@ -69,6 +63,8 @@ while getopts "ksvc:d:" opt ; do
;;
v) verbose=1
;;
t) sleep=$OPTARG
;;
'?'|h) show_help >&2
exit 1
;;
Expand All @@ -78,46 +74,72 @@ while getopts "ksvc:d:" opt ; do
done
shift "$((OPTIND-1))" # shift off the options and optional --

# Abort the script when `klist -s` reports a problem with the Kerberos ticket.
#
# Globals:  notify_to (read) - recipient of the alert email
#           USER (read)      - included in the alert subject
# Returns:  0 when `klist -s` succeeds.
# Exits:    1 (whole script) when `klist -s` exits non-zero, after sending an
#           empty-bodied email whose subject describes the failure.
klist_test() {
  # Nothing to report when klist is happy.
  klist -s && return 0

  # The subject line carries the whole message; stdin is /dev/null so the
  # mail body is empty, and any output from mail itself is discarded.
  mail -s "$0 kerberos authentication isn't working. ticket expired; k5start dead for $USER?" "$notify_to" < /dev/null > /dev/null 2>&1
  exit 1
}

#
if [[ -z $cluster ]] ; then
echo "Could not determine the clustername for this host"
exit 1
fi

for i in jobtracker1 jobtracker2 ; do
jt=`hadoop mrhaadmin -getServiceState $i 2>/dev/null`
case $jt in
active) active_jt=$i
break
;;
*);;
esac
done
while : ; do

job_list=$(hadoop job -list 2>/dev/null | grep job_ | cut -f1)

for job in $job_list ; do
url="http://${active_jt}.${cluster}.hdp.${domain}:50030/jobtasks.jsp?jobid=$job&type=map&pagenum=1&state=running"
run_status=$(mktemp /tmp/mapred_find_stuck_tasks.XXXXXXX) || exit 1

if [[ $verbose ]] ; then echo Attempting to query $job at $url; fi
curl -s "$url" | grep -q "Error launching task"
for i in jobtracker1 jobtracker2 ; do
klist_test

# save off the pipe status so we can re-use it a few times
status=(${PIPESTATUS[@]})
curl_status=${status[0]}
grep_status=${status[1]}
jt=`hadoop mrhaadmin -getServiceState $i 2>/dev/null`
case $jt in
active) active_jt=$i
break
;;
*);;
esac
done

if [[ "${curl_status}" -gt 0 ]] ; then
echo "Curl failed for status of $job; exit ${PIPESTATUS[0]}"
continue
fi
job_list=$(hadoop job -list 2>/dev/null | grep job_ | cut -f1)

for job in $job_list ; do
for type in map reduce ; do
url="http://${active_jt}.${cluster}.hdp.${domain}:50030/jobtasks.jsp?jobid=$job&type=$type&pagenum=1&state=running"

if [[ $verbose ]] ; then echo Attempting to query $job task type $type at $url >> "$run_status"; fi
curl -s "$url" | grep -q "Error launching task"

if [[ "${grep_status}" -eq 0 ]] ; then
echo "$job: errors launching tasks"
if [[ -n "${kill_jobs}" ]] ; then
echo "Attempting to kill $job"
hadoop job -kill $job
fi
# save off the pipe status so we can re-use it a few times
status=(${PIPESTATUS[@]})
curl_status=${status[0]}
grep_status=${status[1]}

# Skip this job/type when curl itself failed. Report the saved curl_status:
# PIPESTATUS is replaced after every command, so by this point it no longer
# holds the exit codes of the curl | grep pipeline.
if [[ "${curl_status}" -gt 0 ]] ; then
echo "Curl failed for status of $job type $type; exit ${curl_status}" >> "$run_status"
continue
fi

if [[ "${grep_status}" -eq 0 ]] ; then
echo "$job: errors launching tasks" >> "$run_status"
if [[ -n "${kill_jobs}" ]] ; then
echo "Attempting to kill $job" >> "$run_status"
klist_test
hadoop job -kill $job >> "$run_status" 2>&1
fi
fi
done
done

if grep -q "errors launching tasks" "$run_status" ; then
mail -s "$0 killed tasks" "$notify_to" < "$run_status" > /dev/null 2>&1
fi
done

find /tmp -maxdepth 1 -name "mapred_find_stuck_tasks.*" -type f -mtime +1 -print0 | xargs --no-run-if-empty -0 rm -f

sleep $sleep
done

0 comments on commit f50bf34

Please sign in to comment.