Add hdfs_tmp_cleaner.pl and update README to reflect what it is
Travis Campbell committed May 23, 2012
1 parent eb370fa commit 8ed32a6
Showing 2 changed files with 167 additions and 1 deletion.
10 changes: 9 additions & 1 deletion README.md
@@ -1,4 +1,12 @@
hadoop-scripts
==============

A group of scripts useful for managing a hadoop cluster.

bin/
hdfs_tmp_cleaner.pl

A tool to automate cleaning out /tmp inside HDFS. It only looks at the
top-level directory structure for file and directory timestamps; it will
not recursively descend into subdirectories of /tmp.
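
Example invocation (a sketch; the flags are the ones the script defines,
and the seven-day retention value here is just illustrative):

    ./bin/hdfs_tmp_cleaner.pl --keep-days 7 --rm-batch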
158 changes: 158 additions & 0 deletions bin/hdfs_tmp_cleaner.pl
@@ -0,0 +1,158 @@
#!/usr/bin/perl
#===============================================================================
#
# FILE: hdfs_tmp_cleaner.pl
#
# USAGE: ./hdfs_tmp_cleaner.pl
# --rm-batch - disable interactive prompting
# --keep-days - number of days to keep in /tmp
# --hdfs-path - HDFS URI prefix to strip from listing output
#
# DESCRIPTION: Cleans old files and directories out of /tmp inside HDFS.
#              Only top-level entries of /tmp are examined; subdirectories
#              are not recursively descended into.
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Travis Campbell (), <hcoyote@ghostar.org>
# COMPANY:
# VERSION: 1.0
# CREATED: 05/14/12 17:16:57 CDT
# REVISION: ---
#===============================================================================

use strict;
use warnings;


use IO::File;
use Time::ParseDate;
use Getopt::Long;


my $opt_rm_batch;
my $opt_hdfs_path = "hdfs://localhost:9000";
my $opt_keep_days = 2;

GetOptions(
    'rm-batch'    => \$opt_rm_batch,
    'keep-days=i' => \$opt_keep_days,
    'hdfs-path=s' => \$opt_hdfs_path,
    'help'        => sub {
        print "$0\n",
              "    --rm-batch  = delete stuff without prompting for confirmation\n",
              "    --keep-days = number of days back to keep (default: $opt_keep_days)\n",
              "    --hdfs-path = HDFS URI prefix to strip from listing output (default: $opt_hdfs_path)\n",
              "    --help      = show this help message\n\n";
        exit;
    },
);


# Don't change the number of seconds in a day until it's proven that the earth
# has slowed down due to friction. Mental note: check back in a few millennia.
my $SECONDS_PER_DAY = 86400;
my $KEEP_DAYS = $opt_keep_days;

# Things we need to work in HDFS; let's also limit the deletions to /tmp for now.
my $path = "/tmp";
my $hadoop_cmd = "/usr/bin/hadoop";
my $hadoop_ls = "$hadoop_cmd fs -ls ";
my $hadoop_rmr = "$hadoop_cmd fs -rmr ";
my $hadoop_du = "$hadoop_cmd fs -du ";
my $hadoop_dus = "$hadoop_cmd fs -du -s ";


my $total_size = 0;
my $total_delete_size = 0;

my %hdfs_path_info;

# When is now and who knows it?
my $date = parsedate("now");

my $fh = IO::File->new("$hadoop_ls $path|") or die "Could not open $hadoop_ls: $!";
my $dufh = IO::File->new("$hadoop_du $path|") or die "Could not open $hadoop_du: $!";

# grab the directory size for reporting later.
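# Note: old-style `hadoop fs -du` output is assumed here: one
# "<size> <full URI path>" line per top-level entry.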
while (<$dufh>) {
    next if /^Found \d+ items/;

    chomp;

    my ($du_size, $path) = split(/\s+/);

    # strip the URI prefix so du paths match the ls paths (\Q quotes any metacharacters)
    $path =~ s/\Q$opt_hdfs_path\E//;

    $hdfs_path_info{$path}{du} = $du_size;
}


# grab the list of file/directory metadata.
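# Each listing line is expected to look like:
#   <mode> <replicas> <user> <group> <size> <mod_date> <mod_time> <path>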
while (<$fh>) {
    # skip the header line
    next if /^Found \d+ items/;

    # nom nom the \r\n's!
    chomp;

    my ($mode, $replicas, $user, $group, $filesize, $mod_date, $mod_time, $path) = split(/\s+/);

    # give us seconds from the epoch
    my $file_time = parsedate("$mod_date $mod_time");

    # get the file age
    my $dur = $date - $file_time;

    # if we're looking at a directory, we want the du size in our totals; otherwise, filesize.
    if ($mode =~ /^d/) {
        $total_size += $hdfs_path_info{$path}{du};
    } else {
        $total_size += $filesize;
    }

    # if our file/dir is older than our threshold, nuke it from LEO.
    if ($dur > ($KEEP_DAYS * $SECONDS_PER_DAY)) {
        print "$mod_date $mod_time $path is > $KEEP_DAYS days old (DELETE CANDIDATE => " . $hdfs_path_info{$path}{du} . " bytes)\n";
        if ($mode =~ /^d/) {
            $total_delete_size += $hdfs_path_info{$path}{du};
        } else {
            $total_delete_size += $filesize;
        }

        $hdfs_path_info{$path}{deleteme} = 1;

    } else {
        print "$mod_date $mod_time $path is < $KEEP_DAYS days old\n";
    }
}

# Tell me what I'm about to delete so I know roughly how much space will free up in the DFS.
print "TOTAL DELETE CANDIDATES = " . $total_delete_size / (1024 * 1024 * 1024) . " gigabytes\n";
print "TOTAL SIZE              = " . $total_size / (1024 * 1024 * 1024) . " gigabytes\n";

# let's actually work on deleting things.
foreach my $path (sort keys %hdfs_path_info) {
    if (exists $hdfs_path_info{$path}{deleteme} and $hdfs_path_info{$path}{deleteme} == 1) {

        # Go interactive unless we're deleting in batch mode.
        if (not defined $opt_rm_batch) {
            print "Ready to delete $path? [y/N] ";

            my $prompt = <STDIN>;
            chomp $prompt;

            next unless ($prompt =~ /y/i);
        }

        # Delete it and report what happened; skip this path if the pipe can't be opened.
        my $rmfh = IO::File->new("$hadoop_rmr $path|");
        unless ($rmfh) {
            warn "Could not run $hadoop_rmr $path: $!";
            next;
        }
        while (<$rmfh>) {
            if (/^Deleted/) {
                print "Deletion of $path successful\n";
            } else {
                print;
            }
        }
    }
}
