Permalink
Browse files

initial slab automover

Enable at startup with -o slab_reassign,slab_automove

Enable or disable at runtime with "slabs automove 1\r\n"

Has many weaknesses. Only pulls from slabs which have had zero recent
evictions. Is slow, not tunable, etc. Use the scripts/mc_slab_mover example to
write your own external automover if this doesn't satisfy.
  • Loading branch information...
1 parent 10698ba commit 99fc043af90170757a34802bd96111999013d0bc @dormando dormando committed Jan 4, 2012
Showing with 402 additions and 36 deletions.
  1. +9 −0 items.c
  2. +1 −0 items.h
  3. +60 −34 memcached.c
  4. +260 −0 scripts/mc_slab_mover
  5. +72 −2 slabs.c
View
9 items.c
@@ -398,6 +398,15 @@ char *do_item_cachedump(const unsigned int slabs_clsid, const unsigned int limit
return buffer;
}
+/*
+ * Copy the current eviction counter of every itemstats[] entry into the
+ * caller's array. `evicted` must have room for LARGEST_ID elements. The whole
+ * copy is made while cache_lock is held.
+ */
+void item_stats_evictions(uint64_t *evicted) {
+    int clsid = 0;
+
+    mutex_lock(&cache_lock);
+    while (clsid < LARGEST_ID) {
+        evicted[clsid] = itemstats[clsid].evicted;
+        clsid++;
+    }
+    pthread_mutex_unlock(&cache_lock);
+}
+
void do_item_stats(ADD_STAT add_stats, void *c) {
int i;
for (i = 0; i < LARGEST_ID; i++) {
View
1 items.h
@@ -24,3 +24,4 @@ item *do_item_get(const char *key, const size_t nkey, const uint32_t hv);
item *do_item_touch(const char *key, const size_t nkey, uint32_t exptime, const uint32_t hv);
void item_stats_reset(void);
extern pthread_mutex_t cache_lock;
+void item_stats_evictions(uint64_t *evicted);
View
94 memcached.c
@@ -3189,6 +3189,26 @@ static void process_verbosity_command(conn *c, token_t *tokens, const size_t nto
return;
}
+/*
+ * Handle "slabs automove <0|1>": switch the built-in slab automover off (0)
+ * or on (1) by updating settings.slab_automove.
+ *
+ * Replies "OK" on success and "ERROR" when the argument is anything other
+ * than exactly 0 or 1. The dispatcher only calls this with ntokens == 4, so
+ * tokens[2] holds the level argument.
+ */
+static void process_slabs_automove_command(conn *c, token_t *tokens, const size_t ntokens) {
+    unsigned long level;
+    char *endptr = NULL;
+
+    assert(c != NULL);
+
+    set_noreply_maybe(c, tokens, ntokens);
+
+    /*
+     * Keep the full unsigned long result: narrowing it to unsigned int could
+     * wrap an oversized value around to 0 or 1. Require at least one digit
+     * and no trailing characters so that input such as "abc" is rejected
+     * instead of parsing as 0 and silently disabling the automover.
+     * NOTE(review): assumes the tokenizer hands us a NUL-terminated token
+     * with the trailing "\r\n" already stripped -- confirm.
+     */
+    level = strtoul(tokens[2].value, &endptr, 10);
+    if (endptr == tokens[2].value || *endptr != '\0' || level > 1) {
+        out_string(c, "ERROR");
+        return;
+    }
+
+    settings.slab_automove = (level == 1);
+    out_string(c, "OK");
+}
+
static void process_command(conn *c, char *command) {
token_t tokens[MAX_TOKENS];
@@ -3303,45 +3323,51 @@ static void process_command(conn *c, char *command) {
conn_set_state(c, conn_closing);
- } else if (ntokens == 5 && (strcmp(tokens[COMMAND_TOKEN].value, "slabs") == 0 &&
- strcmp(tokens[COMMAND_TOKEN + 1].value, "reassign") == 0)) {
- int src, dst, rv;
+ } else if (strcmp(tokens[COMMAND_TOKEN].value, "slabs") == 0) {
+ if (ntokens == 5 && strcmp(tokens[COMMAND_TOKEN + 1].value, "reassign") == 0) {
+ int src, dst, rv;
- if (settings.slab_reassign == false) {
- out_string(c, "CLIENT_ERROR slab reassignment disabled");
- return;
- }
+ if (settings.slab_reassign == false) {
+ out_string(c, "CLIENT_ERROR slab reassignment disabled");
+ return;
+ }
- src = strtol(tokens[2].value, NULL, 10);
- dst = strtol(tokens[3].value, NULL, 10);
+ src = strtol(tokens[2].value, NULL, 10);
+ dst = strtol(tokens[3].value, NULL, 10);
- if (errno == ERANGE) {
- out_string(c, "CLIENT_ERROR bad command line format");
- return;
- }
+ if (errno == ERANGE) {
+ out_string(c, "CLIENT_ERROR bad command line format");
+ return;
+ }
- rv = slabs_reassign(src, dst);
- switch (rv) {
- case REASSIGN_OK:
- out_string(c, "OK");
- break;
- case REASSIGN_RUNNING:
- out_string(c, "BUSY");
- break;
- case REASSIGN_BADCLASS:
- out_string(c, "BADCLASS");
- break;
- case REASSIGN_NOSPARE:
- out_string(c, "NOSPARE");
- break;
- case REASSIGN_DEST_NOT_FULL:
- out_string(c, "NOTFULL");
- break;
- case REASSIGN_SRC_NOT_SAFE:
- out_string(c, "UNSAFE");
- break;
+ rv = slabs_reassign(src, dst);
+ switch (rv) {
+ case REASSIGN_OK:
+ out_string(c, "OK");
+ break;
+ case REASSIGN_RUNNING:
+ out_string(c, "BUSY");
+ break;
+ case REASSIGN_BADCLASS:
+ out_string(c, "BADCLASS");
+ break;
+ case REASSIGN_NOSPARE:
+ out_string(c, "NOSPARE");
+ break;
+ case REASSIGN_DEST_NOT_FULL:
+ out_string(c, "NOTFULL");
+ break;
+ case REASSIGN_SRC_NOT_SAFE:
+ out_string(c, "UNSAFE");
+ break;
+ }
+ return;
+ } else if (ntokens == 4 &&
+ (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) {
+ process_slabs_automove_command(c, tokens, ntokens);
+ } else {
+ out_string(c, "ERROR");
}
- return;
} else if ((ntokens == 3 || ntokens == 4) && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) {
process_verbosity_command(c, tokens, ntokens);
} else {
View
260 scripts/mc_slab_mover
@@ -0,0 +1,260 @@
+#! /usr/bin/perl
+# See memcached for LICENSE
+# Copyright 2011 Dormando (dormando@rydia.net)
+
+=head1 NAME
+
+mc_slab_mover -- example utility for slab page reassignment for memcached
+
+=head1 SYNOPSIS
+
+ $ mc_slab_mover --host="127.0.0.1:11211" --verbose
+ $ mc_slab_mover --host="127.0.0.1:11211" --automove
+ $ mc_slab_mover --host="127.0.0.1:11211" --sleep=60 --loops=4 --automove
+
+=head1 DESCRIPTION
+
+This utility is an example implementation of an algorithm for reassigning
+slab memory in a running memcached instance. If memcached's built-in
+automover isn't working for you, you may use this script as an example
+base and expand on it. We welcome modifications or alternatives on the
+mailing list.
+
+=head1 ALGORITHM
+
+The default algorithm is simple, and may serve for a common case: over
+time one slab may grow in use compared to others, and as evictions stop
+in one slab and start in another it will reassign memory.
+
+If a slab has the most evictions three times in a row, it will pull a page
+from a slab which has had zero evictions three times in a row.
+
+There are many traffic patterns where this does not work well. For example,
+if you never use expirations and rely on the LRU (so all slabs always evict),
+it is less likely to find source pages to move.
+
+=head1 OPTIONS
+
+=over
+
+=item --host="IP:PORT"
+
+The hostname to connect to. NOTE: If connection to the host breaks, script
+will stop.
+
+=item --sleep=10
+
+How long to wait between loops for gathering stats.
+
+=item --loops=3
+
+How many loops to run before making a decision for a move.
+
+=item --verbose
+
+Prints a formatted dump of some common statistics per loop.
+
+=item --automove
+
+Enables the automover, and will attempt to move memory around if it finds
+viable candidates.
+
+=back
+
+=head1 AUTHOR
+
+Dormando E<lt>L<dormando@rydia.net>E<gt>
+
+=head1 LICENSE
+
+Licensed for use and redistribution under the same terms as Memcached itself.
+
+=cut
+
+use warnings;
+use strict;
+
+use IO::Socket::INET;
+
+use FindBin;
+use Data::Dumper qw/Dumper/;
+use Getopt::Long;
+
+# Defaults; anything given on the command line overrides these.
+my %opts = (
+    'sleep'  => 10,
+    loops    => 3,
+    automove => 0,
+    verbose  => 0,
+);
+GetOptions(\%opts, "host=s", "sleep=i", "loops=i", "automove", "verbose")
+    or usage();
+
+if (!$opts{host}) {
+    die "Must specify at least --host='127.0.0.1:11211'";
+}
+
+my $sock = IO::Socket::INET->new(PeerAddr => $opts{host}, Timeout => 3);
+if (!$sock) {
+    die "$!\n";
+}
+
+# %stats counts how often each slab had the most evictions in a loop;
+# %move is the state automove_basic() keeps between loops.
+my %stats = ();
+my %move  = (winner => 0, wins => 0);
+
+# SIGUSR1 dumps the win counters; SIGINT dumps them and then exits.
+my $dump_stats = sub {
+    print "STATS: ", Dumper(\%stats), "\n";
+};
+$SIG{USR1} = $dump_stats;
+$SIG{INT}  = sub {
+    $dump_stats->();
+    exit;
+};
+run();
+
+# Print a short usage summary to stdout and exit with status 1.
+# Called when command-line option parsing fails.
+sub usage {
+    print qq{Usage:
+ mc_slab_mover --host="127.0.0.1:11211" --verbose --automove
+ run `perldoc mc_slab_mover` for full information
+
+};
+    exit 1;
+}
+
+# Main loop; never returns. Every $opts{sleep} seconds it takes a fresh stats
+# snapshot, diffs it against the previous one, optionally prints a per-slab
+# table (--verbose), records which slab had the most evictions, and hands the
+# ranking to automove_basic() when --automove is set.
+sub run {
+    my $slabs_before = grab_stats();
+
+    # $num as a percentage of $divisor; 0 when the divisor is zero or missing.
+    # The table below prints these under "%%" formats, so scale to 0..100.
+    my $pct = sub {
+        my ($num, $divisor) = @_;
+        return 0 unless $divisor;
+        return 100 * $num / $divisor;
+    };
+
+    while (1) {
+        sleep $opts{'sleep'};
+        my $slabs_after = grab_stats();
+
+        # calc_results_numratio() takes the same arguments and can be swapped
+        # in here as an alternative ranking.
+        my ($totals, $sorted) = calc_results_evicted($slabs_before, $slabs_after);
+
+        if ($opts{verbose}) {
+            printf " %02s: %-8s (pct ) %-10s (pct ) %-6s (pct ) get_hits (pct ) cmd_set (pct )\n",
+                'sb', 'evicted', 'items', 'pages';
+            for my $slab (@$sorted) {
+                printf " %02d: %-8d (%.2f%%) %-10s (%.4f%%) %-6d (%.2f%%) %-8d (%.3f%%) %-7d (%.2f%%)\n",
+                    $slab->{slab}, $slab->{evicted_d},
+                    $pct->($slab->{evicted_d}, $totals->{evicted_d}),
+                    $slab->{number},
+                    $pct->($slab->{number}, $totals->{number}),
+                    $slab->{total_pages},
+                    $pct->($slab->{total_pages}, $totals->{total_pages}),
+                    $slab->{get_hits_d},
+                    $pct->($slab->{get_hits_d}, $totals->{get_hits_d}),
+                    $slab->{cmd_set_d},
+                    $pct->($slab->{cmd_set_d}, $totals->{cmd_set_d});
+            }
+        }
+
+        # $sorted is ascending, so the last entry had the most evictions.
+        if (@$sorted) {
+            my $highest = $sorted->[-1];
+            $stats{$highest->{slab}}++;
+            print " (winner: ", $highest->{slab}, " wins: ", $stats{$highest->{slab}}, ")\n";
+            automove_basic($totals, $sorted) if ($opts{automove});
+        }
+
+        # Always advance the baseline, even when no slab appeared in both
+        # snapshots. Otherwise a script started against a server with no slab
+        # classes would keep its empty baseline forever and never see results.
+        $slabs_before = $slabs_after;
+    }
+}
+
+# Send "stats items" and "stats slabs" over $sock and merge both replies into
+# one structure. Returns a hashref: slab id => { stat name => value }.
+# Reading of each reply stops at the line starting with "END".
+sub grab_stats {
+    my %by_slab;
+
+    foreach my $section (qw/items slabs/) {
+        print $sock "stats $section\r\n";
+        while (defined(my $reply = <$sock>)) {
+            chomp $reply;
+            last if $reply =~ m/^END/;
+            # Lines that are not per-slab stats are ignored.
+            next unless $reply =~ m/^STAT (?:items:)?(\d+):(\S+) (\S+)/;
+            $by_slab{$1}->{$2} = $3;
+        }
+    }
+
+    return \%by_slab;
+}
+
+# Really stupid algo, same as the initial algo built into memcached.
+# If a slab "wins" most evictions $opts{loops} times in a row (3 by default),
+# pick a slab which has had 0 evictions that many times and move a page over.
+#
+# Arguments: ($totals, $sorted). $totals is not used here. $sorted is an
+# arrayref of per-slab hashes; the caller passes it sorted ascending by
+# evicted_d, so its last element is the slab with the most evictions.
+# State is kept between calls in the file-level %move hash
+# (winner, wins, zeroes).
+sub automove_basic {
+ my ($totals, $sorted) = @_;
+
+ my $source = 0;
+ my $dest = 0;
+ my $high = $sorted->[-1];
+ # Nothing was evicted anywhere this round: return without touching %move.
+ return unless $high->{evicted_d} > 0;
+ # Track how many calls in a row the same slab has been the top evictor;
+ # it becomes the destination once the streak reaches $opts{loops}.
+ if ($move{winner} == $high->{slab}) {
+ $move{wins}++;
+ $dest = $move{winner} if $move{wins} >= $opts{loops};
+ } else {
+ $move{wins} = 1;
+ $move{winner} = $high->{slab};
+ }
+ # Count, per slab, the rounds with zero evictions while holding more than
+ # two pages. The first slab in $sorted order whose count has reached
+ # $opts{loops} becomes the source. The count is dropped as soon as the
+ # slab evicts or has two or fewer pages.
+ for my $slab (@$sorted) {
+ my $id = $slab->{slab};
+ if ($slab->{evicted_d} == 0 && $slab->{total_pages} > 2) {
+ $move{zeroes}->{$id}++;
+ $source = $id if (!$source && $move{zeroes}->{$id} >= $opts{loops});
+ } else {
+ delete $move{zeroes}->{$slab->{slab}}
+ if exists $move{zeroes}->{$slab->{slab}};
+ }
+ }
+
+ # With both ends chosen, send the reassign command and echo the server's
+ # one-line reply; with only a destination, report that no source was found.
+ if ($source && $dest) {
+ print " slabs reassign $source $dest\n";
+ print $sock "slabs reassign $source $dest\r\n";
+ my $res = <$sock>;
+ print " RES: ", $res;
+ } elsif ($dest && !$source) {
+ print "FAIL: want to move memory to $dest but no valid source slab available\n";
+ }
+}
+
+# Rank slabs purely by how many evictions happened between two snapshots.
+# Arguments are passed straight through to calc_slabs(). Returns
+# ($totals, \@ranked) with @ranked in ascending evicted_d order.
+sub calc_results_evicted {
+    my ($by_id, $sums) = calc_slabs(@_);
+    my @ranked = sort { $a->{evicted_d} <=> $b->{evicted_d} } values %{$by_id};
+    return ($sums, \@ranked);
+}
+
+# Rank slabs by evictions weighted against the number of stored items.
+# Seems to fail as an experiment: evictions in underused classes tend to get
+# vastly inflated. Returns ($totals, \@ranked) with @ranked in ascending
+# numratio order.
+sub calc_results_numratio {
+    # Per-slab hook run by calc_slabs(): numratio = evicted_d / number,
+    # or 0 when nothing was evicted.
+    my $add_ratio = sub {
+        my ($sb, $sa, $s) = @_;
+        $s->{numratio} = $s->{evicted_d}
+            ? $s->{evicted_d} / $s->{number}
+            : 0;
+    };
+
+    my ($by_id, $sums) = calc_slabs(@_, $add_ratio);
+    my @ranked = sort { $a->{numratio} <=> $b->{numratio} } values %{$by_id};
+    return ($sums, \@ranked);
+}
+
+# Diff two stats snapshots (as returned by grab_stats).
+#
+# For every slab id present in both snapshots, build a copy of the "after"
+# entry and, for each stat whose value is all digits, add a "<stat>_d" key
+# holding after - before. The optional $code callback is then invoked as
+# $code->($before, $after, \%entry) and may add further keys. Finally the
+# entry's "slab" key is set to the id.
+#
+# Returns (\%by_id, \%sums), where %sums holds, per numeric stat, the sum of
+# the "after" values and (under "<stat>_d") the sum of the deltas.
+sub calc_slabs {
+    my ($slabs_before, $slabs_after, $code) = @_;
+    my (%by_id, %sums);
+
+    foreach my $id (keys %{$slabs_after}) {
+        my ($sb, $sa) = ($slabs_before->{$id}, $slabs_after->{$id});
+        next if !$sb || !$sa;
+
+        my %entry = %{$sa};
+        foreach my $stat (keys %{$sa}) {
+            # Only plain non-negative integers take part in totals and diffs.
+            next unless $sa->{$stat} =~ m/^\d+$/;
+            my $delta = $sa->{$stat} - $sb->{$stat};
+            $sums{$stat}         += $sa->{$stat};
+            $entry{$stat . '_d'}  = $delta;
+            $sums{$stat . '_d'}  += $delta;
+        }
+
+        # External code
+        $code->($sb, $sa, \%entry) if $code;
+        $entry{slab} = $id;
+        $by_id{$id}  = \%entry;
+    }
+
+    return (\%by_id, \%sums);
+}
View
74 slabs.c
@@ -627,16 +627,83 @@ static void slab_rebalance_finish(void) {
}
}
+/* Decide whether a slab page should be moved, and between which classes.
+ *
+ * Returns 1 when a decision was reached, in which case *src and *dst hold the
+ * source and destination class ids; returns 0 otherwise and leaves both
+ * untouched. State is kept in function-local statics between calls.
+ *
+ * Move to its own thread (created/destroyed as needed) once automover is more
+ * complex.
+ */
+static int slab_automove_decision(int *src, int *dst) {
+    static uint64_t prev_evicted[POWER_LARGEST];
+    static unsigned int zero_streak[POWER_LARGEST];
+    static unsigned int winner = 0;
+    static unsigned int win_streak = 0;
+    static rel_time_t next_run;
+    uint64_t cur_evicted[POWER_LARGEST];
+    unsigned int pages[POWER_LARGEST];
+    uint64_t top_delta = 0;
+    unsigned int top_class = 0;
+    int from = 0;
+    int to = 0;
+    int id;
+
+    /* Run less frequently than the slabmove tester: at most once per 10
+     * ticks of current_time. */
+    if (current_time < next_run)
+        return 0;
+    next_run = current_time + 10;
+
+    /* Snapshot eviction counters and per-class page counts. */
+    item_stats_evictions(cur_evicted);
+    pthread_mutex_lock(&cache_lock);
+    for (id = POWER_SMALLEST; id < power_largest; id++)
+        pages[id] = slabclass[id].slabs;
+    pthread_mutex_unlock(&cache_lock);
+
+    /* One pass finds both the candidate source (zero evictions on 3+
+     * consecutive runs while holding more than 2 pages) and the class with
+     * the most new evictions. */
+    for (id = POWER_SMALLEST; id < power_largest; id++) {
+        const uint64_t delta = cur_evicted[id] - prev_evicted[id];
+        prev_evicted[id] = cur_evicted[id];
+
+        if (delta != 0 || pages[id] <= 2) {
+            zero_streak[id] = 0;
+            if (delta > top_delta) {
+                top_delta = delta;
+                top_class = id;
+            }
+            continue;
+        }
+
+        zero_streak[id]++;
+        if (from == 0 && zero_streak[id] >= 3)
+            from = id;
+    }
+
+    /* A destination is valid once the same class has been the top evictor
+     * on 3 consecutive runs; any change of leader restarts the count. */
+    if (winner == 0 || winner != top_class) {
+        win_streak = 1;
+        winner = top_class;
+    } else if (++win_streak >= 3) {
+        to = winner;
+    }
+
+    if (from == 0 || to == 0)
+        return 0;
+
+    *src = from;
+    *dst = to;
+    return 1;
+}
+
/* Slab rebalancer thread.
* Does not use spinlocks since it is not timing sensitive. Burn less CPU and
* go to sleep if locks are contended
*/
static void *slab_maintenance_thread(void *arg) {
int was_busy = 0;
+ int src, dest;
while (do_run_slab_thread) {
- /* TODO: Call code to make a calculated decision */
-
if (slab_rebalance_signal == 1) {
if (slab_rebalance_start() < 0) {
/* Handle errors with more specifity as required. */
@@ -646,6 +713,9 @@ static void *slab_maintenance_thread(void *arg) {
} else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
/* If we have a decision to continue, continue it */
was_busy = slab_rebalance_move();
+ } else if (settings.slab_automove && slab_automove_decision(&src, &dest) == 1) {
+ /* Blind to the return codes. It will retry on its own */
+ slabs_reassign(src, dest);
}
if (slab_rebal.done) {

0 comments on commit 99fc043

Please sign in to comment.