/
paper.tex
2354 lines (2086 loc) · 97.4 KB
/
paper.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% can be 9pt
\documentclass{sigplanconf}
\usepackage{url}
\usepackage{tikz}
\usepackage{graphicx}
\usepackage{framed}
\usepackage{color}
\usepackage{url}
\usepackage{alltt}
\usepackage{tabularx}
\usepackage{subfigure}
\usepackage{fancyhdr}
\usepackage{paralist}
\usepackage{multirow}
\usepackage{pgf}
\usepackage{listings}
\lstset{ %
language=Haskell, % the language of the code
%%basicstyle= \scriptsize, % the size of the fonts that are used
%% for the code
%%firstnumber=0, % index to start numbering at
%%numbers=left, % where to put the line-numbers
%% numberstyle=\tiny\color{gray}, % the style that is used for the
%% line-numbers
%% stepnumber=5, % the step between two line-numbers. If it's
% 1, each line
%% % will be numbered
%% numbersep=5pt, % how far the line-numbers are from the code
%% xleftmargin=0ex,
%% backgroundcolor=\color{white}, % choose the background color. You must
%% add \usepackage{color}
%% showspaces=false, % show spaces adding particular underscores
%% showstringspaces=false, % underline spaces within strings
%% showtabs=false, % show tabs within strings adding
%% particular underscores
%% frame=single, % adds a frame around the code
%% rulecolor=\color{black}, % if not set, the frame-color may be
%% changed on line-breaks within not-black text (e.g. commens (green here))
%% tabsize=2, % sets default tabsize to 2 spaces
%% captionpos=b, % sets the caption-position to bottom
%% breaklines=true, % sets automatic line breaking
%% breakatwhitespace=false, % sets if automatic breaks should only
%% happen at whitespace
%% title=\lstname, % show the filename of files included
%% with \lstinputlisting;
%% % also try caption instead of title
%% keywordstyle=\color{blue}, % keyword style
%% commentstyle=\color{dkgreen}, % comment style
%% stringstyle=\color{mauve}, % string literal style
escapeinside={@*}{*@}, % if you want to add a comment within your
% code
%% morekeywords={*,...} % if you want to add more keywords to
%% the set
}
%% \usepackage{tikz}
%% \usetikzlibrary{arrows,automata}
%% \usepackage{amsmath}
%% \usepackage{amsthm}
%% \newtheoremstyle{lst}
%% {\topsep} % ABOVESPACE
%% {\topsep} % BELOWSPACE
%% {\itshape} % BODYFONT
%% {0pt} % INDENT (empty value is the same as 0pt)
%% {} % HEADFONT
%% {} % HEADPUNCT
%% {5pt plus 1pt minus 1pt} % HEADSPACE
%% {(\thmnumber{#2})} % CUSTOM-HEAD-SPEC
%% \theoremstyle{lst}
%% \newtheorem{codenum}{lst}
\usepackage{float}
\floatstyle{boxed}
\restylefloat{figure}
%% \include{math}
\newenvironment{code}{\begin{alltt}\footnotesize}{\end{alltt}}
%% \newenvironment{cols}{\begin{tabular}{m{3.6cm}|m{3.6cm}}{Haskell} &
\newcommand{\ttp}[1]{\texttt{#1}}
\usepackage{ifthen}
\newboolean{submission} %set to true for the submission version
\setboolean{submission}{false}
%\setboolean{submission}{true}
\ifthenelse
{\boolean{submission}}
{ \newcommand{\todo}[1]{ } } %hide todo
{ \newcommand{\todo}[1]{ {\color{blue}$<<$#1$>>$}
}}
%\usepackage{fancyhdr}
\newcommand{\sub}[1]{\(\sb{#1}\)}
\newcommand{\undef}[1]{{\bf #1}}
\newcommand{\sci}[2]{\ensuremath{#1 \times 10^{#2}}}
\begin{document}
%% \conferenceinfo{ICFP'12,} {September 9--15, 2012, Copenhagen, Denmark.}
%% \CopyrightYear{2012}
%% \copyrightdata{978-1-4503-1054-3/12/09}
%\titlebanner{banner above paper title} % These are ignored unless
%\preprintfooter{} % 'preprint' option specified.
\title{SmartCheck: Automatic and Efficient Counterexample Reduction and Generalization}
\subtitle{UNPUBLISHED DRAFT}
\authorinfo{Lee Pike}
{Galois, Inc.}
{leepike@galois.com}
\maketitle
\begin{abstract}
QuickCheck, developed by Claessen and Hughes, is a powerful library for
automatic test-case generation. Because QuickCheck performs random testing,
some of the counterexamples discovered are very large. QuickCheck provides an
interface for the user to write shrink functions to attempt to reduce the size
of counterexamples. Hand-written implementations of shrink can be complex,
inefficient, and consist of significant boilerplate code. Furthermore,
shrinking is only one aspect in debugging: counterexample generalization is the
process of extrapolating from individual counterexamples to a class of
counterexamples, often requiring a flash of insight from the programmer. To
improve counterexample reduction and generalization, we introduce
\emph{SmartCheck}. SmartCheck is a debugging tool that reduces algebraic data
using generic search heuristics to efficiently find smaller counterexamples. In
addition to shrinking, SmartCheck also automatically generalizes counterexamples
to formulas representing classes of counterexamples. SmartCheck has been implemented for Haskell and is freely available.
\end{abstract}
%% \category{D.2.4}{Software/Program Verification}{Reliability}
%% \terms
%% Languages, Verification
%% \keywords
%% embedded domain-specific language, compiler, verification
\section{Introduction}\label{sec:intro}
The QuickCheck testing framework was a revolutionary step-forward in
type-directed random testing~\cite{qc,monadic}. Originally designed for Haskell,
QuickCheck has been ported to other languages and is a now a widely-used testing
tool. Because QuickCheck generates random values for testing, counterexamples
it finds may be substantially larger than a minimal counterexample. In their
original QuickCheck paper~\cite{qc}, the authors report the following user
experience by Andy Gill:
%
\begin{quote}
Sometimes the counterexamples found are very large and it is difficult to go
back to the property and understand why it is a counterexample.
\end{quote}
That is, automated testing helps find a counterexample, but it may be a very
large counterexample. We are left facing the \emph{shrinking problem}: assume
there exist a set of inputs $i_0, \ i_1, \ldots, \i_n$, such that for program
$P$ (modeled as a function mapping inputs to output) and property $\phi$,
$\phi(P(i_0, \ i_1, \ldots, \i_n))$ is false. The shrinking problem is to find
the smallest set of inputs $i'_0, \ i'_1, \ldots, \i'_n$, according to some
measure, such that $\phi(P(i\_0, \ i\_1, \ldots, \i\_n))$ is also false.
In general, the problem too expensive to solve, since it requires showing that
for a set of failing inputs, there exist no strictly smaller inputs that also
cause the property to fail. Such a demonstration requires exhaustive testing or
proof. Solutions to the shrinking problem are therefore approximated to
efficiently discover sufficiently small inputs causing the input to fail.
Usually, the shrunk inputs are derived algorithmically (but perhaps
nondeterministically) from a known larger set of inputs resulting in failure,
and the inputs may not be minimal.
To address the shrinking problem, QuickCheck defines a type class
\ttp{Arbitrary} that presents a method \ttp{arbitrary} for generating random
values of a given type. Gill added another method to the type class:
%
\begin{code}
smaller :: a -> [a]
\end{code}
%
\noindent
The purpose of \ttp{smaller} is to generate strictly smaller values, according
to some measure, from a given counterexample. These new values are then tested
to attempt to find a smaller counterexample. Today, \ttp{smaller} is called
\ttp{shrink}.
In industrial uses, shrinking is essential. In describing commercial
applications of QuickCheck, Hughes has noted that ``without it [shrinking],
randomly generated failing cases would often be so large as to be almost
useless.''~\cite{qcjh}. Hughes~\emph{et al.} also give an extended example in
which shrinking is essential in debugging telecom software~\cite{telecom}.
Defining an efficient and effective shrink method requires a good understanding
of how shrinking in QuickCheck works and the semantics of the property and
program being evaluated. Bad definitions can be so slow or so ineffective at
shrinking that they are unusable.
In addition, shrinking is one side of the coin when it comes to making
counterexamples more understandable: the other side is extrapolation from
individual counterexamples to a class of counterexamples. This leap of
abstraction is often implicitly made by the programmer in determining the reason
why counterexamples fail the property. For example, the following is a
relatively small counterexample returned when QuickChecking a property in (a
bug-injected version of) Xmonad, a popular X11 window manager written in
Haskell~\cite{xmonad}:
%
\medskip%
\begin{sloppypar}
%\footnotesize
\noindent%
\ttp{StackSet \{current = Screen \{workspace = Workspace \{tag = NonNegative
\{getNonNegative = 0\}, layout = -1, stack = Just (Stack \{focus = 'S', up =
"", down = ""\})\}, screen = 1, screenDetail = 1\}, visible = [Screen
\{workspace = Workspace \{tag = NonNegative \{getNonNegative = 2\}, layout = -1,
stack = Nothing\}, screen = 2, screenDetail = -1\},Screen \{workspace =
Workspace \{tag = NonNegative \{getNonNegative = 3\}, layout = -1, stack =
Nothing\}, screen = 0, screenDetail = -1\}], hidden = [Workspace \{tag =
NonNegative \{getNonNegative = 1\}, layout = -1, stack = Just (Stack \{focus
= `\char`\\NUL', up = "", down = ""\})\},Workspace \{tag = NonNegative
\{getNonNegative = 4\}, layout = -1, stack = Just (Stack \{focus = 'I', up =
"", down = ""\})\}], floating = fromList []\} }
\end{sloppypar}
\medskip%
\noindent
(This counterexample uses Haskell's default \ttp{Show} instances, which uses
record syntax.) Programmers may be familiar with having to debug
a ``wall of text'' as shown above. What if instead a
formula like the following were returned, stating that for any well-typed
values $x_0$, $x_1$, $x_2$, and $x_3$, tested, a counterexample is found?
%
\begin{samepage}
\begin{code}
forall values x0 x1 x2 x3:
StackSet
(Screen (Workspace x0 (-1) (Just x1)) 1 1)
x2 x3 (fromList [])
\end{code}
\end{samepage}
%
\noindent
The formula quantifies away all the irrelevant portions of the data structure
with respect to the property, so that the user can focus on the heart of the
problem in a class of counterexamples.
\paragraph{SmartCheck}
Motivated by the problems of reducing and generalizing large counterexamples, we
developed \emph{SmartCheck}. SmartCheck takes a counterexample produced by some
oracle and generically minimizes and generalizes the counterexample.
As shown in the motivating example in Section~\ref{sec:example}, SmartCheck
can shrink more than an order-of-magnitude faster than even the most efficient hand-written
shrink implementations for some properties and programs. As well, the time
required to shrink scales linearly with the size of the counterexample---other
common shrink implementations often scale in polynomial time or worse.
(Shrinking time is important in real-world uses; see
Section~\ref{sec:experiments} for examples.)
After presenting some preliminary definitions in
Section~\ref{sec:preliminaries}, in Section~\ref{sec:shrinking}, we describe the
generic shrinking algorithm implemented. The algorithm is particularly novel in
that it generates new random sub-values during the reduction phase (see related
work in Section~\ref{sec:related}).
%% than the tuple or list-truncation approaches
%% described earlier, and the size of resulting counterexamples are two-five times
%% smaller (more detailed performance numbers are given in
%% Section~\ref{sec:performance}). Furthermore, other shrink approaches tend to
%% have ``long tails,'' meaning that for some original counterexamples, that are
%% not shrunk much or take a very long time to shrink.
%% Motivated by these limitations of QuickCheck and SmallCheck, we developed
%% \emph{SmartCheck}.
SmartCheck implements three novel approaches to automatically generalize
counterexamples, which are described in Section~\ref{sec:generalization}. The
first algorithm universally quantifies sub-values that always fail in tests.
The second algorithm existentially quantifies sub-values for types in which
every possible variant fails the property. For example, finding counterexamples
(\ttp{Left 2}) and (\ttp{Right True}) for the type
%
\begin{code}
\ttp{Either Int Bool}
\end{code}
%
\noindent
means there exists a counterexample regardless of the variant chosen.
Existential generalization is useful for large sum types, as found in
abstract syntax tree (AST) definitions, for example.
The third algorithm automatically strengthens properties by omitting ``similar''
counterexamples to the ones previously observed. The algorithm is motivated by
noting that there are often multiple ways in which a property may fail; for
example, a property stating that pretty-printing an AST and then parsing it
results in the original AST may fail due to multiple bugs. During testing, it
is useful to discover counterexamples arising from each bug in one go. In
practice, the problem is solved by discovering a counterexample \ttp{cex},
abstracting it, and then adding a new precondition to the property that
informally says ``omit counterexamples of form \ttp{cex}.'' Adding
preconditions manually is laborious and may cause the programmer to make
premature fixes to the program, if she believes she has isolated the error
before she actually does.
We describe our implementation based on generic programming in
Section~\ref{sec:implementation}; the implementation is open-source. In
Section~\ref{sec:experiments}, we discuss some of our experiences with using
SmartCheck, including checking properties from XMonad and a natural language
processing library.
\section{A Motivating Example}\label{sec:example}
\begin{figure}[ht]
\begin{code}
type I = [Int16]
data T = T I I I I I
toList :: T -> [[Int16]]
toList (T i0 i1 i2 i3 i4) =
(map . map) fromIntegral [i0, i1, i2, i3, i4]
pre :: T -> Bool
pre t = all ((>) 256 . sum) (toList t)
post :: T -> Bool
post t = (sum . concat) (toList t) < 5 * 256
prop :: T -> Property
prop t = pre t ==> post t
\end{code}
\caption{Example program and property.}
\label{fig:initial}
\end{figure}
In this section, we motivate in more detail the challenges in shrinking
counterexamples by comparing manual approaches using QuickCheck to SmartCheck.
(We focus on shrinking rather than generalization here since counterexample
generalization is unique to SmartCheck.) We will show how a small data type and
simple property can result in large counterexamples without any shrinking. Then
we show the difficulty in designing an efficient shrink implementation manually,
which is required when using QuickCheck; we will walk through a couple of poor
designs before arriving at a ``canonical'' manual solution. However, even the
canonical solution produces significantly larger counterexamples and takes
significantly longer to produce them than SmartCheck, even though SmartCheck's
shrinking strategy is completely automatic and generic.
Consider the example in Figure~\ref{fig:initial}.\footnote{All examples and
algorithms in this paper are presented in Haskell 2010~\cite{haskell2010} plus
the language extensions of existential type quantification and scoped type
variables.} Data type \ttp{T} is a product type containing five lists of
signed 8-bit integers. For our purposes, it is not important what \ttp{T}
models, but for concreteness, suppose it models the values in five linked lists.
Now suppose we are modeling some program that serializes values of type \ttp{T}.
The input to the program satisfies the invariant \ttp{pre}, that the sum of
values in each list of \ttp{Int16}s is less than or equal to 256. Assuming this,
we want to show \ttp{post} holds, that the sum of all the values from \ttp{T} is
less than $5 * 256$, where five is the number of fields in \ttp{T}. At first
glance, the property seems reasonable. But we have forgotten about underflow;
for example, the value
%
\begin{code}
T [-20000] [-20000] [] [] []
\end{code}
%
\noindent
satisfies \ttp{pre} but fails \ttp{post} (the \ttp{==>} operator in the figure
is implication from the QuickCheck library).
Despite the simplicity of the example, a typical counterexample returned by
QuickCheck can be large. With standard settings and no shrinking, the median
counterexample discovered contains just under 70 \ttp{Int16} values, and
counterexamples can contain over 100 values. Thus, it pays to define
\ttp{shrink}!
We might first naively try to shrink counterexamples for a data type like
\ttp{T} by taking the cross-product of shrunk values over the arguments to the
constructor \ttp{T}. This can be expressed using Haskell's list-comprehension
notation:
%
\begin{code}
shrink (T w i0 i1 i2 i3) =
[ T a b c d e | a <- shrink w, b <- shrink i0
, c <- shrink i1, d <- shrink i2
, e <- shrink i3 ]
\end{code}
%
\noindent
While the above definition appears reasonable on first glance, there are two
problems with it. First, the result of (\ttp{shrink t}) is null if any list
contained in \ttp{t} is null, in which case \ttp{t} will not be shrunk. More
troublesome is that with QuickCheck's default \ttp{Arbitrary} instances, the
length of potential counterexamples returned by \ttp{shrink} can easily exceed
$10^{10}$, which is an intractable number of tests for QuickCheck to analyze.
The reason for the blowup is that shrinking a list \ttp{[a]} produces a list
\ttp{[[a]]}, a list of lists; each list is a new shrinking of the original list.
Then, we take the cross-product of the generated lists. For the example above,
with some counterexamples, the shrinking stage may appear to execute for hours,
consuming ever more memory, without returning a result.
The first way we will control the complexity is focusing on structurally
shrinking the data type rather than shrinking the base types. We prevent
QuickCheck from using its default shrink instance for \ttp{Int16} by defining a
newtype:
%
\begin{code}
newtype T = T { getInt :: Int16 }
\end{code}
%
\noindent
Without shrinking the \ttp{Int16}s, the performance is dramatically improved.
Next, a programmer might try to control the complexity by truncating lists using
truncation where
%
\begin{code}
take n ls
\end{code}
%
returns the first $n$ elements \ttp{ls}. The trade-off is quicker shrinking
with a lower probability of finding a smaller counterexample. For example, we
might redefine shrink as follows:
%
\begin{samepage}
\begin{code}
shrink (T w i0 i1 i2 i3) =
[ T a b c d e | a <- tk w
, b <- tk i0, c <- tk i1
, d <- tk i2, e <- tk i3 ]
where tk x = take 10 (shrink x)
\end{code}
\end{samepage}
%
\noindent
Call this version ``truncated shrink''. Truncation controls the blowup of the
input-space, but the downside is that potentially smaller counterexamples may be
omitted.
Someone who understands the semantics of QuickCheck's \ttp{shrink}
implementation defines a \ttp{shrink} instance as follows:\footnote{The
following approach was suggested to the author by John Hughes.}
%
\begin{code}
shrink (T w i0 i1 i2 i3) = map go xs
where xs = shrink (w, i0, i1, i2, i3)
go (w', i0', i1', i2', i3')
= T w' i0' i1' i2' i3'
\end{code}
%
\noindent
This ``tuple shrink'' definition does not suffer the same shortcomings: it
shrinks a value even if it contains an empty list, and the combinatorial blowup
of shrink candidates is avoided, since a pair \ttp{(a,b)} is shrunk by
attempting to shrink \ttp{a} while holding \ttp{b} constant, then attempting to
shrink \ttp{b} while holding \ttp{a} constant (and is generalized from pairs to
arbitrary tuples). Thus, each argument to \ttp{T} is independently shrunk.
Still, using this definition, from some initial counterexamples, shrinking may
result in a final counterexample containing just over 100 \ttp{Int16} values.
SmallCheck is another testing framework for Haskell for which shrinking is
irrelevant: SmallCheck is guaranteed to return a smallest counterexample, if
one exists~\cite{sc}. SmallCheck does this by enumerating all possible inputs,
ordered from smallest to largest, up to some user-defined bound. While
SmallCheck is effective for testing many programs and properties (in accordance
with the \emph{small scope hypothesis}~\cite{jackson}), counterexamples to even
relatively simple properties may be infeasible to discover due to input-space
explosion.
%% \begin{figure}
%% \begin{code}
%% instance Serial Int16 where
%% series d = drawnFrom [(-d')..d']
%% where d' = fromIntegral d
%% instance Serial A where series = cons1 A
%% instance Serial B where series = cons4 B
%% \end{code}
%% \caption{SmallCheck instances for the data types in
%% Figure~\ref{fig:initial}.}
%% \label{fig:smallshrink}
%% \end{figure}
For a product type, SmallCheck must check values produced by taking the
cross-product of the type's fields at a given depth. Unfortunately, using
default instances, SmallCheck fails long before it reaches a depth required to
discover a counterexample. (A related library named Feat combines some aspects
of SmallCheck and QuickCheck~\cite{feat}; we discuss it in
Section~\ref{sec:related}).
%% is unable to verify the property from Figure~\ref{fig:initial}, for much the
%% same reason that \ttp{shrink} is inefficient
%% For the property \ttp{prop} defined in Figure~\ref{fig:initial} (suitably
%% redefined to use SmallCheck's types).
%% to depth 16 to find the first counterexample. Unfortunately, after a couple
%% hours of testing, Lazy SmallCheck is still checking values at depth six, and
%% the number of tests scales with respect to the depth.\footnote{All tests in
%% this paper are performed on a four-core Pentium~i7 running at 2.7GHz with 8GB
%% RAM on Fedora~16. This test, and others unless noted, are performed using
%% interpreted Haskell under GHC 7.4.2.\todo{redo tests?}}
%% For example, for type \ttp{Int}, the depth $d$ is defined as the integers in
%% the list enumerating from $-d$ to $d$ (i.e., \ttp{[(-d)..d]}).
%% Figure~\ref{fig:smallshrink} contains functions for generating Lazy
%% SmallCheck (a variant of SmallCheck) tests by defining instances for the
%% \ttp{Serial} class.
%% Finally, even if we could overcome the problems described above with QuickCheck
%% and SmallCheck, we are left with two others.
%
%% Second, during testing, it is sometimes helpful to cover the input space
%% by discovering different counterexamples to \ttp{prop}. In the example in
%% Figure~\ref{fig:initial}, one counterexample is fairly representative of
%% others, but this is not true in general (e.g., consider properties about
%% abstract syntax trees). Ideally, the testing framework would automatically
%% add a predicate characterizing a counterexample as the precondition to
%% \ttp{prop} that strengthens the property. That is, for a specific
%% counterexample \ttp{ex}, we have a predicate \ttp{predEx} that holds if its
%% argument is another counterexample that is similar to (i.e., some
%% generalization of) \ttp{ex}. Then we test against a new property
%% %
%% \begin{code}
%% prop' a = not (cex a) ==> prop a
%% \end{code}
%
%% \noindent
%% where \ttp{==>} is QuickCheck's implication operator that discards tests that
%% violate the precondition.
%\end{itemize}
\begin{figure}[ht]
\includegraphics[scale=0.6]{../regression/PaperExample1/out/data}
%% \includegraphics[scale=0.6]{../regression/PaperExample1/out/time-big}
%% \includegraphics[scale=0.6]{../regression/PaperExample1/out/time-small}
\caption{Results for 1000 tests of the program in Figure~\ref{fig:initial}.}
\label{fig:graphs}
\end{figure}
\begin{table}[ht]
\footnotesize
\begin{center}
\begin{tabular}{|r||c|c|c|c|}
\hline
& QC none & QC trunc & QC tuple & SmartCheck\\
\hline \hline
Mean & 81, 0.013s & 40, 0.231s & 13, 0.228s & 6, 0.022s\\
\hline
Std. dev. & 21, 0.001 & 21, 1.742 & 7, 0.962 & 4, 0.001s\\
% Median & 0.009s, 67 & 0.17s, 33 & 1.81s, 13 & 0.10s, 6\\
%% \hline
%% MAD & 0.02s, 13 & 0.09s, 8 & 0.62s, 3 & 0.02s, 2\\
\hline
95\% & 116, 0.025s & 84, 0.283s & 26, 0.501s & 13, 0.039s\\
\hline
\end{tabular}
\end{center}
\caption{Summarizing data for the graph in Figure~\ref{fig:graphs}. Entries
contain execution time (in seconds) and counterexample sizes (counting
\ttp{Int16} values).}
\label{table:results}
\end{table}
To motivate the benefits of SmartCheck reduction algorithm, consider
Table~\ref{table:results} and the corresponding graphs in
Figure~\ref{fig:graphs}, giving the results for using QuickCheck and SmartCheck
on the program shown in Figure~\ref{fig:initial} (we omit SmallCheck, since as
noted, it is not even feasible for the example with default
instances).\footnote{All results reported in the paper are with a
GHC-7.6.3-compiled program, using \ttp{-O2}, on a 2.8 GHz 4 core i7 with 16GB
memory under OSX v. 10.9.2.} We graph the final counterexample size and
execution time.
The experiments show three approaches to shrinking with QuickCheck: no shrinking
(QC none), to provide a baseline, truncated shrinking (QC trunc), as described
above, in which each list is truncated to 10 elements and \ttp{Int16} values are
not shrunk, and tuple shrinking (QC tuple), also described above, and
\ttp{Int16} values are also not shrunk.
With each shrinking strategy, we generate 1000 runs in which a random
counterexample is discovered (by QuickCheck), and then shrunk. We plot the
number of runs resulting in a shrunk counterexample of a given size.
In Table~\ref{table:results}, we summarize the graphed values and provide
execution time in seconds as well. For both execution time and final value
size, we summarize the mean, median, standard deviation, and the results at the
95th percentile. (While we provide the standard deviations, note that the plots
are not necessarily Gaussian.)
The execution time includes the time to
discover an initial counterexample as well as time to shrink it.
SmartCheck produces smaller counterexamples than the other approaches, with a
very small tail.
More dramatic, however, are the differences in execution time. The execution
times for SmartCheck include the initial counterexample generation time required
by QuickCheck (QC none); thus the time required by SmartCheck to shrink a
counterexample is on average just two hundredths of a second (0.10 - 0.08). As
can be seen in the graphs, SmartCheck's execution time scales linearly (like
QuickCheck without shrinking). Truncation and tuple shrinking produce very long
tails; the longest execution time recorded for truncation shrinking and tuple
shrinking are 88 seconds and 110 seconds, respectively, although these outliers
can vary wildly. SmartCheck is largely insensitive to the size of the original
counterexample provided.
%% In regards to generalization, after shrinking the counterexample, SmartCheck
%% will also generalize it. Note that for any \ttp{Word8} value in the first
%% argument of \ttp{T}, the counterexample still holds. For example, SmartCheck
%% might return a counterexample of the form
%% %
%% \begin{code}
%% forall x . T x [-20000] [-20000] [] []
%% \end{code}
Finally, these performance results for SmartCheck are fairly generalizable to
other types and properties, since the algorithm is not dependent on the specific
program or property being checked. However, the results are particularly
pronounced for counterexamples that are particularly ``deep'' or ``broad'' in
terms of their tree structure. %% For programs and properties for which small
%% counterexamples are easily found, QuickCheck's shrinking is perfectly
%% acceptable. Furthermore, user-defined shrink methods may vary significantly
%% based on the property and counterexample type, so it is not always clear which
%% implementation to compare against.
We now describe in detail SmartCheck's algorithm for generic shrinking.
%% \begin{itemize}
%% \item an efficient counterexample reduction strategy for algebraic
%% data types (Section~\ref{sec:shrinking});
%\item an approach to counterexample generalization, to present the user with a
%% formula describing a set of counterexamples to a property
%% (Section~\ref{sec:generalization});
%% \item an approach to generically strengthen a property with a precondition that
%% characterizes an already-discovered counterexample for the purpose of finding
%% new, dissimilar counterexamples (Section~\ref{sec:generalization});
%\item
%% \item More generally, this paper is the first to explore the idea of
%% efficient, generic counterexample reduction and generalization for
%% functional program testing.
% \end{itemize}
%% \noindent Moreover, the inability to quickly uncover understandable
%% counterexamples is a real-world problem with QuickCheck (or other functional
%% languages). It works great when your tests pass,
%% \paragraph{Outline.}
%% The paper precedes as follows. We define a type class and some definitions in
%% Section~\ref{sec:preliminaries} used in the remainder of the paper. In
%% Section~\ref{sec:shrinking}, we describe SmartCheck's reduction algorithm for
%% efficiently shrinking large counterexamples. In
%% Section~\ref{sec:generalization}, we turn our attention to the problem of
%% counterexample generalization. Here, we present two algorithms for generalizing
%% counterexamples and constructing formulas for describing them. We also discuss
%% a method for property strengthening to discover new counterexamples. We
%% describe implementation-specific details in Section~\ref{sec:implementation},
%% and the results of some experiments in using our tool are presented in
%% Section~\ref{sec:experiments}. Related work is given in
%% Section~\ref{sec:related}. Finally, concluding remarks and future work are
%% presented in Section~\ref{sec:conclusions}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Preliminaries}\label{sec:preliminaries}
SmartCheck focuses on algebraic data types represented as ``sums of products''
including recursive and mutually recursive types. We sometimes refer to the
elements of a sum type as a \emph{variant} that is \emph{tagged} by a
constructor. For example, the type
%
\begin{code}
Maybe a = Just a | Nothing
\end{code}
%
\noindent
contains two variants, tagged by the constructors \ttp{Just} and \ttp{Nothing},
respectively.
To present the algorithms in the following sections, we provide some definitions
in the form of methods of a \ttp{SubTypes} type class. (SmartCheck requires
instances to be defined for data being analyzed; in
Section~\ref{sec:implementation}, we describe how instances for the type class
are derived automatically.)
%
\begin{code}
type Idx = Int
type Size = Int
data SubVal = forall a. SubTypes a => SubVal a
class Arbitrary a => SubTypes a where
size :: a -> Size
index :: a -> Idx -> Maybe SubVal
replace :: a -> Idx -> SubVal -> a
constr :: a -> String
constrs :: a -> [String]
opaque :: a -> Bool
subVals :: a -> Tree SubVal
\end{code}
%
\noindent
The \ttp{SubTypes} type class requires QuickCheck's \ttp{Arbitrary} as a
super-class. \ttp{SubTypes} has the following methods:
\begin{itemize}
\item \ttp{size} returns the size of a value---intuitively, the number of
constructors contained within it,
\item \ttp{index} returns a \emph{sub-value} at a breadth-first index in a
value,
\item \ttp{replace} replaces a sub-value at a particular focus (returning the
original value if the index is out-of-bounds),
\item \ttp{constr} returns a string representation of the constructor tagging
the value,
\item \ttp{constrs} returns the list of all possible constructors from the
value's type,
\item \ttp{opaque} is false when the type of the value is an ``interesting
type''; informally, this is a type other than a primitive type like \ttp{Int}
or \ttp{Char}. See Section~\ref{sec:base} for a full discussion.
\item \ttp{subVals} returns a tree of all non opaque-type sub-values. A tree is a
value with the type
%
\begin{code}
data Tree a = Node \{ rootLabel :: a
, subForest :: [Tree a] \}
\end{code}
%
\end{itemize}
\noindent
To illustrate typical evaluations of the methods, consider a binary tree type:
%
\begin{code}
data T = L | B T T
\end{code}
%
\noindent
and the value \ttp{tree}, labeled with indexes in a breadth-first order:
%
\begin{code}
tree = B\sub{0} (B\sub{1} L\sub{3}
(B\sub{4} L\sub{6} L\sub{8}))
(B\sub{2} L\sub{5} L\sub{7})
\end{code}
%
\noindent
Here are example applications of \ttp{SubTypes} methods; in the following, we
show the indexes with respect to the value \ttp{tree}:
%
\begin{code}
size tree = 9
index tree 4 = (Just . SubVal) (B\sub{4} L\sub{6} L\sub{8})
index tree 12 = Nothing
replace tree 2 (SubVal L) =
B\sub{0} (B\sub{1} L\sub{3}
(B\sub{4} L\sub{6} L\sub{8}))
L
constr tree = ["B"]
constrs tree = ["B", "L"]
constrs L = ["B", "L"]
opaque (3 :: Int) = True
opaque tree = False
opaque L = False
subVals (B (B L\sub{0} L\sub{1}) L\sub{2}) =
Node (B L\sub{0} L\sub{1}) [Node L\sub{0} [], Node L\sub{1} []]
(Node L\sub{1} [])
\end{code}
%
\noindent
The \ttp{SubVal} type is an existential data type, used as a generic container
for sub-values from a counterexample. We will sometimes refer to the unwrapped
value returned by \ttp{index a i} as the \emph{$i$th sub-value of $a$}, so for
example, (\ttp{B\sub{4} L\sub{6} L\sub{8}}) is the 4th sub-value of \ttp{tree}.
An invariant of \ttp{index} is that for any value \ttp{a}, and for the smallest
$i \geq 0$ such that
%
\begin{code}
index a i == Nothing
\end{code}
%
\noindent
then for all $0 \leq j < i$,
%
\begin{code}
index a j /= Nothing
\end{code}
\noindent
We use this invariant as a termination case in recursive algorithms over the
sub-values of a value. (Rather than indexes into a data-structure, an
alternative representation is to use a zipper data structure~\cite{zipper} to
traverse data. We have chosen explicit indexes to write simple tail-recursive
algorithms that can easily be transcribed to imperative languages.)
In our implementation, \ttp{constr} and \ttp{constrs} depends on GHC
Generics~\cite{generics}, which we describe in Section~\ref{sec:implementation}.
For simplicity, we omit here Generics-specific super-class constraints on the
\ttp{SubTypes} class. Moreover, our presentation simplifies the implementation
(Section~\ref{sec:implementation}) somewhat to improve the presentation.
%% Finally, \ttp{opaque} is true for our tree type but false for a
%% structureless type, such as \ttp{Int}.
% ------------------------------------------------------------------------------
\section{Shrinking Data}\label{sec:shrinking}
In this section, we describe how to efficiently and generically shrink algebraic
data values. Recall the basic idea behind the \ttp{shrink}
method of the \ttp{Arbitrary} class: generate a list of values, each of which is
smaller than the current counterexample. Each of the new values generated
may not bear any relationship to the original counterexample other than being
smaller. SmartCheck pursues an approach that searches for smaller but
structurally similar counterexamples, as we make precise below.
We describe the algorithm in Section~\ref{sec:reduct} and then describe
algorithmic details in Section~\ref{sec:details}. Some optimizations to the
reduction algorithm are described in Section~\ref{sec:optimizations}.
%% Consider the problem of finding a small counterexample as a graph search
%% problem. The set of vertexes are all possible well-typed values. Let $<$ be
%% a partial ordering on values relating them by some size measure. Let there
%% be an edge from vertex $v$ to $v'$ if $v < v'$ and there exists no $v''$ such
%% that $v < v'' < v'$. The goal is to traverse the graph, starting from a
%% counterexample, to find a vertex from the set of smallest counterexamples in
%% the graph.
%% The approach taken by typical \ttp{shrink} implementations is the most naive
%% graph search algorithm: it simply generates a random set of potential vertexes
%% that are strictly smaller according to the given measure to check the property
%% against. However, it does not exploit that the original counterexample
%% \ttp{cex} is already in a neighborhood of other likely counterexamples. So
%% rather than choose random vertexes, it may be more efficient to test other
%% values that are closely related to \ttp{cex}. By restricting the search to
%% \ttp{cex}'s neighborhood, a smaller counterexample might be missed, and only a
%% local minimum is returned. However, with a large set of values, finding a
%% global minimum by random sampling vertexes in the graph is possible but too
%% inefficient.
\subsection{Reduction Algorithm Overview}\label{sec:reduct}
The algorithm we present for efficiently searching for new counterexamples is an
instance of greedy breadth-first search over a tree structure that represents a
value. At each node, during the traversal, we generate arbitrary
structurally smaller sub-values and build a new value from that, leaving
the remainder of the tree unchanged. Intuitively, by a \emph{structurally
smaller value}, we mean one with fewer constructors. We continue until we
reach a fixed-point.
\begin{figure}[ht]
\lstset{caption={}
% ,xleftmargin=5ex,firstnumber=0,numbers=left
}
\begin{code}
getSize :: SubVal -> Size
getSize (SubVal a) = size a
newVals :: Size -> Int -> SubVal -> IO [SubVal]
newVals sz tries (SubVal a) =
replicateM tries s where
s = liftM SubVal (\undef{sizedArbitrary} sz a)
reduce :: SubTypes a
=> ScArgs -> (a -> Property) -> a -> IO a
reduce args prop cex = reduce' 1
where
reduce' idx
| Just v <- index cex idx
= do vs <- newVals (getSize v)
(scMaxReduce args) v
maybe (reduce' (idx+1)) (reduce args prop)
(test cex idx vs prop)
| otherwise = return cex
test :: SubTypes a => a -> Idx -> [SubVal]
-> (a -> Property) -> Maybe a
test cex idx vs prop = go vs
where
go [] = Nothing
go (v:vs') =
let cex' = replace cex idx v in
if \undef{pass} prop cex' then go vs'
else Just cex'
\end{code}
\caption{Counterexample reduction algorithm.\label{fig:reduction}}
\end{figure}
Figure~\ref{fig:reduction} shows the reduction algorithm. In this algorithm and
subsequent algorithms in the paper, functions in {\bf bold font} are left
undefined but their implementation is described in the text. The function
\ttp{reduce} takes flags to customize the algorithm's behavior, a counterexample
\ttp{cex}, and the property \ttp{prop}. The reduction begins at the first
proper sub-value of \ttp{cex}; call it \ttp{v}. When the index \ttp{idx}
becomes out-of-bounds and returns \ttp{Nothing}, the algorithm terminates.
Otherwise, a list of new random values are generated.
%
\begin{code}
sizedArbitrary :: SubTypes a => Size -> a -> IO a
\end{code}
%
\noindent
generates a new value \ttp{v'} having the same type as \ttp{v} and that is
strictly smaller (with respect to the \ttp{size} method) than \ttp{v}. Just
like QuickCheck's \ttp{arbitrary} method, \ttp{sizedArbitrary} generates
successively larger counterexamples when generating new values with which to
replace a sub-value.
The flag \ttp{scMaxReduce} is the maximum number of tries to discover a new
counterexample by replacing \ttp{v} in \ttp{cex} and testing it. The result of
\ttp{pass prop cex'} for
%
\begin{code}
pass :: (a -> Property) -> a -> Bool
\end{code}
%
\noindent
holds if \ttp{cex'} satisfies the property \ttp{prop}. The property may be a
conditional, in which case the value must pass the precondition as well as the
consequent for \ttp{pass} to return \ttp{True}. If no failure is found, we move
to the next sub-value of \ttp{cex} and continue. However, if a new smaller
counterexample \ttp{cex'} is found, we start a new breadth-first traversal of
\ttp{cex'}, attempting to shrink it further.
The algorithm is guaranteed to terminate: informally, the measure for the
function is that either the index increases or the size of the counterexample
being evaluated decreases. The algorithm's complexity is
$\mathcal{O}(n^2)$, where $n$ is the number of constructors in the
counterexample, assuming that generating new sub-values and testing them is done
in constant time.
\subsection{Reduction Algorithm Details}\label{sec:details}
Having described the reduction algorithm, there are two important details about
its design we describe below.
\subsubsection{Variant Counterexample Hypothesis}
A motivation for the design of the reduction algorithm is something we call the
\emph{variant counterexample hypothesis}: in the search space of possible values
from a given type \ttp{T}, if a known counterexample \ttp{cex} is a variant
\ttp{v} of \ttp{T}, then it is most probable that other counterexamples are also
from variant \ttp{v}. As an example supporting the hypothesis, consider a
property about unintended variable capture over a language's parse tree
represented by a sum type with constructors for module imports, function
definitions, and global-variable assignments, respectively. A function
definition counterexample can only be reduced to smaller function definition
counterexamples, the only construct in which variable capture is possible.
Recall that the algorithm begins at the \emph{first} sub-value of the
counterexample rather than the 0th sub-value. The 0th sub-value of \ttp{cex} is
\ttp{cex} itself. No invariant of the algorithm would be violated by beginning
with the 0th sub-value, and in particular, the algorithm would still terminate.
However, we begin with a strict sub-value because of the hypothesis.
The hypothesis may be incorrect for some properties, in which case SmartCheck
may potentially fail to discover a smaller counterexample. However, in
Sections~\ref{sec:existential} and~\ref{sec:precondition}, we describe an
approaches to generalize counterexamples based on discovering new counterexample
variants. These generalization techniques are executed in an (optional)
generalization phase, run after the reduction phase, in which this hypothesis is
implemented.
%% Another way to motivate the hypothesis is that searching the space of
%% possible counterexamples by generating \emph{any} well-typed value that is
%% strictly smaller than the original counterexample is something we have seen
%% before: it is precisely the naive approach to implementing \ttp{shrink}!
%% \subsubsection{Conditional Properties}
%% A QuickCheck property may be a conditional of the form
%% %
%% \begin{code}
%% prop a = pre a ==> prop' a
%% \end{code}
%% %
%% \noindent
%% such that \ttp{prop} is true if the precondition (\ttp{pre a}) is false or
%% (\ttp{prop' a}) holds. In practice, conditionals are used to restrict the set
%% of values tested; values that fail the precondition are discarded. Thus, the
%% call to \ttp{pass} in the reduction algorithm only considers a potential
%% counterexample \ttp{cex} to have failed if it passes the property's precondition
%% and fails the consequent.
\subsubsection{Opaque Types}\label{sec:base}
SmartCheck focuses on efficiently shrinking and generalizing large data
structures. It is not intended as a general replacement for QuickCheck's
\ttp{shrink} method. Consequently, SmartCheck ignores ``primitive'' types
without value constructors, such as \ttp{Char}, \ttp{Int}, and \ttp{Word16}.