# incanter/incanter

### Subversion checkout URL

You can clone with HTTPS or Subversion.

Fetching contributors…

Cannot retrieve contributors at this time

190 lines (136 sloc) 6.377 kb
 ;; from the following blog post: ;; http://incanter-blog.org/2009/06/17/randomization-significance/ ;; significance testing with randomization (use '(incanter core stats datasets charts)) (def data (to-matrix (get-dataset :plant-growth))) ;; Break the first column of the data into groups based on ;; treatment type (second column) using the group-by function. (def groups (group-by data 1 :cols 0)) (t-test (first groups) :y (second groups)) ;; returns t-stat :t-stat 1.1912603818487033 (t-test (first groups) :y (last groups)) ;; returns t-stat -2.1340204531240676 ;; calculate the means of the three groups (map mean groups) ;; perform the t-test comparing the first two groups (def t1 (t-test (first groups) :y (second groups))) (def t2 (t-test (first groups) :y (last groups))) ;; view the test statistic and its p-value (:t-stat t1) ;; returns 1.1912603818487033 (:p-value t1) ;; returns 0.25038250858754796 (:t-stat t2) ;; returns -2.134 (:p-value t2) ;; returns 0.0479 ;; The above p-value is based on the assumption that the ;; distribution of t-test statistics when the null hypothesis, ;; that the two sample means are identical, is true. Let's see ;; if that assumption appears correct for these samples. ;; Now create 1000 permuted versions of the original two groups using ;; the sample-permutations function, (def perm-groups1 (sample-permutations 1000 (first groups) (second groups))) (def perm-groups2 (sample-permutations 1000 (first groups) (last groups))) ;; create a function, based on t-test, that takes two sequence and returns ;; just the :t-stat value (defn t-stat [x y] (:t-stat (t-test x :y y))) ;; calculate the t-test statistics for each of the permuted versions of ;; the two samples (def t-stats1 (map t-stat (first perm-groups1) (second perm-groups1))) (def t-stats2 (map t-stat (first perm-groups2) (second perm-groups2))) ;; plot the t-test statistics, and overlay the pdf of a t-distribution (doto (histogram t-stats1 :density true :nbins 20) (add-function #(pdf-t % :df (:df t1)) -3 3) view) ;; and view their mean, sd, and 95% confidence interval (mean t-stats1) ;; returns 0.02308030751953895 (sd t-stats1) ;; returns 1.0640114204888618 (quantile t-stats1 :probs [0.025 0.975]) ;; returns (-2.1164160713197497 2.002005620604495) (doto (histogram t-stats2 :density true :nbins 20) (add-function #(pdf-t % :df (:df t2)) -3 3) view) ;; and view their mean, sd, and 95% confidence interval (mean t-stats2) ;; returns -0.014 (sd t-stats2) ;; returns 1.054 (quantile t-stats2 :probs [0.025 0.975]) ;; returns (-2.075 2.122) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do it with raw means-differences instead of the t-test statistic ;; Define a function to calculate the difference in means between two sequences. (defn means-diff [x y] (minus (mean x) (mean y))) ;; Calculate the difference in sample means between the two groups. (def samp-mean-diff (means-diff (first groups) (second groups))) ;; and calculate the difference of means of the 1000 samples by mapping ;; the means-diff function, defined earlier, over the rows returned by ;; sample-permutations. (def perm-means-diffs1 (map means-diff (first perm-groups) (second perm-groups))) ;; Then take the mean of this sequence, which gives the proportion of times ;; that a value from the permuted sequences are more extreme than the original ;; sample mean (i.e. the p-value). (mean (indicator #(> % samp-mean-diff) perm-means-diffs1)) ;; Calculate a 95% interval of the permuted differences. If the original ;; sample means-difference is outside of this range, there is evidence ;; that the two means are statistically significantly different. (quantile perm-means-diffs1 :probs [0.025 0.975]) ;; Plot a histogram of the perm-means-diffs using the density option, ;; instead of the default frequency, and then add a normal pdf curve ;; with the mean and sd of perm-means-diffs data for a visual comparison. (doto (histogram perm-means-diffs1 :density true) (add-lines (range -1 1 0.01) (pdf-normal (range -1 1 0.01) :mean (mean perm-means-diffs1) :sd (sd perm-means-diffs1))) view) ;; now standardize the means and overlay the pdf of a t-distribution (doto (histogram (sweep (sweep perm-means-diffs1) :stat sd :fun div) :density true) (add-lines (range -3 3 0.01) (pdf-t (range -3 3 0.01) :df 18)) view) ;; The permuted data looks normal. Now, calculate the p-values for ;; the difference in means between the control and treatment two. (def perm-groups (sample-permutations 1000 (first groups) (last groups))) (def perm-means-diffs2 (map means-diff (first perm-groups) (second perm-groups))) (def samp-mean-diff (means-diff (first groups) (last groups))) (mean (indicator #(< % samp-mean-diff) perm-means-diffs2)) (quantile perm-means-diffs2 :probs [0.025 0.975]) ;; Finally, calculate the p-values for the difference in means ;; between the treatment one and treatment two. (def perm-groups (sample-permutations 1000 (second groups) (last groups))) (def perm-means-diffs3 (map means-diff (first perm-groups) (second perm-groups))) (def samp-mean-diff (means-diff (second groups) (last groups))) (mean (indicator #(< % samp-mean-diff) perm-means-diffs3)) (quantile perm-means-diffs3 :probs [0.025 0.975]) ;; Plot box-plots of the three perm-means-diffs sequences (doto (box-plot perm-means-diffs1) (add-box-plot perm-means-diffs2) (add-box-plot perm-means-diffs3) view)
Something went wrong with that request. Please try again.