Browse files

Rearrange where summary functions live, and test against a real dataset

  • Loading branch information...
1 parent c8be784 commit 235077157ebdf88a4b6dad10fa2bdcec7ae90da5 @kittylyst kittylyst committed Oct 16, 2011
View
60 modules/incanter-core/src/incanter/core.clj
@@ -2527,66 +2527,6 @@ altering later ones."
([] (System/exit 0)))
-(defn- count-col-types
- "Takes in a column name or number and a dataset. Returns a map from each type found in that column to the number of values of that type. nil values are counted too, under the key nil (the type of nil is nil)."
- ([col ds]
- (frequencies (map type ($ col ds)))))
-
-
-(defn- stat-summarizable
- "Placeholder stub function, for more advanced cases where we want to automatically ignore occasional bad values in a column. Currently ignores its argument and always returns the same explanatory string."
- ([types]
- "Statistical summarizablity is currently stubbed out. Please contact the dev team if you're seeing this message."))
-
-
-(defn numeric-col-summarizer
- "Summarizes a purely numeric column. Takes a column name or number and a dataset, and returns a map with :min, :max, :mean and :median computed over the column's values, plus :is-numeric true. The values are used as-is (nils are not filtered out here), so the column must contain numbers only."
- ([col ds] {:min (reduce min ($ col ds)) :max (reduce max ($ col ds)) :mean (mean ($ col ds)) :median (median ($ col ds)) :is-numeric true}))
-
-
-(defn category-col-summarizer
- "Summarizes a category column. Takes a column name or number and a dataset, and returns a map holding the 5 most frequent
- values of the column (each value mapped to its number of occurrences), a :count of the remaining rows, and :is-numeric false"
- ([col ds] (let [freqs (frequencies ($ col ds)) top-5 (take 5 (reverse (sort-by val freqs)))]
- (into {:count (- (reduce + (map val freqs)) (reduce + (map val (into {} top-5)))) :is-numeric false} top-5))))
-
-
-(defn choose-singletype-col-summarizer
- "Takes in a type (a class), and returns a suitable column summarizer: numeric-col-summarizer for subtypes of Number, category-col-summarizer for String or Keyword. For any other type, returns a string saying the type cannot be summarized."
- ([col-type]
- (if (.isAssignableFrom java.lang.Number col-type)
- numeric-col-summarizer
- (if (or (.isAssignableFrom java.lang.String col-type) (.isAssignableFrom clojure.lang.Keyword col-type))
- category-col-summarizer
- ; FIXME Deal with date columns
- (str "Don't know how to summarize a column of type: " col-type)
- ))))
-
-
-(defn summarizer-fn
- "Takes in a column (number or name) and a dataset. Returns a function to summarize the column if summarizable, and a
- string describing why the column can't be summarized in the event that it can't. nil values are ignored when deciding: a column with a single type is dispatched on that type, a column whose types are all numeric is numeric, and a column mixing exactly String and Keyword is a category column."
- ([col ds]
- (let [type-counts (dissoc (count-col-types col ds) nil)]
- (if (= 1 (count type-counts))
- (choose-singletype-col-summarizer (nth (keys type-counts) 0))
- (if (every? #(.isAssignableFrom java.lang.Number %) (keys type-counts))
- numeric-col-summarizer
- (if (and (= 2 (count type-counts)) (contains? type-counts java.lang.String) (contains? type-counts clojure.lang.Keyword))
- category-col-summarizer
- (stat-summarizable type-counts)))))))
-
-(defn summary
- "Takes in a dataset. Returns a sequence with one entry per column, in the order of the dataset's :column-names: the summary map
- produced by the summarizer chosen for that column, or the explanatory string when the column can't be summarized."
- ([ds]
- (let [cols (:column-names ds)]
- (map #(let [r (summarizer-fn %1 ds)]
- (if (fn? r)
- (r %1 ds)
- r)) cols))))
-
-
(defmulti save
" Save is a multi-function that is used to write matrices, datasets and
charts (in png format) to a file.
View
79 modules/incanter-core/src/incanter/stats.clj
@@ -39,7 +39,7 @@
(incanter Weibull))
(:use [clojure.contrib.map-utils :only [deep-merge-with]])
(:use [clojure.set :only [difference intersection union]])
- (:use [incanter.core :only (abs plus minus div mult mmult to-list bind-columns
+ (:use [incanter.core :only ($ abs plus minus div mult mmult to-list bind-columns
gamma pow sqrt diag trans regularized-beta ncol
nrow identity-matrix decomp-cholesky decomp-svd
matrix length sum sum-of-squares sel matrix?
@@ -2475,6 +2475,83 @@ Test for different variances between 2 samples
:E E})))
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Dataset Summarization Functions
+
+(defn- count-col-types
+  "Tallies the types of the values held in one column of a dataset.
+   col is a column name or number, ds is the dataset.
+   Returns a map of type -> number of values of that type. nil values are tallied too, under the key nil."
+  ([col ds]
+    (->> ($ col ds)
+         (map type)
+         frequencies)))
+
+
+(defn- stat-summarizable
+  "Placeholder stub function, for more advanced cases where we want to automatically ignore occasional bad values in a column.
+   types is the map of type -> count found in a column whose mix of types could not be matched to a summarizer.
+   Always returns a string (never a function), so callers that test the result with fn? treat the column as not
+   summarizable. The string lists the types that were found, so the reader can see why the column was rejected."
+  ([types]
+    ; FIXME Add the column name
+    (str "Statistical summarizability is currently stubbed out. Please contact the dev team if you're seeing this message."
+         " Types found in the column: " (pr-str types))))
+
+
+(defn numeric-col-summarizer
+  "Summarizes a numeric column. col is a column name or number, ds is the dataset.
+   nil values are dropped before any statistic is computed.
+   Returns a map with :col (the column as given), :min, :max, :mean, :median and :is-numeric true.
+   When the column holds no non-nil values, the four statistics are nil instead of throwing."
+  ([col ds]
+    ; Fetch and filter the column once; the original did this for every statistic.
+    (let [xs (remove nil? ($ col ds))]
+      (if (empty? xs)
+        ; (reduce min ()) calls (min) with no arguments, which has no such arity and throws.
+        ; A column containing only nils ends up here, so report "no data" explicitly.
+        {:col col :min nil :max nil :mean nil :median nil :is-numeric true}
+        {:col col :min (reduce min xs) :max (reduce max xs)
+         :mean (mean xs) :median (median xs) :is-numeric true}))))
+
+
+(defn category-col-summarizer
+  "Summarizes a category column. col is a column name or number, ds is the dataset.
+   Returns a map containing :col (the column as given), the 5 most frequent values of the column (each value
+   mapped to its number of occurrences), :count for the number of rows not covered by those 5 values, and
+   :is-numeric false.
+   The value entries are merged into the same map as the fixed keys, so a category value equal to one of
+   :col, :count or :is-numeric replaces that key."
+  ([col ds]
+    (let [tally   (frequencies ($ col ds))
+          leaders (->> tally (sort-by val) reverse (take 5))
+          total   (apply + (vals tally))
+          in-top  (apply + (map val leaders))
+          base    {:col col :count (- total in-top) :is-numeric false}]
+      (into base leaders))))
+
+
+(defn choose-singletype-col-summarizer
+  "Picks the summarizer for a column whose values all share one type.
+   col-type is that type (a class). Returns numeric-col-summarizer for subtypes of Number and
+   category-col-summarizer for String or Keyword. For any other type, returns a string saying the type
+   cannot be summarized."
+  ([col-type]
+    (cond
+      (.isAssignableFrom java.lang.Number col-type)     numeric-col-summarizer
+      (.isAssignableFrom java.lang.String col-type)     category-col-summarizer
+      (.isAssignableFrom clojure.lang.Keyword col-type) category-col-summarizer
+      ; FIXME Deal with date columns
+      :else (str "Don't know how to summarize a column of type: " col-type))))
+
+
+(defn summarizer-fn
+  "Decides how a column should be summarized. col is a column name or number, ds is the dataset.
+   Returns a summarizer function when the column is summarizable, and otherwise a string describing why not.
+   nil values are ignored when looking at the types present:
+     - exactly one type: dispatch on that type
+     - every type numeric: numeric summarizer (this also holds, vacuously, when only nils are present)
+     - exactly String and Keyword mixed: category summarizer
+     - anything else: the stubbed-out explanation string"
+  ([col ds]
+    (let [seen     (dissoc (count-col-types col ds) nil)
+          classes  (keys seen)
+          numeric? (fn [c] (.isAssignableFrom java.lang.Number c))]
+      (cond
+        (= 1 (count seen))
+        (choose-singletype-col-summarizer (first classes))
+
+        (every? numeric? classes)
+        numeric-col-summarizer
+
+        (and (= 2 (count seen))
+             (contains? seen java.lang.String)
+             (contains? seen clojure.lang.Keyword))
+        category-col-summarizer
+
+        :else
+        (stat-summarizable seen)))))
+
+(defn summarizable?
+  "Predicate: can this column be summarized? col is a column name or number, ds is the dataset.
+   Returns true when summarizer-fn yields a function for the column, and false when it yields
+   anything else (i.e. an explanatory string)."
+  ([col ds]
+    (let [chosen (summarizer-fn col ds)]
+      (fn? chosen))))
+
+
+(defn summary
+  "Summarizes every column of a dataset, choosing a summarizer per column from the types found in it.
+   Returns a lazy sequence with one entry per column, in the order of the dataset's :column-names:
+   the summary map for a summarizable column, or the explanatory string for one that is not."
+  ([ds]
+    (letfn [(describe [col]
+              (let [chosen (summarizer-fn col ds)]
+                (if (fn? chosen)
+                  (chosen col ds)
+                  chosen)))]
+      (map describe (:column-names ds)))))
+
+; (def amt-fn (summarizer-fn (keyword "Amount Funded By Investors") loans))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
(defn principal-components
View
24 modules/incanter-core/test/incanter/core_tests.clj
@@ -36,19 +36,6 @@
(def dataset5 (dataset ["a" "b" "c"] [{"a" 1 "b" 2 "c" 3} {"b" 5 "c" 6}]))
(def dataset6 (dataset [:a :b :c] [[1 2 3]]))
-
-;; Single-column fixtures for the summary tests; summary-datasets asserts which of them are summarizable.
-;; integers only
-(def summary-ds0 (to-dataset [[1] [4] [7]]))
-;; integers mixed with a floating-point number
-(def summary-ds1 (to-dataset [[1] [3.142] [7]]))
-;; strings only
-(def summary-ds2 (to-dataset [["a"] ["b"] ["c"]]))
-;; keywords only
-(def summary-ds3 (to-dataset [[:a] [:b] [:c]]))
-;; keywords mixed with a string
-(def summary-ds4 (to-dataset [[:a] ["b"] [:c]]))
-;; numbers mixed with a keyword
-(def summary-ds5 (to-dataset [[1] [2.1] [:c]]))
-;; numbers mixed with a string
-(def summary-ds6 (to-dataset [[1] [2.1] ["c"]]))
-;; numbers with a nil entry
-(def summary-ds7 (to-dataset [[1] [2.1] [nil]]))
-
-;; strings with repeated values: six distinct values over twelve rows
-(def summary-ds8 (to-dataset [["a"] ["b"] ["c"] ["d"] ["b"] ["e"] ["a"] ["b"] ["f"] ["a"] ["b"] ["e"]]))
-
-
(deftest dataset-tests
(is (= (sel dataset1 :cols :a) [1 4]))
(is (= (sel dataset2 :cols :b) [2 5]))
@@ -65,17 +52,6 @@
(is (= (transform-col dataset1 :b (partial + 10))) (dataset [:a :b :c] [[1 12 3] [4 15 6]]))
(is (= (transform-col dataset1 :b * 2) (dataset [:a :b :c] [[1 4 3] [4 10 6]]))))
-;; Checks summarizable? on column 0 of each single-column fixture.
-(deftest summary-datasets
-;; all-numeric columns (integers, or integers mixed with a double)
- (is (summarizable? 0 summary-ds0))
- (is (summarizable? 0 summary-ds1))
-;; category columns: strings, keywords, or a mix of the two
- (is (summarizable? 0 summary-ds2))
- (is (summarizable? 0 summary-ds3))
- (is (summarizable? 0 summary-ds4))
-;; numbers mixed with a keyword or a string are rejected
- (is (not (summarizable? 0 summary-ds5)))
- (is (not (summarizable? 0 summary-ds6)))
-;; a nil among numbers does not prevent summarization
- (is (summarizable? 0 summary-ds7))
- )
-
;; define a simple matrix for testing
(def A (matrix [[1 2 3]
[4 5 6]
View
26 modules/incanter-core/test/incanter/stats_tests.clj
@@ -65,7 +65,20 @@
(def y (sel test-mat :cols 1))
(def dataset1 (dataset [:a :b :c] [[1 2 3] [4 5 6] [7 8 9] [10 11 12]]))
-
+
+;; Fixtures for the summary tests; summary-datasets asserts which of ds0-ds7 are summarizable.
+;; integers only
+(def summary-ds0 (to-dataset [[1] [4] [7]]))
+;; integers mixed with a floating-point number
+(def summary-ds1 (to-dataset [[1] [3.142] [7]]))
+;; strings only
+(def summary-ds2 (to-dataset [["a"] ["b"] ["c"]]))
+;; keywords only
+(def summary-ds3 (to-dataset [[:a] [:b] [:c]]))
+;; keywords mixed with a string
+(def summary-ds4 (to-dataset [[:a] ["b"] [:c]]))
+;; numbers mixed with a keyword
+(def summary-ds5 (to-dataset [[1] [2.1] [:c]]))
+;; numbers mixed with a string
+(def summary-ds6 (to-dataset [[1] [2.1] ["c"]]))
+;; numbers with a nil entry
+(def summary-ds7 (to-dataset [[1] [2.1] [nil]]))
+
+;; strings with repeated values: six distinct values over twelve rows
+(def summary-ds8 (to-dataset [["a"] ["b"] ["c"] ["d"] ["b"] ["e"] ["a"] ["b"] ["f"] ["a"] ["b"] ["e"]]))
+;; two columns: strings with one keyword (:c) in the first, integer and floating-point literals in the second
+(def summary-ds9 (to-dataset [["a" 1.2] [":b" 3] [:c 0.1] ["d" 8] ["b" 9] ["e" 7.21] ["a" 1E1] ["b" 6.0000] ["f" 1e-2] ["a" 3.0] ["b" 4] ["e" 5]]))
+
+
(deftest mean-test
(is (= (map mean (trans test-mat)) [108.0 130.0])))
@@ -252,4 +265,15 @@
(tanimoto-coefficient [2 4 3 1 6]
[3 5 1 2 5]))))
+(deftest summary-datasets
+  ;; Column 0 of these fixtures must be summarizable: all-numeric columns (ds0, ds1),
+  ;; category columns of strings and/or keywords (ds2, ds3, ds4), and numbers with a nil (ds7).
+  (doseq [ds [summary-ds0 summary-ds1 summary-ds2 summary-ds3 summary-ds4 summary-ds7]]
+    (is (summarizable? 0 ds)))
+  ;; Numbers mixed with a keyword (ds5) or with a string (ds6) must be rejected.
+  (doseq [ds [summary-ds5 summary-ds6]]
+    (is (not (summarizable? 0 ds)))))
+

0 comments on commit 2350771

Please sign in to comment.