;; forked from techascent/tech.ml.dataset
;; /
;; dataset.clj
;; 233 lines (214 loc) · 9.79 KB
;; /
;; dataset.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
(ns tech.ml.dataset
"Column major dataset abstraction for efficiently manipulating
in memory datasets."
(:require [tech.v2.datatype :as dtype]
[tech.parallel.utils :as par-util]
[tech.v2.datatype.functional :as dfn]
[tech.ml.dataset.column :as ds-col]
[tech.ml.dataset.categorical :as categorical]
[tech.ml.dataset.pipeline.column-filters :as col-filters]
[tech.ml.dataset.impl.dataset :as ds-impl]
[tech.ml.dataset.base]
[tech.ml.dataset.modelling]
[tech.ml.dataset.math]
[tech.v2.datatype.casting :as casting]
[clojure.math.combinatorics :as comb])
(:import [smile.clustering KMeans GMeans XMeans PartitionClustering]))
;; Ask the compiler to warn whenever it has to fall back to reflection for a
;; Java interop call in the rest of this file.
(set! *warn-on-reflection* true)
;; Make the core dataset API from tech.ml.dataset.base available under this
;; namespace.  Functions further down this file (n-permutations,
;; descriptive-stats, ->flyweight, ...) call column-names, select-columns,
;; columns, column and ->dataset unqualified, so this form has to run before
;; they are compiled.
;; NOTE(review): export-symbols is defined in tech.parallel.utils and is not
;; visible here; presumably it defines a forwarding var in this namespace for
;; each listed symbol -- confirm against the macro.
(par-util/export-symbols tech.ml.dataset.base
dataset-name
set-dataset-name
ds-row-count
ds-column-count
metadata
set-metadata
maybe-column
column
columns
column-map
column-names
columns-with-missing-seq
add-column
new-column
remove-column
remove-columns
update-column
order-column-names
update-columns
rename-columns
select
select-columns
add-or-update-column
value-reader
mapseq-reader
ds-group-by
ds-group-by-column
group-by->indexes
group-by-column->indexes
ds-sort-by
ds-sort-by-column
ds-filter
ds-filter-column
unique-by
unique-by-column
aggregate-by
aggregate-by-column
ds-concat
ds-take-nth
ds-map-values
ds-column-map
->dataset
->>dataset
from-prototype
dataset->string
join-by-column
left-join)
;; Same treatment for the dataset constructors that live in the impl namespace.
(par-util/export-symbols tech.ml.dataset.impl.dataset
new-dataset
name-values-seq->dataset)
(defn n-permutations
  "Produce one dataset for every distinct group of n column names taken from
  dataset.  Groups are compared as sets, so the order of names within a group
  is ignored.  n must be strictly less than the first entry of the dataset's
  shape (its column count); otherwise an ex-info is thrown."
  [dataset n]
  (let [n-columns (first (dtype/shape dataset))]
    (if (< n n-columns)
      (let [name-groups (comb/combinations (column-names dataset) n)
            ;; Order within a group is assumed not to matter, so compare as sets.
            unique-groups (distinct (map set name-groups))]
        (map (fn [name-set] (select-columns dataset name-set))
             unique-groups))
      (throw (ex-info (format "%d permutations of %d columns" n n-columns)
                      {})))))
(defn n-feature-permutations
  "For a dataset that has at least one inference target column, return every
  dataset made of the target columns plus one distinct group of n feature
  columns.  Throws an ex-info when the dataset has no target columns."
  [dataset n]
  (let [target-names (col-filters/target? dataset)
        ;; NOTE(review): argument order of col-filters/not is taken verbatim
        ;; from the original call; its definition is not visible here.
        feature-names (col-filters/not target-names dataset)
        ;; Prepend the target columns to a group of feature names and select.
        with-targets (fn [feature-set]
                       (select-columns dataset
                                       (concat target-names feature-set)))]
    (if (seq target-names)
      ;; Order within a feature group is assumed not to matter, so the groups
      ;; are compared as sets before building the datasets.
      (map with-targets
           (distinct (map set (comb/combinations feature-names n))))
      (throw (ex-info "No label columns indicated" {})))))
;; Make the modelling helpers from tech.ml.dataset.modelling available under
;; this namespace.  ->flyweight and labels below call dataset-label-map and
;; reduce-column-names unqualified, so this form has to run before they are
;; compiled.
;; NOTE(review): export-symbols is defined in tech.parallel.utils and is not
;; visible here; presumably it defines a forwarding var in this namespace for
;; each listed symbol -- confirm against the macro.
(par-util/export-symbols tech.ml.dataset.modelling
set-inference-target
column-label-map
inference-target-label-map
dataset-label-map
inference-target-label-inverse-map
num-inference-classes
feature-ecount
model-type
column-values->categorical
reduce-column-names
has-column-label-map?
->k-fold-datasets
->train-test-split
->row-major)
;; Same treatment for the functions listed from tech.ml.dataset.math.
(par-util/export-symbols tech.ml.dataset.math
correlation-table
k-means
g-means
x-means
compute-centroid-and-global-means
impute-missing-by-centroid-averages)
(defn descriptive-stats
  "Build a dataset holding one row of summary statistics per column of the
  input dataset, sorted by column name.

  Every row carries :col-name, :datatype, :n-valid and :n-missing.  Columns
  that have a numeric datatype and are not flagged :categorical? in their
  metadata additionally get :min, :mean, :max, :standard-deviation and :skew;
  all other columns get :mode, the most frequent value.  Missing values are
  elided from the values the statistics are computed on.

  The result's columns follow the order :col-name :datatype :n-valid
  :n-missing :mean :mode :min :max :standard-deviation :skew, skipping any
  statistic that no column produced."
  [dataset]
  (let [summarize
        (fn [col]
          (let [missing-count (count (ds-col/missing col))
                valid-count (- (dtype/ecount col) missing-count)
                col-type (dtype/get-datatype col)
                ;; Reader over the column that skips missing entries.
                values (dtype/->reader col
                                       col-type
                                       {:missing-policy :elide})
                base {:col-name (ds-col/column-name col)
                      :datatype col-type
                      :n-valid valid-count
                      :n-missing missing-count}]
            (merge base
                   (if (and (not (:categorical? (ds-col/metadata col)))
                            (casting/numeric-type? col-type))
                     (dfn/descriptive-stats values
                                            #{:min :mean :max
                                              :standard-deviation :skew})
                     ;; Most frequent value: highest count first, take its key.
                     {:mode (ffirst (sort-by val > (frequencies values)))}))))
        ;; Columns are summarized in parallel.
        stats-ds (->dataset
                  (sort-by :col-name
                           (pmap summarize (->dataset dataset))))
        present? (set (column-names stats-ds))
        preferred-order [:col-name :datatype :n-valid :n-missing
                         :mean :mode :min :max :standard-deviation :skew]]
    ;; Keep the preferred ordering but only for statistics that actually exist,
    ;; so a dataset with e.g. no numeric columns still works.
    (select-columns stats-ds (filter present? preferred-order))))
(defn ->flyweight
  "Convert the dataset into a sequence of maps, one map per row, keyed by
  column name.

  Options (keyword arguments):
    :column-name-seq          - names of the columns to include; defaults to
                                every column of the dataset (passed through
                                reduce-column-names when :number->string? is
                                set).
    :error-on-missing-values? - defaults to true; when true an ex-info is
                                thrown for any directly-read column that has
                                missing values.
    :number->string?          - when true the dataset's label map is looked up
                                and columns present in it are reverse mapped,
                                so the row maps contain the original labels
                                rather than their encoded values."
  [dataset & {:keys [column-name-seq
                     error-on-missing-values?
                     number->string?]
              :or {error-on-missing-values? true}}]
  (let [label-map (when number->string?
                    (dataset-label-map dataset))
        selected-names
        (or column-name-seq
            (let [all-names (map ds-col/column-name (columns dataset))]
              (if number->string?
                (reduce-column-names dataset all-names)
                all-names)))
        ;; Values for a single column: decoded labels when the column is in
        ;; the label map, otherwise a plain reader over the column.
        read-values
        (fn [colname]
          (if (contains? label-map colname)
            (categorical/column-values->categorical dataset colname label-map)
            (let [col (column dataset colname)]
              (when (and error-on-missing-values?
                         (not= 0 (count (ds-col/missing col))))
                (throw (ex-info (format "Column %s has missing values"
                                        (ds-col/column-name col))
                                {})))
              (dtype/->reader col))))
        entries (map (fn [colname]
                       {:column-name colname
                        :column-values (read-values colname)})
                     selected-names)
        row-keys (map :column-name entries)
        ;; Counting the entries realizes all of them, so missing-value errors
        ;; surface here rather than when the result is consumed.
        n-entries (count entries)]
    ;; Transpose the sequence of columns into a sequence of rows, then turn
    ;; each row into a map.
    (->> (map :column-values entries)
         (apply interleave)
         (partition n-entries)
         (map (fn [row] (zipmap row-keys row))))))
(defn labels
  "Return the label values of the dataset, reverse mapped to their original
  values via ->flyweight with :number->string? true.

  With exactly one label column the result is a sequence of that column's
  values; with several label columns the result is in flyweight
  (sequence of maps) format.  Throws an ex-info when the dataset has no
  target columns."
  [dataset]
  (if (seq (col-filters/target? dataset))
    ;; NOTE(review): the guard uses col-filters/target? while the names come
    ;; from col-filters/inference?; kept exactly as in the original -- confirm
    ;; the two filters agree.
    (let [label-names (reduce-column-names dataset
                                           (col-filters/inference? dataset))
          label-rows (->flyweight dataset
                                  :column-name-seq label-names
                                  :number->string? true)]
      (if (= 1 (count label-names))
        (let [only-name (first label-names)]
          (map (fn [row] (get row only-name)) label-rows))
        label-rows))
    (throw (ex-info "No label columns indicated" {}))))