Skip to content

Commit

Permalink
Added naive left-join implementation, and test.
Browse files Browse the repository at this point in the history
  • Loading branch information
tom committed Mar 12, 2020
1 parent 2d5398b commit eb11001
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 3 deletions.
3 changes: 2 additions & 1 deletion src/tech/ml/dataset.clj
Expand Up @@ -64,7 +64,8 @@
->>dataset
from-prototype
dataset->string
join-by-column)
join-by-column
left-join)


(par-util/export-symbols tech.ml.dataset.impl.dataset
Expand Down
28 changes: 27 additions & 1 deletion src/tech/ml/dataset/base.clj
Expand Up @@ -11,7 +11,8 @@
[tech.io :as io]
[tech.parallel.require :as parallel-req]
[tech.parallel.for :as parallel-for]
[tech.parallel.utils :as par-util])
[tech.parallel.utils :as par-util]
[clojure.set])
(:import [java.io InputStream]
[tech.v2.datatype ObjectReader]
[java.util List]
Expand Down Expand Up @@ -599,3 +600,28 @@ the correct type."
(map #(ds-col/select % rhs-indexes)))]
(from-prototype lhs "join-table" (concat lhs-cols rhs-cols)))
:rhs-missing-indexes rhs-missing}))


(defn left-join
  "Left outer join of `lhs` and `rhs` on the column `colname`.

  Like tech.ml.dataset.base/join-by-column, except that the rows of
  `lhs` whose `colname` value did not make the join are also included
  in the final result.  For those rows, every column that is present
  in the join table but not in `lhs` is supplied by
  ds-col/empty-column, i.e. an all-missing column of the same datatype
  as the corresponding joined column.

  Arguments:
    colname - name of the column to join on.
    lhs     - left dataset; all of its rows appear in the result.
    rhs     - right dataset.

  Returns the :join-table produced by join-by-column concatenated
  (via ds-concat) with the unmatched rows of `lhs`."
  [colname lhs rhs]
  (let [inner (:join-table (join-by-column colname lhs rhs))
        joined-keys (set (inner colname))
        ;; Row indexes of lhs whose key never made it into the join.
        ;; contains? is used instead of invoking the set as a function
        ;; so that a key value of `false` is still classified correctly
        ;; (a set lookup returns the element itself, which is falsy).
        leftover-indexes (keep-indexed
                          (fn [idx v]
                            (when-not (contains? joined-keys v) idx))
                          (lhs colname))
        leftover (select lhs (column-names lhs) leftover-indexes)
        n (ds-row-count leftover)
        lhs-names (set (column-names lhs))
        ;; Walk the join table's column names in their existing order so
        ;; the filler columns are appended deterministically rather than
        ;; in arbitrary hash-set order.
        new-columns (->> (column-names inner)
                         (remove #(contains? lhs-names %))
                         (distinct)
                         (map (fn [cname]
                                (ds-col/empty-column
                                 cname
                                 (dtype/get-datatype (column inner cname))
                                 n))))]
    (ds-concat inner (reduce add-column leftover new-columns))))
7 changes: 7 additions & 0 deletions src/tech/ml/dataset/column.clj
Expand Up @@ -210,6 +210,13 @@ Implementations should check their metadata before doing calculations."
scanned-missing :missing} (ensure-column-reader data)]
(col-impl/new-column name coldata metadata (or missing scanned-missing)))))

(defn empty-column
  "Create a column called `name` with `n` entries of datatype `dtype`
  in which every entry is marked as missing.

  The backing storage comes from col-impl/make-container for the given
  datatype and length; the column is then built with new-column using
  nil metadata and a missing set covering every index in [0, n)."
  [name dtype n]
  ;;TODO change the missing set to a flyweight set or something...
  (let [container (col-impl/make-container dtype n)
        all-missing (into #{} (range n))]
    (new-column name container nil all-missing)))

(defn ensure-column
"Convert an item to a column if at all possible. Currently columns either implement
Expand Down
4 changes: 3 additions & 1 deletion src/tech/ml/dataset/impl/column.clj
Expand Up @@ -37,7 +37,8 @@
:string (make-string-table n-elems "")
:text (let [list-data (ArrayList.)]
(dotimes [iter n-elems]
(.add list-data "")))
(.add list-data ""))
list-data)
(dtype/make-container :list dtype n-elems)))
([dtype]
(make-container dtype 0)))
Expand Down Expand Up @@ -87,6 +88,7 @@
(.read rdr idx))))))



(deftype Column
[^Set missing
data
Expand Down
53 changes: 53 additions & 0 deletions test/tech/ml/dataset/join_test.clj
Expand Up @@ -20,6 +20,59 @@
(is (empty? (seq rhs-missing)))))


;;sample from https://www.w3schools.com/sql/sql_join_left.asp
(deftest left-join-test
  ;; Three customers are joined against three orders on "CustomerID".
  ;; Only customer 2 has a matching order, so only that record should
  ;; carry real order data; the other records should hold the missing
  ;; sentinels checked below in the order columns.
  (let [customers (ds/->dataset
                   [{"CustomerID" 1,
                     "CustomerName" "Alfreds Futterkiste",
                     "ContactName" "Maria Anders",
                     "Address" "Obere Str. 57",
                     "City" "Berlin",
                     "PostalCode" 12209,
                     "Country" "Germany"}
                    {"CustomerID" 2,
                     "CustomerName" "Ana Trujillo Emparedados y helados",
                     "ContactName" "Ana Trujillo",
                     "Address" "Avda. de la Constitución 2222",
                     "City" "México D.F.",
                     "PostalCode" 5021,
                     "Country" "Mexico"}
                    {"CustomerID" 3,
                     "CustomerName" "Antonio Moreno Taquería",
                     "ContactName" "Antonio Moreno",
                     "Address" "Mataderos 2312",
                     "City" "México D.F.",
                     "PostalCode" 5023,
                     "Country" "Mexico"}])
        orders (ds/->dataset
                [{"OrderID" 10308,
                  "CustomerID" 2,
                  "EmployeeID" 7,
                  "OrderDate" "1996-09-18",
                  "ShipperID" 3}
                 {"OrderID" 10309,
                  "CustomerID" 37,
                  "EmployeeID" 3,
                  "OrderDate" "1996-09-19",
                  "ShipperID" 1}
                 {"OrderID" 10310,
                  "CustomerID" 77,
                  "EmployeeID" 8,
                  "OrderDate" "1996-09-20",
                  "ShipperID" 2}])
        rows (ds/mapseq-reader (ds/left-join "CustomerID" customers orders))
        ;; Values this test treats as "missing": -32768 for integer
        ;; columns and the empty string for string columns.
        sentinels #{-32768 ""}
        missing-value? (fn [v] (contains? sentinels v))
        matched (some (fn [row]
                        (when (= (get row "CustomerID") 2) row))
                      rows)
        unmatched (remove (fn [row] (= row matched)) rows)]
    (is (not-any? missing-value? (vals matched))
        "Ana's record should be fully realized.")
    (is (every? (fn [{:strs [OrderID OrderDate ShipperID]}]
                  (every? missing-value? [OrderID OrderDate ShipperID]))
                unmatched)
        "Everyone else should have missing entries from RHS.")))


(comment
Expand Down

0 comments on commit eb11001

Please sign in to comment.