Skip to content

Commit

Permalink
fixes #46 - remove clear duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
simongray committed Jan 23, 2023
1 parent e483fe6 commit c96df9b
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 13 deletions.
53 changes: 42 additions & 11 deletions src/main/dk/cst/dannet/db.clj
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@
(update-vals sets-only)
(vals))]
(filter (fn [ids]
(apply = (map #(dissoc (q/entity g %) :dns/inherited) ids)))
(apply = (map #(dissoc (q/entity-map g %) :dns/inherited) ids)))
candidates)))

(defn get-model
Expand All @@ -257,21 +257,26 @@
(.getGraph (get-model dataset model-uri)))

(defn remove!
"Remove a `triple` from the Apache Jena `model`."
"Remove a `triple` from the Apache Jena `model`.
NOTE: as with Aristotle queries, using _ works as a wildcard value."
;; The triple is destructured into subject `s`, predicate `p` and object `o`.
[^Model model [s p o :as triple]]
;; Delegates to the model's removeAll, passing one argument per triple position.
(.removeAll
model
(ResourceFactory/createResource (voc/uri-for s))
(ResourceFactory/createProperty (voc/uri-for p))
(cond
(keyword? o)
(ResourceFactory/createResource (voc/uri-for o))
;; Subject: a resource built from the URI of `s`, or nil when `s` is the symbol _.
;; NOTE(review): nil is presumably what Jena treats as "match anything" here;
;; confirm against the Model.removeAll documentation.
(when (not= s '_)
(ResourceFactory/createResource (voc/uri-for s)))
;; Predicate: a property built from the URI of `p`, or nil when `p` is the symbol _.
(when (not= p '_)
(ResourceFactory/createProperty (voc/uri-for p)))
;; Object: nil when `o` is the symbol _; otherwise chosen by the type of `o`:
;; keyword -> resource, LangStr -> language-tagged literal, anything else ->
;; typed literal.
(when (not= o '_)
(cond
(keyword? o)
(ResourceFactory/createResource (voc/uri-for o))

(instance? LangStr o)
(ResourceFactory/createLangLiteral (str o) (.lang o))
(instance? LangStr o)
(ResourceFactory/createLangLiteral (str o) (.lang o))

:else
(ResourceFactory/createTypedLiteral o))))
:else
(ResourceFactory/createTypedLiteral o)))))

(h/defn add-bootstrap-import!
"Add the `bootstrap-imports` of the old DanNet CSV files to a Jena `dataset`."
Expand Down Expand Up @@ -369,6 +374,32 @@
(remove! dn-model [?synset :wn/hyponym ?synset])
(remove! dn-model [?synset :wn/hypernym ?synset]))))

;; Remove duplicate synsets (identical predicate-object pairs).
;; The lowest index synset is kept in every case; other synsets are removed.
;; Referencing triples and generated inheritance triples are also removed.
(let [;; Numeric suffix of an id such as :dn/synset-4849. Ids without a
      ;; parseable numeric suffix sort after all numbered ones.
      synset-index      (fn [synset-id]
                          (or (some-> (re-find #"\d+$" (str synset-id))
                                      (parse-long))
                              Long/MAX_VALUE))
      ;; A plain `sort` compares keywords lexicographically, which places
      ;; :dn/synset-1000 before :dn/synset-999 and would keep the *higher*
      ;; index. Sort on the numeric index instead; the string form is only a
      ;; deterministic tie-breaker.
      by-index          (fn [ids]
                          (sort-by (juxt synset-index str) ids))
      duplicates        (find-duplicates dn-graph)
      ;; Everything except the first (lowest index) id of each group goes.
      synset-ids        (set (mapcat (comp rest by-index) duplicates))
      find-inherited    (fn [synset-id]
                          (->> [:bgp [synset-id :dns/inherited '?inherited]]
                               (q/run-basic dn-graph '[?inherited])))
      ;; Realized eagerly with `doall`: these lookups read the
      ;; synset -> :dns/inherited triples, which are themselves deleted by
      ;; the first `doseq` in the transaction below.
      inherit-triples   (->> synset-ids
                             (mapcat find-inherited)
                             (map (fn [[inherit-id]]
                                    [inherit-id '_ '_]))
                             (doall))
      ;; Every triple with a duplicate synset as its subject...
      synset-triples    (for [synset synset-ids]
                          [synset '_ '_])
      ;; ...and every triple with a duplicate synset as its object.
      reference-triples (for [synset synset-ids]
                          ['_ '_ synset])]
  (println "Removing" (count synset-ids) "duplicate synset-ids...")
  (txn/transact-exec dn-model
    (doseq [triple synset-triples]
      (remove! dn-model triple))
    (doseq [triple inherit-triples]
      (remove! dn-model triple))
    (doseq [triple reference-triples]
      (remove! dn-model triple))))

;; In the sentiment data, several thousand senses do not have sense-level
;; sentiment data. In those cases we can try to synthesize it from the words
;; that *do* have it.
Expand Down
20 changes: 18 additions & 2 deletions src/main/dk/cst/dannet/query.clj
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

(declare entity)
(declare run)
(declare run-basic)

(defn- nav-subjects
"Helper function for 'nav-meta'."
Expand Down Expand Up @@ -103,6 +104,16 @@
(filter #(not (get raw-result (select-keys % '[?s ?p ?o]))))
(basic-entity))))

(defn entity-triples
  "Return the rows of the entity query for `subject` in Graph `g` as
  [s p o] vectors, or nil when the query yields no result value."
  [g subject]
  (when-let [rows (run-basic g op/entity {'?s subject})]
    (for [row rows]
      [(get row '?s) (get row '?p) (get row '?o)])))

(defn entity-map
  "Run the entity query for `subject` in Graph `g` and pass the rows through
  'basic-entity'; returns nil when the query yields no result value."
  [g subject]
  (let [rows (run-basic g op/entity {'?s subject})]
    (when rows
      (basic-entity rows))))

(defn entity
"Return the entity description of `subject` in Graph `g`."
[g subject]
Expand Down Expand Up @@ -180,12 +191,17 @@
:subject subject))
(with-meta {} {:subject subject})))

(defn run-basic
  "Apply the Aristotle 'run' function to Graph `g` and `remaining-args` inside
  a transaction on `g`. Unlike 'run', no Navigable metadata is attached."
  [g & remaining-args]
  (txn/transact g
    (->> remaining-args
         (apply q/run g))))

(defn run
"Wraps the 'run' function from Aristotle, providing transactions when needed.
The results are also made Navigable for use with e.g. Reveal or REBL."
[g & remaining-args]
(->> (txn/transact g
(apply q/run g remaining-args))
(->> (apply run-basic g remaining-args)
;; Merge the metadata produced by (nav-meta g) into each result's metadata.
(map #(vary-meta % merge (nav-meta g)))))

(defn table-query
Expand Down
1 change: 1 addition & 0 deletions src/main/dk/cst/dannet/web/resources.clj
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@
(comment
(q/expanded-entity (:graph @db) :dn/form-11029540-land)
(q/expanded-entity (:graph @db) :dn/synset-4849)
(q/entity-triples (:graph @db) :dn/synset-4849)

;; 51 cases of true duplicates
(count (db/find-duplicates (:graph @db)))
Expand Down

0 comments on commit c96df9b

Please sign in to comment.