;; preprocessor.clj — 125 lines (106 loc) · 4.16 KB
(ns hiposfer.gtfs.preprocessor
"parse the Markdown GTFS spec definition and returns it as Clojure data structures.
Uses several heuristics to guess how to interpret the data.
Useful to avoid monkey patching"
(:require [clojure.string :as str]
[markdown2clj.core :as md]
[clojure.pprint :as pprint]
[clojure.edn :as edn]))
(def url
  "Location of the raw Markdown GTFS reference that `-main` downloads."
  "https://raw.githubusercontent.com/google/transit/master/gtfs/spec/en/reference.md")
;; NOTE: we purposely ignore the edge case of several nested headers, as
;; that makes this function much simpler
(defn sections
  "Splits a parsed Markdown document into sections.

  `md` is a map whose :document entry is a sequence of nodes. Every node
  with a truthy :heading starts a new section; the nodes that follow it
  (up to the next heading) belong to that section. Nodes that appear before
  the first heading form a leading section of their own.

  Returns a vector of sections, each one a vector of nodes in document
  order. Returns an empty vector when the document has no nodes.

  The split is computed eagerly and without mutable state, so consecutive
  sections whose headings happen to be equal are still kept apart."
  [md]
  (reduce (fn [acc node]
            (if (or (:heading node) (empty? acc))
              ;; a heading (or the very first node) opens a new section
              (conj acc [node])
              ;; anything else is appended to the section currently open
              (conj (pop acc) (conj (peek acc) node))))
          []
          (:document md)))
;; (sections content)
(defn zipify
  "Converts the first :table-block found in `section` into a sequence of
  maps, one per row of the table body.

  Keys are derived from every string found inside the table head:
  lower-cased, spaces replaced by dashes and turned into keywords. The value
  for each key is the concatenation of all :text entries found inside the
  corresponding cell of the row.

  Returns an empty sequence when the section contains no table."
  [section]
  (let [table      (some :table-block section)
        thead      (some :table-head table)
        tbody      (some :table-body table)
        ->key      (fn [title]
                     (-> title
                         str/lower-case
                         (str/replace " " "-") ;; otherwise not valid literal keyword
                         keyword))
        columns    (->> (tree-seq coll? seq thead)
                        (filter string?)
                        (map ->key))
        cell->text (fn [cell]
                     (str/join ""
                               (for [node  (tree-seq coll? seq cell)
                                     :when (and (map? node) (:text node))]
                                 (:text node))))
        ->record   (fn [line]
                     (zipmap columns (map cell->text (:table-row line))))]
    (map ->record tbody)))
;; example
;; (feed-files content)
(defn- header
  "Returns the :text of the second element of the :heading of the first node
  in `section`, or nil when any of those is missing."
  [section]
  (let [opening (first section)
        title   (second (:heading opening))]
    (:text title)))
(def enum-edge-cases
  "Field names that `enum?` accepts as parents of enum values even though
  they do not end with \"_type\"."
  #{"bikes_allowed"
    "direction_id"
    "exact_times"
    "payment_method"
    "timepoint"
    "transfers"
    "wheelchair_accessible"
    "wheelchair_boarding"})
(defn- enum?
  "Returns true when `field` looks like a row holding one enum value that
  belongs to `parent`.

  That is the case when `field` has an empty :field-name, an empty :required
  and a non-empty :details, while the :field-name of `parent` either ends
  with \"_type\" or is listed in `enum-edge-cases`.

  Returns false otherwise, including when `parent` has no :field-name at
  all (instead of throwing on the nil name)."
  [parent field]
  (let [parent-name (:field-name parent)]
    (and (empty? (:field-name field))
         (empty? (:required field))
         (not (empty? (:details field)))
         ;; str/ends-with? throws a NullPointerException on nil
         (string? parent-name)
         (or (str/ends-with? parent-name "_type")
             (contains? enum-edge-cases parent-name)))))
(defn- enum-value
  "Parses the text of an enum bullet of the form \"* <digits> - <description>\".

  Returns a map with the numeric :value and its :description when `text`
  matches that shape entirely; otherwise returns `text` unchanged.

  Values may have more than one digit (e.g. \"* 11 - Trolleybus\"). They are
  parsed as base-10 longs, so a leading zero is never read as an octal
  literal."
  [text]
  (if-let [[_ digits description] (re-matches #"\* (\d+) - (.*)" text)]
    {:description description
     :value       (Long/parseLong digits)}
    text))
(defn- parse-enums
  "Folds enum-value rows into the field row that precedes them.

  Walks `fields` in order keeping the row currently under inspection as
  `parent`. Every following row for which `(enum? parent row)` holds is
  consumed and its parsed :details are added to the :values of `parent`.
  Any other row closes `parent`, which is appended to `result`, and becomes
  the new `parent`.

  The single-arity version is the entry point; the three-arity version
  carries the remaining rows, the accumulated result vector and the current
  parent. Returns the accumulated vector of rows."
  ([fields]
   (parse-enums (rest fields) [] (first fields)))
  ([fields result parent]
   (let [candidate (first fields)
         pending   (rest fields)
         done?     (empty? fields)]
     (cond
       ;; the next row is an enum value of the current parent: absorb it
       (enum? parent candidate)
       (recur pending
              result
              (update parent :values conj (enum-value (:details candidate))))

       ;; parent collected values and the run is over; conj onto the initial
       ;; nil built a list back to front, so restore the document order
       (some? (:values parent))
       (recur pending
              (conj result (update parent :values reverse))
              candidate)

       ;; nothing left to read: emit the pending parent, if there is one
       done?
       (if (nil? parent)
         result
         (conj result parent))

       :else
       (recur pending (conj result parent) candidate)))))
(defn- tidy
  "Normalizes a row map produced from a spec table.

  - :required becomes a boolean: true only when the original value is the
    string \"Required\".
  - :unique true is added when the lower-cased :details (or :defines, when
    there are no details) mention \"dataset unique\".

  Anything that is not a map is returned untouched. Rows with neither
  :details nor :defines are handled as if the description were empty."
  [form]
  (if-not (map? form)
    form
    (let [required? (= "Required" (:required form))
          ;; default to \"\": str/lower-case throws on nil
          details   (str/lower-case (or (:details form) (:defines form) ""))
          unique?   (str/includes? details "dataset unique")]
      (cond-> (assoc form :required required?)
        unique? (assoc :unique true)))))
;; TODO: this is a bit outdated :(
(defn- parse
  "Parses the raw Markdown text of the spec into a map with two entries:

  - :feed-files        the tidied rows of the table found in the section
                       whose header is \"Feed Files\"
  - :field-definitions one map per section whose header ends with \".txt\",
                       holding the :filename (the header) and its :fields
                       (table rows with enum values folded into their
                       parent, leftover rows without field name and
                       required marker dropped, then tidied)

  The document is split into sections only once, and sections without a
  textual header are skipped instead of raising on the nil header."
  [raw]
  (let [parts      (sections (md/parse raw))
        feed-files (some #(when (= "Feed Files" (header %)) %) parts)
        feed-data  (zipify feed-files)
        ;; some-> guards str/ends-with? against sections whose header is nil
        files      (filter #(some-> (header %) (str/ends-with? ".txt")) parts)
        files-data (for [file files]
                     {:filename (header file)
                      :fields   (->> (zipify file)
                                     parse-enums
                                     (remove #(and (empty? (:field-name %))
                                                   (empty? (:required %))))
                                     (map tidy))})]
    {:feed-files        (map tidy feed-data)
     :field-definitions files-data}))
(defn -main
  "Downloads the spec from `url`, parses it and writes the pretty-printed
  result to the file `out`."
  [out]
  (let [spec     (parse (slurp url))
        rendered (with-out-str (pprint/pprint spec))]
    (spit out rendered)))
;(parse (slurp url))
;(-main "resources/reference.edn")