-
Notifications
You must be signed in to change notification settings - Fork 18
/
generate.py
161 lines (124 loc) · 4.41 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
This script downloads source data from data.jmsv.me; this contains an unzipped
and filtered mirror of the http://www1.icsi.berkeley.edu/~demelo/etymwn dataset
Data was filtered using the script at: https://data.jmsv.me/etymwn-filterer.sh
"""
import csv
import hashlib
import os
import io
import json
import gc
import requests
import six
from clint.textui import progress
def prepare(source_dir):
"""
Create data source directory if not exists
"""
if not os.path.exists(source_dir):
os.makedirs(source_dir)
def download_dataset(url, dl_path):
    """
    Download filtered etymwn from the jmsv.me mirror, streaming to disk
    and displaying a progress bar.

    :param url: url of the dataset to download
    :param dl_path: local file path the download is written to
    """
    r = requests.get(url, stream=True)
    # Fail fast on HTTP errors instead of saving an error page as data
    r.raise_for_status()

    with open(dl_path, "wb") as f:
        # Header may be missing; default to 0 so int() doesn't raise on None
        total_length = int(r.headers.get("content-length", 0))
        chunk_size = 4096
        for chunk in progress.bar(
            r.iter_content(chunk_size=chunk_size),
            expected_size=(total_length / chunk_size) + 1,
        ):
            if chunk:
                f.write(chunk)
                f.flush()
    print("Downloaded to " + dl_path)
def verify_local_data(url, dl_path):
    """
    Compare the local file's md5 checksum with the expected checksum
    served at '<url>.checksum'.

    The returned bool determines whether or not data is (re)downloaded.

    :param url: dataset url; '<url>.checksum' serves the expected digest
    :param dl_path: path of the local dataset file
    :return: True if local file matches, otherwise False
    """
    md5 = hashlib.md5()
    try:
        with open(dl_path, "rb") as f:
            # Hash in fixed-size chunks so a large dataset file is never
            # read into memory all at once
            for block in iter(lambda: f.read(65536), b""):
                md5.update(block)
    except EnvironmentError:
        # Return False if file doesn't exist (or isn't readable)
        return False

    actual = md5.hexdigest()
    expected = requests.get("%s.checksum" % url).text.strip()
    return actual == expected
def fix_anomalous_lang_code(code):
    """
    Map known-anomalous language codes in the source data to their
    corrected values; any other code passes through unchanged.

    :param code: language code taken from the source tsv
    :return: corrected language code
    """
    corrections = {"wit": "wnw"}
    return corrections.get(code, code)
def split_elements(compound):
    """
    Split source tsv elements at colon
    e.g.: 'rel:etymology' => ['rel', 'etymology']

    Words containing colons are re-joined so the result is always a
    two-item [language, word] list.

    :return: Elements as list
    """

    def _fix(code):
        # Correct a known-anomalous language code from the source data
        return "wnw" if code == "wit" else code

    parts = [_fix(piece.strip()) for piece in compound.split(":")]
    if len(parts) == 2:
        return parts
    # More (or fewer) than two pieces: keep the language code and fold
    # the remainder back into a single colon-joined word element
    return [_fix(parts[0]), ":".join(parts[1:])]
def generate_json(source_path, dir):
    """
    Reads source tsv and restructures data as described:
    https://github.com/jmsv/ety-python/issues/24

    Result shape: {source_lang: {source_word: [{dest_word: dest_lang}, ...]}}
    written to '<dir>/etymologies.json'.

    :param source_path: path of the source tsv file
    :param dir: directory the etymologies.json output is written into
    """
    result = {}

    print("Loading source tsv")
    with io.open(source_path, "r", newline="", encoding="utf-8") as source:
        source_rows = list(csv.reader(source, delimiter="\t"))
    gc.collect()

    print("Structuring data")
    for row in progress.bar(source_rows):
        # row[0] is 'lang: word', row[2] is the related 'lang: word';
        # group destination entries by source language, then source word
        source_lang, source_word = split_elements(row[0])
        dest_lang, dest_word = split_elements(row[2])
        result.setdefault(source_lang, {}).setdefault(source_word, []).append(
            {dest_word: dest_lang}
        )

    # Save data to separate files for languages, may be required in the future
    # print('Saving language files')
    # for key in progress.bar(result):
    #     with io.open(os.path.join(dir, 'data/ety-%s.json' % key), 'w') as f:
    #         f.write(json.dumps(result[key], sort_keys=False))

    # Save data
    print("Writing etymologies file")
    # json.dump defaults to ensure_ascii=True, so output bytes are
    # unchanged; the explicit encoding just removes locale dependence
    with io.open(os.path.join(dir, "etymologies.json"), "w", encoding="utf-8") as f:
        json.dump(result, f)
def main():
    """
    Define paths, download data if required, generate json dataset
    """
    dir = os.path.dirname(os.path.realpath(__file__))
    source_dir = os.path.join(dir, "source")
    source_path = os.path.join(source_dir, "etymwn.tsv")
    source_url = "https://data.jmsv.me/etymwn-filtered.tsv"

    # Exit if not Python 3
    if not six.PY3:
        print("Script should be run as Python 3, exiting")
        exit(1)

    prepare(source_dir)

    # (Re)download data if the local copy is missing or fails its checksum
    if verify_local_data(source_url, source_path):
        print("Verified local source data")
    else:
        print("Downloading source data")
        download_dataset(source_url, source_path)
        # If checksum still doesn't match after downloading, exit
        if not verify_local_data(source_url, source_path):
            print("Error verifying local source data, exiting")
            exit(1)
        print("Verified local source data")

    generate_json(source_path, dir)
    print("Done")
if __name__ == "__main__":
main()