make_ancient_img_url_list.py — 60 lines (51 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import csv
import logging
import re
import urllib.parse
import urllib.request
from collections import deque
from socket import timeout

from bs4 import BeautifulSoup
def make_ancient_img_url_list():
    """Build a TSV mapping each character to its zdic.net ancient-script image URL.

    Reads characters from the first column of the tab-delimited file
    ``data/source.csv``, fetches each character's page at
    ``http://www.zdic.net/hans/<char>``, extracts the ``<img>`` tagged with
    the ``lazy ypic`` class, and writes ``character<TAB>image_url`` rows to
    ``ancient_img_url.csv``. Characters whose page has no such image are
    written with an empty URL field.

    Timed-out requests are re-queued and retried up to ``max_attempts``
    times per URL, so a permanently unreachable host cannot hang the loop
    forever (the original retried without bound).
    """
    max_attempts = 3  # per-URL retry budget for socket timeouts

    # Read the source characters and build (char, page_url, attempt) work items.
    # deque gives O(1) popleft; the original list.pop(0) was O(n) per item.
    pending = deque()
    with open("data/source.csv", "r", encoding="UTF8", newline="") as csvfile:
        reader = csv.reader(
            csvfile, delimiter="\t", quotechar="|", quoting=csv.QUOTE_MINIMAL
        )
        for row in reader:
            if not row:  # tolerate blank lines in the source file
                continue
            character = row[0]
            url = f"http://www.zdic.net/hans/{urllib.parse.quote(character)}"
            pending.append((character, url, 1))

    results = []  # (character, image_url) pairs, in completion order
    while pending:
        char, url, attempt = pending.popleft()
        try:
            page = urllib.request.urlopen(url, timeout=10).read()
        except timeout:
            logging.error("socket timed out - URL %s", url)
            if attempt < max_attempts:
                # Re-queue at the back so other URLs proceed first.
                pending.append((char, url, attempt + 1))
            else:
                logging.error("giving up after %d attempts - URL %s", max_attempts, url)
                results.append((char, ""))
            continue
        logging.info("Access successful.")
        soup = BeautifulSoup(page, "html.parser")
        img = soup.find(attrs={"class": "lazy ypic"})
        if not img:
            logging.error("No Image - URL %s", url)
            results.append((char, ""))
            continue
        # zdic serves protocol-relative URLs ("//host/..."); pin them to http.
        # (The original re.sub(r"//", ...) replaced EVERY "//", which would
        # corrupt a URL that already carries a scheme.)
        src = img.attrs["data-original"]
        url_img = "http:" + src if src.startswith("//") else src
        results.append((char, url_img))

    # newline="" is required by the csv module to avoid doubled line endings.
    with open("ancient_img_url.csv", "w", encoding="UTF8", newline="") as csvfile:
        writer = csv.writer(
            csvfile, delimiter="\t", quotechar="|", quoting=csv.QUOTE_MINIMAL
        )
        writer.writerows(results)
# Script entry point: run the scrape when executed directly (not on import).
if __name__ == "__main__":
    make_ancient_img_url_list()