-
Notifications
You must be signed in to change notification settings - Fork 0
/
anime_title_en2ja.py
96 lines (77 loc) · 2.33 KB
/
anime_title_en2ja.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from selenium import webdriver
import time
from tqdm import tqdm
import chromedriver_binary
import pickle
from bs4 import BeautifulSoup
import argparse as ap
import os
import re
def get_driver():
    """Create a Chrome WebDriver that runs in headless mode.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless``
        argument. This function does not close it; that is left to the
        caller.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    return webdriver.Chrome(options=chrome_options)
def get_title(driver, url, en_title, tag):
    """Load a search page for *en_title* and return the elements matching *tag*.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Search URL that ends inside an open query value (for example
            ``...search?num=1&q=keyword``). The title is appended to that
            value as an additional search term.
        en_title: Title to search for.
        tag: CSS selector handed to ``BeautifulSoup.select``.

    Returns:
        The list of elements matched by *tag*; empty when nothing matches.
    """
    # Function-scope import so the file's import block needs no change.
    from urllib.parse import quote_plus

    # Percent-encode the title: a raw "&", "#" or "+" in it would otherwise
    # end or alter the query value and silently change the search terms.
    # "+" is the query-string encoding of the separating space.
    search_url = f"{url}+{quote_plus(en_title)}"
    driver.get(search_url)
    print(search_url)
    # Fixed pause before reading the HTML (presumably to let the page finish
    # rendering -- confirm whether an explicit wait would be preferable).
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "lxml")
    title = soup.select(tag)
    print(title)
    return title
def scraper(driver, en_title):
    """Look up a Japanese title for *en_title* through two Google searches.

    The first search prefixes the title with "アニメ" and reads the heading
    selected by ``h2.qrShPb...> span`` (presumably Google's knowledge-panel
    title -- confirm against the live page). If that yields nothing, a second
    search prefixes the title with "wikipedia" and reads the first ``h3``
    result heading, stripping the Wikipedia site suffix and any trailing
    parenthetical.

    Args:
        driver: Selenium WebDriver forwarded to ``get_title``.
        en_title: English title to translate.

    Returns:
        The title text that was found. Falls back to *en_title* itself when
        neither search matches anything, and to the raw result heading when
        that heading does not carry a Wikipedia suffix.
    """
    url = 'https://www.google.co.jp/search?num=1&q=アニメ'
    tag = "h2.qrShPb.kno-ecr-pt.PZPZlf.mfMhoc> span"
    title = get_title(driver, url, en_title, tag)
    if title:
        title = title[0].get_text()
        print(f"[1] en:{en_title} || ja:{title}")
        return title

    # Fallback: take the heading of the first result of a Wikipedia search.
    url = 'https://www.google.co.jp/search?num=1&q=wikipedia'
    tag = "h3.LC20lb.DKV0Md"
    title = get_title(driver, url, en_title, tag)
    if not title:
        return en_title

    title = title[0].get_text()

    # Strip whichever Wikipedia suffix is present. Previously the failure
    # branch was attached to the English-suffix check alone, so a
    # Japanese-Wikipedia hit was reported as a failed search and returned
    # before the parenthetical was removed.
    is_wikipedia = False
    for suffix in (' - ウィキペディア', ' - Wikipedia'):
        if suffix in title:
            title = title.replace(suffix, "")
            is_wikipedia = True
    if not is_wikipedia:
        print('エラー:検索失敗 - ', en_title, title)
        return title

    # Remove a trailing parenthetical such as "Title (anime)".
    # str.find returns -1 when absent, so its result must be compared
    # explicitly instead of being used as a boolean. A delimiter at index 0
    # is skipped so the title is never reduced to an empty string, and the
    # remainder is right-stripped so no dangling space is left behind.
    for opener in ("(", " ("):
        cut = title.find(opener)
        if cut > 0:
            head = title[:cut].rstrip()
            if head:
                title = head
    print(f"[2] en:{en_title} || ja:{title}")
    return title
# Browser session shared by every lookup in this script.
driver = get_driver()

# Source data for the translation run; first() reads the values of this
# mapping as English titles (presumably keyed by anime id -- confirm).
# NOTE: unpickling can execute arbitrary code, so "id2anime.pkl" must come
# from a trusted source.
with open("id2anime.pkl", mode="rb") as f:
    id2anime = pickle.load(f)
def load_pkl():
    """Read and return the object pickled in ``en2ja.pkl``.

    Returns:
        Whatever object the file contains (``en2ja.pkl`` is also the default
        file name written by ``output``).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open('en2ja.pkl', mode='rb') as pkl_file:
        return pickle.load(pkl_file)
def output(en2ja, name="en2ja.pkl"):
    """Pickle *en2ja* into the file *name*.

    Args:
        en2ja: Object to serialize.
        name: Destination path; the file is opened in ``'wb'`` mode, so any
            existing content is overwritten.
    """
    print("save pickle")
    with open(name, mode='wb') as pkl_file:
        pickle.dump(en2ja, pkl_file)
def first():
    """Translate every title in ``id2anime`` and pickle the resulting mapping.

    Each value of the module-level ``id2anime`` mapping is passed to
    ``scraper`` with the shared ``driver``; results are collected in a dict
    keyed by the English title. The dict is saved with ``output`` after every
    100 titles and once more when the loop ends.

    Raises:
        Any exception raised by ``scraper`` or ``output``. The titles
        collected so far are still written before the exception propagates.
    """
    en2ja = {}
    try:
        # Only the titles are used, so iterate the values directly.
        for cnt, en_title in enumerate(tqdm(id2anime.values()), start=1):
            en2ja[en_title] = scraper(driver, en_title)
            # Pause between requests, with a longer pause every 30 titles
            # (presumably to avoid being rate-limited -- confirm).
            time.sleep(4.2)
            if cnt % 30 == 0:
                time.sleep(10)
            # Periodic checkpoint.
            if cnt % 100 == 0:
                output(en2ja)
    finally:
        # Runs on success and on failure alike, so an error or Ctrl-C midway
        # no longer discards the titles scraped since the last checkpoint.
        output(en2ja)
# Run the scrape, then always shut the browser down so the headless Chrome
# process does not outlive the script, even when the run fails.
try:
    first()
finally:
    driver.quit()