In [29]:
import csv
import logging
logging.basicConfig(format='%(message)s')
import requests
from bs4 import BeautifulSoup
import time
import re
import json
import datetime

In [30]:
# Maps the second CSS class of a review's "rating-star" span to its star value.
_RATING_BY_CLASS = {
    "val00": 0.0, "val05": 0.5, "val10": 1.0, "val15": 1.5,
    "val20": 2.0, "val25": 2.5, "val30": 3.0, "val35": 3.5,
    "val40": 4.0, "val45": 4.5, "val50": 5.0,
}


def _parse_review(review_div):
    """Convert one ``div.user-review`` element into a review record dict."""
    heading = review_div.find('h2', attrs={"class": "review-title"})
    heading_link = heading.find('a')

    # The star value is encoded in the second class name of the span (e.g. "val35").
    rating_class = heading.find('span', attrs={"class": "rating-star"}).get('class')[1]
    rating = _RATING_BY_CLASS[rating_class]

    empathy = review_div.find('div', attrs={"class": "empathy"}).find('strong').text

    date_tag = review_div.find('div', attrs={"class": "review-data"}).find('div', attrs={"class": "time"})
    body_tag = review_div.find('div', attrs={"class": "txt-block"})

    # Drop the toggle-button element, if present, so its text does not end up in the review.
    toggle_btn = body_tag.find('div', attrs={"class": "toggle-btn"})
    if toggle_btn is not None:
        toggle_btn.decompose()

    # The date text is split on the year/month/day markers; exactly one trailing
    # remainder after the day marker is expected and discarded.
    year, month, day, _ = re.split('[年月日]', date_tag.text)

    return {
        "date": str(datetime.date(int(year), int(month), int(day))),
        "rating": rating,
        "empathy": int(empathy),
        "review": heading_link.text + "\n" + body_tag.text.replace("\n", ""),
    }


def scrape_review(eigacom_id, timeout=30, delay=1):
    """Collect all user reviews of one movie from eiga.com.

    Pages ``https://eiga.com/movie/<eigacom_id>/review/all/<n>`` are requested
    for n = 1, 2, ... until a page contains no ``div.user-review`` element.

    Args:
        eigacom_id: Movie identifier used in the eiga.com URL (str or int).
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            fails instead of blocking the crawl forever.
        delay: Seconds to sleep between consecutive page requests.

    Returns:
        ``{"title": <str>, "reviews": [<record>, ...]}`` where each record has
        the keys ``date`` (ISO date string), ``rating`` (float), ``empathy``
        (int) and ``review`` (review title + newline + body text), or ``None``
        when the first page has no ``p.title-link`` element.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        AttributeError, KeyError, ValueError: If a review element does not
            have the markup this parser looks for.
    """
    movie_id = str(eigacom_id)
    data = {
        "title": -1,
        "reviews": [],
    }

    print("START : " + movie_id)
    url_review = 'https://eiga.com/movie/' + movie_id + '/review/all/'

    page_num = 1
    while True:
        res = requests.get(url_review + str(page_num), timeout=timeout)
        # The raw bytes are parsed, so BeautifulSoup does the charset detection itself.
        soup = BeautifulSoup(res.content, "lxml")

        if page_num == 1:
            title_tag = soup.find('p', attrs={"class": "title-link"})
            if title_tag is None:
                # No movie title on the first page: report it and let the caller skip this id.
                logging.warning("**************************************************")
                logging.warning(movie_id + " HAS NO RESULT")
                logging.warning("**************************************************")
                return None
            data["title"] = title_tag.text

        review_divs = soup.find_all('div', attrs={"class": "user-review"})
        if not review_divs:  # past the last page of reviews
            print('DONE : ' + movie_id)
            break

        for review_div in review_divs:
            data["reviews"].append(_parse_review(review_div))

        page_num += 1
        time.sleep(delay)  # be polite to the server between requests
    return data

In [32]:
def main():
    """Scrape reviews for every movie id listed in the yearly id tables.

    Reads ``./eigacom_nomination_id_table/<year>.txt`` for each year in
    1978..2019 as CSV, takes the first field of each row as the eiga.com id,
    scrapes it with ``scrape_review`` and writes all results, keyed by a
    running movie number (as a string), to ``../../data/eigacom_review.json``.

    Movies for which ``scrape_review`` returns ``None`` still consume a movie
    number but are left out of the output.
    """
    data_all = {}
    movie_id = 1

    for year in range(1978, 2020):
        print(year)
        table_path = './eigacom_nomination_id_table/{}.txt'.format(year)
        # newline='' is what the csv module expects from the file object.
        with open(table_path, 'r', newline='') as id_table:
            for row in csv.reader(id_table):
                # csv.reader yields [] for a blank line; skip those (and rows
                # with a blank id) instead of failing on the unpack.
                if not row or not row[0].strip():
                    continue
                eigacom_id = row[0]

                print(movie_id)
                data = scrape_review(eigacom_id)

                if data is not None:
                    data_all[str(movie_id)] = data
                movie_id += 1

    output_file = '../../data/eigacom_review.json'
    # ensure_ascii=False writes Japanese text verbatim, so the file encoding
    # must not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data_all, f, ensure_ascii=False, indent=2)

In [33]:
# Entry point: start the full crawl only when this code runs as the main module.
if __name__ == "__main__":
    main()

1978
1
START : 55324
DONE : 55324
2
START : 37417
DONE : 37417
3
START : 37793
DONE : 37793
4
START : 38771
DONE : 38771
5
START : 38856
DONE : 38856
1979
6
START : 36838
DONE : 36838
7
START : 2625
DONE : 2625
8
START : 35871
DONE : 35871
9
START : 36604
DONE : 36604
10
START : 39774
DONE : 39774
1980
11
START : 39168
DONE : 39168
12
START : 68820
DONE : 68820
13
START : 37092
DONE : 37092
14
START : 37677
DONE : 37677
15
START : 39720
DONE : 39720
1981
16
START : 37867
DONE : 37867
17
START : 37809
DONE : 37809
18
START : 38100
DONE : 38100
19
START : 38463
DONE : 38463
20
START : 38898
DONE : 38898
1982
21
START : 35007
DONE : 35007
22
START : 35006
DONE : 35006
23
START : 35114
DONE : 35114
24
START : 38249
DONE : 38249
25
START : 38503
DONE : 38503
1983
26
START : 35661
DONE : 35661
27
START : 36010
DONE : 36010
28
START : 36013
DONE : 36013
29
START : 39560
DONE : 39560
30
START : 39868
DONE : 39868
1984
31
START : 38363
DONE : 38363
32
START : 7947
DONE : 7947
33
START : 36649
D