In [6]:
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from lxml import html

In [18]:
def get_page_soup_from_href(href):
    """Download a Filmarks page given its site-relative href and parse it.

    Args:
        href: Path (plus optional query string) appended verbatim to
            ``https://filmarks.com``; it is passed through ``str()`` first.

    Returns:
        A BeautifulSoup document built from the response body with the
        ``html.parser`` backend.
    """
    response = requests.get(''.join(('https://filmarks.com', str(href))))
    return BeautifulSoup(response.content, 'html.parser')

In [29]:
def get_movie_ids_from_list_page_soup(list_page_soup):
    """Extract the movie ids found on a parsed list page.

    Each ``div`` with class ``p-movie-cassette js-movie-cassette`` is expected
    to carry a ``data-clip`` attribute containing ``"movie_id":<digits>``.

    Args:
        list_page_soup: Parsed list page (anything exposing ``find_all``).

    Returns:
        ``False`` when the page contains no movie cassette at all (callers use
        this as the "no more pages" signal); otherwise a list of ``int`` ids in
        document order. Cassettes without a ``data-clip`` attribute or without
        a recognisable ``movie_id`` are skipped.
    """
    movies = list_page_soup.find_all('div', class_="p-movie-cassette js-movie-cassette")
    if not movies:
        return False

    movie_ids = []
    for movie in movies:
        data_clip = movie.get('data-clip')
        if not data_clip:
            continue
        # Accept ids of any length: a fixed digit count would truncate longer
        # ids and fail to match shorter ones.
        match = re.match(r'^.*"movie_id":(\d+).*', data_clip)
        if match is None:
            continue
        movie_ids.append(int(match.group(1)))

    return movie_ids

In [30]:
def get_all_movie_ids(base_url):
    """Walk every page of a Filmarks list and gather the movie ids.

    Pages are requested in order starting from 1 until a page yields no ids.

    Args:
        base_url: URL of the list, e.g.
            ``https://filmarks.com/list/year/2010s/2018``.

    Returns:
        A list with the ids of all pages, in the order they were encountered.
    """
    collected_ids = []
    page_number = 1
    while True:
        # Page 1 is fetched without a query string; later pages add ?page=N.
        if page_number > 1:
            response = requests.get(base_url, params={'page': str(page_number)})
        else:
            response = requests.get(base_url)

        ids_on_page = get_movie_ids_from_list_page_soup(
            BeautifulSoup(response.content, 'html.parser'))
        if not ids_on_page:
            # An empty/False result marks the end of the pagination.
            return collected_ids

        collected_ids.extend(ids_on_page)
        page_number += 1

In [17]:
def get_reviews_from_individual_page_soup(individual_page_soup):
    """Pull the review texts out of a parsed movie page.

    Args:
        individual_page_soup: Parsed movie page (anything exposing
            ``find_all``).

    Returns:
        A list with the ``str()`` of each ``div.p-mark__review`` element's
        text, or ``False`` when the page has no such element.
    """
    review_nodes = individual_page_soup.find_all('div', class_="p-mark__review")
    if review_nodes:
        return [str(node.text) for node in review_nodes]
    return False

In [32]:
def get_reviews_from_movie_id(movie_id):
    """Gather the review texts from every page of one movie.

    Pages of ``https://filmarks.com/movies/<movie_id>`` are requested in order
    starting from 1 until a page yields no reviews.

    Args:
        movie_id: Filmarks movie id; converted with ``str()`` to build the URL.

    Returns:
        A list with the review texts of all pages, in the order encountered.
    """
    movie_url = 'https://filmarks.com/movies/' + str(movie_id)
    collected_reviews = []
    page_number = 1
    while True:
        # Page 1 is fetched without a query string; later pages add ?page=N.
        if page_number > 1:
            response = requests.get(movie_url, params={'page': str(page_number)})
        else:
            response = requests.get(movie_url)

        page_reviews = get_reviews_from_individual_page_soup(
            BeautifulSoup(response.content, 'html.parser'))
        if not page_reviews:
            # A page without reviews marks the end of the pagination.
            return collected_reviews

        collected_reviews.extend(page_reviews)
        page_number += 1

In [24]:
def get_last_page_soup(movie_id):
    """Return the parsed last review page of a movie.

    The movie's first page is downloaded and its ``a.c-pagination__last`` link
    is followed.

    Args:
        movie_id: Filmarks movie id; converted with ``str()`` to build the URL.

    Returns:
        The BeautifulSoup document of the page the "last" link points to. When
        the first page has no such link (or the link has no ``href``), the
        first page's own soup is returned.
    """
    base_url = 'https://filmarks.com/movies/'
    page = requests.get(base_url + str(movie_id))
    soup = BeautifulSoup(page.content, 'html.parser')

    last_link = soup.find('a', class_="c-pagination__last")
    href = last_link.get("href") if last_link is not None else None
    if not href:
        # find() returns None when the link is absent.
        # NOTE(review): presumably that means the movie has a single page of
        # reviews, so the first page is also the last one -- confirm.
        return soup
    return get_page_soup_from_href(href)
# Example: get_last_page_soup(79340)

In [25]:
def get_previous_page(page_soup):
    """Fetch and parse the page that precedes the given review page.

    Args:
        page_soup: Parsed review page whose "previous" pagination link should
            be followed.

    Returns:
        The BeautifulSoup document of the previous page, or ``False`` when
        ``page_soup`` has no previous-page link (or the link has no ``href``).
    """
    # NOTE(review): get_last_page_soup looks up "c-pagination__last"; confirm
    # that "pagination__prev" (without the "c-" prefix) is the right class.
    prev_link = page_soup.find("a", class_="pagination__prev")
    prev_page_loc = prev_link.get("href") if prev_link is not None else None
    if prev_page_loc is None:
        return False
    # Pagination hrefs are resolved against the site root, the same way
    # get_last_page_soup resolves the "last" link.
    return get_page_soup_from_href(prev_page_loc)

In [None]:
def get_last_5_page_reviews(movie_id):
    """Collect the reviews from up to the last five review pages of a movie.

    Starts at the page returned by ``get_last_page_soup`` and walks backwards
    with ``get_previous_page``.

    Args:
        movie_id: Filmarks movie id.

    Returns:
        A list of review texts, last page first. The walk stops early when a
        page has no reviews or when there is no previous page.
    """
    soup = get_last_page_soup(movie_id)
    reviews = []
    for page_index in range(5):
        if page_index > 0:
            # Step back one page; only done when another page is still needed,
            # so no request is wasted after the fifth page.
            soup = get_previous_page(soup)
            if soup is False:
                break
        page_reviews = get_reviews_from_individual_page_soup(soup)
        if page_reviews is False:
            break
        reviews.extend(page_reviews)
    return reviews

In [33]:
def get_title_from_movie_id(movie_id):
    """Look up a movie's title on its Filmarks page.

    Args:
        movie_id: Filmarks movie id; converted with ``str()`` to build the URL.

    Returns:
        The text of the ``span`` inside the page's
        ``h2.p-content-detail__title`` element.
    """
    response = requests.get('https://filmarks.com/movies/' + str(movie_id))
    document = BeautifulSoup(response.content, 'html.parser')
    heading = document.find('h2', class_="p-content-detail__title")
    return heading.span.text

In [34]:
def write_json(title, filmarks_reviews):
    """Write a movie's title and reviews to ``jsons/<title>.json``.

    The file holds ``{"title": ..., "reviews": {"filmarks": ...}}`` with sorted
    keys and a four-space indent.

    Args:
        title: Movie title. Stored unchanged in the JSON; for the file name any
            path separator it contains is replaced by ``_`` so a title such as
            ``"Fate/stay night"`` does not point into a sub-directory.
        filmarks_reviews: JSON-serialisable reviews (a list of strings when it
            comes from ``get_reviews_from_movie_id``).
    """
    data = {
        "title": title,
        "reviews": {"filmarks": filmarks_reviews},
    }

    safe_title = str(title)
    for separator in {'/', os.sep, os.altsep}:
        if separator:  # os.altsep is None on POSIX
            safe_title = safe_title.replace(separator, '_')

    # The output directory may not exist on a fresh checkout.
    os.makedirs('jsons', exist_ok=True)
    with open('jsons/{}.json'.format(safe_title), "w", encoding="utf-8") as f:
        json.dump(data, f, sort_keys=True, indent=4, separators=(',', ': '))

In [35]:
def main(movie_ids):
    """Scrape and store the reviews of each given movie.

    For every id, in order: the title is fetched and printed, the reviews are
    fetched and their count printed, then both are handed to ``write_json``.

    Args:
        movie_ids: Iterable of Filmarks movie ids.
    """
    for current_id in movie_ids:
        movie_title = get_title_from_movie_id(current_id)
        print("title: {}".format(movie_title))

        movie_reviews = get_reviews_from_movie_id(current_id)
        review_count = len(movie_reviews)
        print("reveiws: {}".format(review_count))

        write_json(movie_title, movie_reviews)

#main(all_movie_ids)

In [None]:
# Persist all_movie_ids to movie_ids.csv as one comma-separated row.
import csv

# newline='' leaves line endings to the csv writer, so the lineterminator
# requested below is written verbatim instead of being translated by the
# text layer (which would turn '\n' into '\r\n' on Windows).
with open('movie_ids.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(all_movie_ids)

In [20]:
# Load movie_ids.csv back into all_movie_ids.
# Every cell of every row is appended as-is, so the ids are strings here.
import csv

all_movie_ids = []
with open('movie_ids.csv', mode='r', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        all_movie_ids += row

## TEST