In [131]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import re

In [132]:
def get_movie_ids_from_list_page_soup(list_page_soup):
    """
    Extract the movie ids found on one list page.

    Every ``div`` whose class attribute is
    ``"p-movie-cassette js-movie-cassette"`` is inspected, and the id is read
    from the ``"movie_id":<digits>`` fragment of its ``data-clip`` attribute.

    Parameters
    ----------
    list_page_soup
        Parsed page exposing ``find_all`` (a BeautifulSoup object in this
        notebook).

    Returns
    -------
    list of int, or False
        ``False`` when the page contains no movie cassettes at all; otherwise
        the ids in the order ``find_all`` returned the cassettes. Cassettes
        whose ``data-clip`` is missing or holds no ``"movie_id":<digits>``
        fragment are skipped, so the list may be empty. Both ``False`` and an
        empty list are falsy for callers that test ``if not movie_ids``.
    """
    movies = list_page_soup.find_all('div', class_="p-movie-cassette js-movie-cassette")

    if not movies:
        return False

    movie_ids = []
    for movie in movies:
        data_clip = movie.get('data-clip')
        if not data_clip:
            # No attribute to parse; skip instead of passing None to re.match.
            continue

        # \d+ instead of \d{5}: ids are not guaranteed to have exactly five
        # digits, and a fixed width would truncate longer ids or fail on
        # shorter ones. The leading greedy ".*" keeps the original behaviour of
        # picking the last occurrence; DOTALL lets it span embedded newlines.
        found = re.match(r'.*"movie_id":(\d+)', data_clip, re.DOTALL)
        if found is None:
            continue
        movie_ids.append(int(found.group(1)))

    return movie_ids

In [133]:
def get_all_movie_ids(base_url):
    """
    Walk the paginated movie list at ``base_url`` and gather every movie id.
    (ex. https://filmarks.com/list/year/2010s/2018)

    Pages are requested one after another, starting at page 1, and each is
    handed to ``get_movie_ids_from_list_page_soup``. The walk ends at the
    first page for which that helper returns a falsy result.

    Returns the ids of all visited pages as one flat list, in fetch order.
    """
    collected = []
    current_page = 1
    while True:
        # Page 1 is fetched from the bare URL; later pages add ?page=N.
        query = None if current_page == 1 else {'page': str(current_page)}
        response = requests.get(base_url, params=query)
        soup = BeautifulSoup(response.content, 'html.parser')

        ids_on_page = get_movie_ids_from_list_page_soup(soup)
        if not ids_on_page:
            return collected

        collected.extend(ids_on_page)
        current_page += 1

In [134]:
def get_reviews_from_individual_page_soup(individual_page_soup):
    """
    Pull the review texts out of a parsed individual movie page.

    Looks up every ``div`` with class ``p-mark__review`` and returns the
    ``.text`` of each as a list, in the order they were found. Returns
    ``False`` when the page has no such element.
    """
    review_nodes = individual_page_soup.find_all('div', class_="p-mark__review")
    if review_nodes:
        return [node.text for node in review_nodes]
    return False

In [135]:
def get_reviews_from_movie_id(movie_id):
    """
    Collect all reviews tied to a given movie_id and return them as a list.

    Pages of ``https://filmarks.com/movies/<movie_id>`` are requested in
    order (page 1 from the bare URL, later pages with a ``page`` query
    parameter) and parsed with ``get_reviews_from_individual_page_soup``.
    Fetching stops at the first page for which that helper returns a falsy
    result.

    Parameters
    ----------
    movie_id
        Identifier appended to the movie URL after conversion with ``str()``.

    Returns
    -------
    list
        The review texts of every fetched page, in fetch order; empty when
        the first page already yields no reviews.
    """
    all_reviews = []
    base_url = 'https://filmarks.com/movies/' + str(movie_id)
    page_number = 1
    while True:
        # request GET; params=None on page 1 is the same as omitting it.
        payload = None if page_number == 1 else {'page': '{}'.format(page_number)}
        page = requests.get(base_url, params=payload)
        individual_page_soup = BeautifulSoup(page.content, 'html.parser')

        reviews = get_reviews_from_individual_page_soup(individual_page_soup)
        # The per-page debug print of the raw review list was removed: it
        # flooded the cell output with full review text on every page.
        if not reviews:
            break
        all_reviews.extend(reviews)
        page_number += 1
    return all_reviews

In [None]:
import csv

# Scrape before opening the output file, so a failure while fetching does not
# leave a freshly truncated, empty movie_ids.csv behind.
movie_ids = get_all_movie_ids('https://filmarks.com/list/year/2010s/2018')

# newline='' hands line-ending control to the csv writer, as the csv module
# documentation requires for files passed to csv.writer.
with open('movie_ids.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    # writerows() expects an iterable of rows, and each row must itself be
    # iterable. Passing the flat list of ints raises
    # "_csv.Error: iterable expected, not int", so each id becomes a
    # one-column row.
    writer.writerows([movie_id] for movie_id in movie_ids)