In [1]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import re
import json

In [2]:
def get_rating_from_soup(soup):
    """Return the movie's rating as a float, or None when it is unavailable.

    Args:
        soup: Parsed movie page; must provide a BeautifulSoup-style ``find``.

    Returns:
        The text of the first ``<div class="c-rating__score">`` converted to
        ``float``. ``None`` if that element is missing or its text is not a
        valid number.
    """
    score_div = soup.find('div', class_="c-rating__score")
    if score_div is None:
        # No score element on the page: report "no rating" instead of
        # raising AttributeError on ``None.text``.
        return None
    try:
        return float(score_div.text)
    except (TypeError, ValueError):
        # Text is present but is not numeric.
        return None
# get_rating_from_soup(get_soup_from_movie_id(85659))

In [3]:
def get_genres_from_soup(soup):
    """Return the list of genre names found on a movie page.

    Args:
        soup: Parsed movie page; must provide a BeautifulSoup-style ``find``.

    Returns:
        A list with the stripped text of each child of the ``<ul>`` inside
        ``<div class="p-content-detail__genre">``. Children whose text is
        blank (e.g. whitespace between list items) are skipped. ``None`` if
        the div or its ``<ul>`` is missing.
    """
    genre_div = soup.find('div', class_="p-content-detail__genre")
    genre_list = getattr(genre_div, 'ul', None)
    if genre_list is None:
        return None

    genres = []
    for child in genre_list:
        # Iterating the <ul> yields every child node, not only <li> tags, so
        # tolerate nodes without usable text and drop blank ones.
        label = (getattr(child, 'text', None) or '').strip()
        if label:
            genres.append(label)
    return genres
# get_genres_from_soup(get_soup_from_movie_id(82210))

In [4]:
def get_synopsis_from_soup(soup):
    """Return the synopsis text embedded in a movie page, or None.

    The synopsis is read from the ``:outline`` attribute of the
    ``<content-detail-synopsis>`` element inside
    ``<div id="js-content-detail-synopsis">``; surrounding double quotes are
    stripped from the attribute value.

    Args:
        soup: Parsed movie page; must provide a BeautifulSoup-style ``find``.

    Returns:
        The synopsis string, or ``None`` if any element or the attribute is
        missing.
    """
    try:
        container = soup.find('div', {"id": "js-content-detail-synopsis"})
        component = container.find("content-detail-synopsis")
        outline = component.get(":outline")
        return outline.strip(r'"')
    except AttributeError:
        # One of the lookups returned None. Only this failure is treated as
        # "no synopsis"; anything else (e.g. KeyboardInterrupt) propagates.
        return None
# get_synopsis_from_soup(get_soup_from_movie_id(82210))

In [5]:
def get_soup_from_movie_id(movie_id, timeout=30):
    """Download a Filmarks movie page and parse it.

    Args:
        movie_id: Identifier appended to ``https://filmarks.com/movies/``;
            converted with ``str``.
        timeout: Seconds passed to ``requests.get``. Without a timeout a
            stalled connection would block the caller indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend. The HTTP status code is not checked.
    """
    movie_url = 'https://filmarks.com/movies/' + str(movie_id)
    response = requests.get(movie_url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

In [6]:
def get_title_from_movie_id(movie_id):
    """Fetch a movie page and return its normalised title.

    Args:
        movie_id: Identifier of the movie page to download.

    Returns:
        The text of the ``<span>`` inside
        ``<h2 class="p-content-detail__title">``, passed through ``trans``.

    Raises:
        AttributeError: If the title heading or its ``<span>`` is missing.
    """
    # Reuse the shared page loader instead of repeating the request/parse code.
    soup = get_soup_from_movie_id(movie_id)
    raw_title = soup.find('h2', class_="p-content-detail__title").span.text
    return trans(raw_title)

In [7]:
def trans(title_string):
    """Normalise a movie title.

    Steps, applied in order:
      1. remove full-width parenthesised segments ``（...）``;
      2. remove ASCII parenthesised segments ``(...)``;
      3. collapse each run of whitespace/punctuation separators into one space;
      4. map full-width digits and a few full-width symbols to ASCII;
      5. strip trailing spaces (leading spaces are kept).

    Args:
        title_string: Raw title text.

    Returns:
        The normalised title string.
    """
    fullwidth_to_ascii = str.maketrans({
        '１': '1',
        '２': '2',
        '３': '3',
        '４': '4',
        '５': '5',
        '６': '6',
        '７': '7',
        '８': '8',
        '９': '9',
        '０': '0',
        '＆': '&',
        '％': '%',
        '＝': '=',
        '＄': '$',
        '＃': '#',
        '！': '!',
        '？': '?'
    })
    # (pattern, replacement) pairs; order matters.
    substitutions = (
        (r'（[^（）]*）', ''),
        (r'\([^\(\)]*\)', ''),
        (r'[\s\-～〜:：;、。<>＜＞「」\"\',\.・/／－]+', ' '),
    )

    cleaned = title_string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.translate(fullwidth_to_ascii).rstrip(' ')

In [8]:
def write_json(year, title, genre, rating, synopsis):
    """Write one movie's metadata to ``jsons/metadata/<year>/<title>.json``.

    Args:
        year: Used as the sub-directory name.
        title: Stored under ``"title"`` and used as the file name.
        genre: Stored under ``"genre"``.
        rating: Stored under ``"rating"``.
        synopsis: Stored under ``"synopsis"``.

    The file is UTF-8 encoded, keys are sorted, and the JSON is indented by
    four spaces. The year directory is created if it does not exist.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Build the record in one literal; previously the dict was reset after
    # "title" was set, so the title never reached the file.
    data = {
        "title": title,
        "genre": genre,
        "rating": rating,
        "synopsis": synopsis,
    }
    out_dir = 'jsons/metadata/{}'.format(year)
    os.makedirs(out_dir, exist_ok=True)
    with open('{}/{}.json'.format(out_dir, title), "w", encoding="utf-8") as f:
        json.dump(data, f, sort_keys=True, indent=4, separators=(',', ': '))

In [None]:
def _get_title_from_soup(soup):
    """Return the normalised title from an already-parsed movie page."""
    raw_title = soup.find('h2', class_="p-content-detail__title").span.text
    return trans(raw_title)


def main(movie_ids, year):
    """Scrape each movie page and write its metadata to a JSON file.

    Args:
        movie_ids: Iterable of movie identifiers.
        year: Passed to ``write_json`` to select the output directory.

    Each page is downloaded once and every field (title, genres, rating,
    synopsis) is extracted from that single parsed page.
    """
    for movie_id in movie_ids:
        soup = get_soup_from_movie_id(movie_id)

        title = _get_title_from_soup(soup)
        genre = get_genres_from_soup(soup)
        rating = get_rating_from_soup(soup)
        synopsis = get_synopsis_from_soup(soup)

        write_json(year, title, genre, rating, synopsis)
# Usage: main(movie_ids, year)

In [None]:
# Read movie_ids_<year>.csv for each year and scrape every listed movie.
import csv
for year in range(2008, 2013):
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open('movie_ids_{}.csv'.format(year), 'r', encoding='utf-8', newline='') as f:
        # Flatten all rows into one list of ids. Blank cells (e.g. from a
        # trailing comma or an empty line) are skipped so they are not
        # requested as movie pages.
        movie_ids = [
            cell.strip()
            for row in csv.reader(f, delimiter=',')
            for cell in row
            if cell.strip()
        ]
    main(movie_ids, year)

## TEST