In [5]:
import requests
from bs4 import BeautifulSoup

# Scrape all user reviews for one IMDb title: fetch the first reviews page,
# then keep requesting the "_ajax" endpoint with the pagination key found on
# each page until no further key is present.
# NOTE: the extract_* helpers called below are defined in a later cell of this
# notebook; that cell has to be executed before this one.
TEST_ID = '/title/tt13433812/'
start_url = f'https://www.imdb.com{TEST_ID}reviews?ref_=tt_urv'
link = f'https://www.imdb.com{TEST_ID}reviews/_ajax'

# Seconds to wait for each HTTP response; without a timeout requests can
# block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

params = {
    'ref_': 'undefined',
    'paginationKey': ''
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    res = s.get(start_url, timeout=REQUEST_TIMEOUT)

    title_reviews = []
    while True:
        # Fail loudly on 4xx/5xx instead of parsing an error page as
        # "zero reviews, no more pages" and stopping silently.
        res.raise_for_status()

        soup = BeautifulSoup(res.text, 'lxml')
        for container in soup.select('.review-container'):
            review = {
                'movie_id': TEST_ID,
                'text': extract_text(container),
                'rating': extract_rating(container),
                'date': extract_date(container),
                'title': extract_title(container),
                'author': extract_author(container),
                'helpfulness': extract_helpfulness(container)
            }
            title_reviews.append(review)

        # The element carrying the next page's key is absent on the last page.
        load_more = soup.select_one(".load-more-data[data-key]")
        if load_more is None:
            break
        pagination_key = load_more.get("data-key")

        params['paginationKey'] = pagination_key
        res = s.get(link, params=params, timeout=REQUEST_TIMEOUT)

In [15]:
# from recsys.utils import write_csv, read_csv
import csv

def write_csv(data, path: str,
              fieldnames: 'str | list[str]' = 'infer', mode: str = 'a',
              encoding: str = 'utf8') -> None:
    """Write a sequence of dict rows to a CSV file.

    Parameters
    ----------
    data
        Indexable sequence of dicts, one per row.
    path
        Destination file path.
    fieldnames
        Column names in output order, or ``'infer'`` to take them from the
        keys of the first row.
    mode
        File mode passed to ``open``; defaults to append.
    encoding
        Text encoding of the output file.

    Notes
    -----
    The header row is written only when the file is empty at the moment it is
    opened, so repeated appends to the same file do not repeat the header.
    With ``fieldnames='infer'`` and empty ``data`` there is nothing to infer
    or write, so the call returns without touching the file.
    """
    infer = isinstance(fieldnames, str) and fieldnames == 'infer'
    if infer:
        if len(data) == 0:
            return
        fieldnames_ = list(data[0].keys())
    else:
        fieldnames_ = fieldnames

    with open(path, mode, newline='', encoding=encoding) as file:
        writer = csv.DictWriter(file, fieldnames_)
        # In append mode the stream is positioned at end-of-file, so a
        # non-zero offset means earlier content (and its header) exists.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerows(data)


def read_csv(path, encoding: str = 'utf8'):
    """Load a CSV file and return its rows as a list of dicts.

    The first line of the file is used as the header; each following line
    becomes one dict mapping column name to the cell's string value.
    """
    with open(path, mode='r', encoding=encoding, newline='') as handle:
        reader = csv.DictReader(handle)
        rows = [row for row in reader]
    return rows

# Enable IPython's autoreload extension so edited modules are re-imported
# before each cell runs. Re-running this cell in a kernel where the extension
# is already loaded prints an "already loaded" notice (see the output below).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Persist the scraped reviews for the test title.
test_path = '../data/raw/title_reviews/test_reviews.csv'
# Output column order; these are the keys of each scraped review dict.
columns = [
    'movie_id',
    'text',
    'rating',
    'date',
    'title',
    'author',
    'helpfulness'
]
# write_csv appends by default; overwrite explicitly so that re-running this
# cell does not stack duplicate rows (and header lines) in the file.
write_csv(title_reviews, test_path, fieldnames=columns, mode='w')

In [23]:
import pandas as pd

# Read the CSV at test_path back into a pandas DataFrame.
test_data = pd.read_csv(test_path)

In [4]:
def extract_text(container: 'BeautifulSoup') -> 'str | None':
    """Return the review body text from one review element.

    Parameters
    ----------
    container
        Parsed element for a single review (anything exposing a
        BeautifulSoup-style ``find``). The annotation is quoted so defining
        this function does not require bs4 to be imported first.

    Returns
    -------
    The text of the ``div`` with class ``text show-more__control``, or
    ``None`` when the element has no such ``div``.
    """
    text_raw = container.find('div', {'class': 'text show-more__control'})
    # Only the "element not found" case maps to None; unexpected errors
    # propagate instead of being hidden by a bare except.
    if text_raw is None:
        return None
    return text_raw.text


def extract_rating(container: 'BeautifulSoup') -> 'int | None':
    """Return the numeric rating from one review element, if present.

    The rating is read positionally from the second ``span`` inside the
    element. When the reviewer gave no rating, that span holds other text
    (the review date), which is detected by its length.

    Parameters
    ----------
    container
        Parsed element for a single review (anything exposing a
        BeautifulSoup-style ``find_all``). The annotation is quoted so
        defining this function does not require bs4 to be imported first.

    Returns
    -------
    The rating as an ``int``, or ``None`` when there are fewer than two
    spans, the second span is longer than two characters, or its text is not
    an integer.
    """
    rating_raw = container.find_all('span')
    if len(rating_raw) < 2:
        return None
    rating = rating_raw[1].text
    # If no rating was given, the span block contains the review date
    if len(rating) > 2:
        return None
    try:
        return int(rating)
    except ValueError:
        # Short but non-numeric text: treat as "no rating".
        return None


def extract_date(container: 'BeautifulSoup') -> 'str | None':
    """Return the review date text from one review element.

    Parameters
    ----------
    container
        Parsed element for a single review (anything exposing a
        BeautifulSoup-style ``find``). The annotation is quoted so defining
        this function does not require bs4 to be imported first.

    Returns
    -------
    The text of the ``span`` with class ``review-date`` (unparsed), or
    ``None`` when the element has no such ``span``.
    """
    date_raw = container.find('span', {'class': 'review-date'})
    # Only the "element not found" case maps to None; unexpected errors
    # propagate instead of being hidden by a bare except.
    if date_raw is None:
        return None
    return date_raw.text

def extract_title(container: 'BeautifulSoup') -> 'str | None':
    """Return the review headline text from one review element.

    Parameters
    ----------
    container
        Parsed element for a single review (anything exposing a
        BeautifulSoup-style ``find``). The annotation is quoted so defining
        this function does not require bs4 to be imported first.

    Returns
    -------
    The text of the ``a`` element with class ``title``, or ``None`` when the
    element has no such link.
    """
    title_raw = container.find('a', {'class': 'title'})
    # Only the "element not found" case maps to None; unexpected errors
    # propagate instead of being hidden by a bare except.
    if title_raw is None:
        return None
    return title_raw.text

def extract_author(container: 'BeautifulSoup') -> 'str | None':
    """Return the reviewer's profile link from one review element.

    Note that this yields the ``href`` of the author link, not the author's
    display name.

    Parameters
    ----------
    container
        Parsed element for a single review (anything exposing a
        BeautifulSoup-style ``find``). The annotation is quoted so defining
        this function does not require bs4 to be imported first.

    Returns
    -------
    The ``href`` of the first ``a`` inside the ``span`` with class
    ``display-name-link``, or ``None`` when the span, the anchor, or the
    ``href`` attribute is missing.
    """
    author_raw = container.find('span', {'class': 'display-name-link'})
    if author_raw is None:
        return None
    anchor = author_raw.a
    if anchor is None:
        return None
    # .get returns None for a missing attribute instead of raising KeyError.
    return anchor.get('href')

def extract_helpfulness(container: 'BeautifulSoup') -> 'str | None':
    """Return the raw helpfulness text from one review element.

    Parameters
    ----------
    container
        Parsed element for a single review (anything exposing a
        BeautifulSoup-style ``find``). The annotation is quoted so defining
        this function does not require bs4 to be imported first.

    Returns
    -------
    The unprocessed text of the ``div`` with class ``actions text-muted``,
    or ``None`` when the element has no such ``div``.
    """
    helpfulness_raw = container.find('div', {'class': 'actions text-muted'})
    # Only the "element not found" case maps to None; unexpected errors
    # propagate instead of being hidden by a bare except.
    if helpfulness_raw is None:
        return None
    return helpfulness_raw.text

In [23]:
# Build a single review record by running every extract_* helper on one
# review element, using a placeholder movie id.
# NOTE(review): `some_rev` is not defined in any cell visible here —
# presumably a leftover from interactive testing; confirm or define it
# before running this cell.
review = {
    'movie_id': 'some_id',
    'text': extract_text(some_rev),
    'rating': extract_rating(some_rev),
    'date': extract_date(some_rev),
    'title': extract_title(some_rev),
    'author': extract_author(some_rev),
    'helpfulness': extract_helpfulness(some_rev)
}

In [25]:
from recsys.utils import load_obj

In [31]:
# Load the stored ACTION identifiers; the output below shows a list of
# '/title/tt.../' path strings.
load_obj('../data/raw/identifiers/ACTION__50')

['/title/tt0468569/',
 '/title/tt1375666/',
 '/title/tt0944947/',
 '/title/tt0133093/',
 '/title/tt0120737/',
 '/title/tt0167260/',
 '/title/tt1345836/',
 '/title/tt0167261/',
 '/title/tt0172495/',
 '/title/tt0372784/',
 '/title/tt0848228/',
 '/title/tt0076759/',
 '/title/tt0407887/',
 '/title/tt0080684/',
 '/title/tt0499549/',
 '/title/tt2015381/',
 '/title/tt0110413/',
 '/title/tt0434409/',
 '/title/tt0266697/',
 '/title/tt0325980/',
 '/title/tt0103064/',
 '/title/tt0086190/',
 '/title/tt0371746/',
 '/title/tt4154796/',
 '/title/tt1431045/',
 '/title/tt4154756/',
 '/title/tt1392190/',
 '/title/tt0082971/',
 '/title/tt0107290/',
 '/title/tt2488496/',
 '/title/tt1392170/',
 '/title/tt0095016/',
 '/title/tt0088247/',
 '/title/tt2395427/',
 '/title/tt1300854/',
 '/title/tt1454468/',
 '/title/tt0800369/',
 '/title/tt0458339/',
 '/title/tt1843866/',
 '/title/tt1228705/',
 '/title/tt0416449/',
 '/title/tt0120915/',
 '/title/tt1663202/',
 '/title/tt0121766/',
 '/title/tt0770828/',
 '/title/t

In [30]:
import os
# List the entries stored in the identifiers directory.
os.listdir('../data/raw/identifiers')

['.gitkeep',
 'ACTION__50',
 'ADVENTURE__50',
 'ANIMATION__50',
 'BIOGRAPHY__50',
 'COMEDY__50',
 'CRIME__50',
 'DOCUMENTARY__50',
 'DRAMA__50',
 'FAMILY__50',
 'FANTASY__50',
 'FILM_NOIR__50',
 'HISTORY__50',
 'HORROR__50',
 'MUSICAL__50',
 'MUSIC__50',
 'MYSTERY__50',
 'ROMANCE__50',
 'SCI_FI__50',
 'SHORT__50',
 'SPORT__50',
 'THRILLER__50',
 'WAR__50',
 'WESTERN__50']