In [9]:
import os
import pandas as pd

DATA_DIR = os.path.join('..', 'data', 'raw', 'reviews', '1pct_10pct')

In [11]:
genre_df = (pd.read_csv(os.path.join(DATA_DIR, file))
            for file in os.listdir(DATA_DIR))

reviews = pd.concat(genre_df, ignore_index=True)

In [16]:
reviews.head()

Unnamed: 0,id,text,rating,date,title,author,helpfulness
0,/title/tt0468569/,Best movie ever. Heath ledger's work is phenom...,10.0,12 January 2021,Perfect combo\n,/user/ur95396995/?ref_=tt_urv,\n 171 out of 185 found thi...
1,/title/tt0468569/,Totally one of the greatest movie titles ever ...,10.0,9 January 2021,The Dark Knight\n,/user/ur109215140/?ref_=tt_urv,\n 144 out of 158 found thi...
2,/title/tt0468569/,This movie is a work of art. The finest sequel...,10.0,17 February 2021,This town deserves a better class of criminal!\n,/user/ur129557514/?ref_=tt_urv,\n 50 out of 54 found this ...
3,/title/tt0468569/,"Confidently directed, dark, brooding, and pack...",10.0,12 February 2020,The Dark Knight\n,/user/ur87850731/?ref_=tt_urv,\n 404 out of 471 found thi...
4,/title/tt0468569/,It is just what you want for the best movie. G...,10.0,7 October 2019,MASTERPIECE\n,/user/ur108519953/?ref_=tt_urv,\n 217 out of 251 found thi...


In [18]:
def split_helpfulness_col(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Split 'helpfulness' column of input dataframe into two
    distinct columns: 'upvotes' and 'total_votes'.

    Args:
        df_raw (pd.DataFrame): input data with column 

    Returns:
        pd.DataFrame: Output data frame 
    """
    df_ = df_raw.copy(deep=False)
    df_[['upvotes', 'total_votes']] = (
        reviews['helpfulness']
        .str.replace(',', '')
        .str.extractall('(\d+)')
        .unstack('match')
        .values
    )
    return df_.drop(columns=['helpfulness'])
    

156531

In [46]:
result = (
    reviews['helpfulness']
    .str.replace(',', '')
    .str.extractall('(\d+)')
    .unstack('match')
)
result

Unnamed: 0_level_0,0,0
match,0,1
0,171,185
1,144,158
2,50,54
3,404,471
4,217,251
...,...,...
604280,8,12
604281,12,20
604282,4,5
604283,4,5


In [52]:
data = [
    {'id': 1, 'helpfulness': '10 out of 12'},
    {'id': 2, 'helpfulness': '100 out of 101'}
]
pd.DataFrame(data)

Unnamed: 0,id,helpfulness
0,1,10 out of 12
1,2,100 out of 101


In [54]:
reviews['helpfulness'][0]

'\n                    171 out of 185 found this helpful.\n                        \n                            Was this review helpful?  Sign in to vote.\n                        \n\nPermalink\n'

In [62]:
reviews.loc[0, ['id', 'rating']]

id        /title/tt0468569/
rating                 10.0
Name: 0, dtype: object

In [67]:
dates = pd.to_datetime(reviews['date'])
dates

0        2021-01-12
1        2021-01-09
2        2021-02-17
3        2020-02-12
4        2019-10-07
            ...    
604280   2010-02-04
604281   2000-03-06
604282   2020-03-08
604283   2019-09-24
604284   2011-01-09
Name: date, Length: 604285, dtype: datetime64[ns]

In [70]:
dates[0].date()

datetime.date(2021, 1, 12)

In [71]:
from datetime import datetime

In [77]:
datetime(2021, 1, 12).date() == dates[0].date()

True

In [74]:
dates[0].date()

datetime.date(2021, 1, 12)

In [76]:
datetime(2021, 1, 12).date()

datetime.date(2021, 1, 12)

In [78]:
from recsys.utils import dump_obj

In [79]:
import requests

In [86]:
s = requests.get('https://www.imdb.com/title/tt7991608/?ref_=adv_li_tt')

In [87]:
from bs4 import BeautifulSoup

In [89]:
soup = BeautifulSoup(s.text, 'lxml')

In [95]:
type(soup)

bs4.BeautifulSoup

In [150]:
import re
from typing import Optional, Dict, Any
from recsys.utils import send_request

BASE_URL = 'https://www.imdb.com{}'

def collect_original_title(soup: BeautifulSoup) -> Optional[str]:
    try:
        return (
            soup
            .find('div', {'data-testid': 'hero-title-block__original-title'})
            .text
        )
    except Exception:
        return None


def collect_poster(soup: BeautifulSoup) -> Optional[bytes]:
    try:
        img_id = (
            soup
            .find('a', {'aria-label': 'View {Title} Poster'})
            .get('href')
            .split('?')[0]
        )
        inter_response = send_request(BASE_URL.format(img_id))
        img_download_link = (
            BeautifulSoup(inter_response.text, 'lxml')
            .find('img')['src']
        )
        response = send_request(img_download_link)
        return response.content
    except Exception:
        return None


def collect_review_content(soup: BeautifulSoup)\
        -> Optional[Dict[str, Any]]:
    review_content_raw = (
        soup
        .find('ul', {'data-testid': 'reviewContent-all-reviews'})
        .find_all('span', {'class', 'three-Elements'})
    )

    try:
        user_reviews = review_content_raw[0].text
    except Exception:
        user_reviews = None

    try:
        critic_reviews = review_content_raw[1].text
    except Exception:
        critic_reviews = None

    try:
        metascore = review_content_raw[2].text
    except Exception:
        metascore = None

    return {
        'n_user_reviews': user_reviews,
        'n_critic_reviews': critic_reviews,
        'metascore': metascore,
    }

def collect_aggregate_rating(soup: BeautifulSoup) -> Optional[str]:
    try:
        regexp = re.compile('AggregateRatingButton')
        return (
            soup
            .find('div', {'class', regexp})
            .text
        )
    except Exception:
        return None


details = {
    'original_title': collect_original_title,
    'poster': collect_poster,
    'review_content': collect_review_content
}

In [194]:
from recsys.imdb_parser.details import DetailsCollector

In [101]:
from PIL import Image
import io

image = Image.open(io.BytesIO(img.content))
image.show()

In [208]:
soup.find('li', {'data-testid': 'storyline-certificate'}).text

'Certificate12+'

In [217]:
sections = [
    'Release date',
    'Country of origin',
    'Official site',
    'Languages',
    'Also known as',
    'Filming locations',
    'Production companies'
]
details = soup.find('div', {'data-testid': 'title-details-section'}).text
details.split(sections[1])

['Release dateNovember 12, 2021 (Russia)',
 'United StatesOfficial siteOfficial NetflixLanguagesEnglishRussianItalianSpanishIndonesianAlso known asRed NoticeFilming locationsRome, ItalyProduction companiesNetflixSeven Bucks ProductionsFlynn Picture CompanySee more company credits at IMDbPro']

In [218]:
from typing import List

sections = [
    'Release date',
    'Country of origin',
    'Official site',
    'Languages',
    'Also known as',
    'Filming locations',
    'Production companies'
]

def extract_substrings_between_sections(details_str: str, sections: List[str])\
    -> Optional[Dict[str, str]]:
    details = {}
    for section_num in range(len(sections) - 1):
        start = sections[section_num]
        end = sections[section_num + 1]
        left_loc = details_str.find(start)
        right_loc = details_str.rfind(end)
        details[start] = details_str[left_loc + len(start): right_loc]
    return details

{'Release date': 'November 12, 2021 (Russia)',
 'Country of origin': 'United States',
 'Official site': 'Official Netflix',
 'Languages': 'EnglishRussianItalianSpanishIndonesian',
 'Also known as': 'Red Notice',
 'Filming locations': 'Rome, Italy'}

In [248]:
def extract_substrings_after_anchors(details_str: str, anchors: List[str])\
    -> Optional[Dict[str, str]]:
    details = {}
    empty_anchords = [anchor for anchor in anchors if anchor not in details_str]
    use_anchors = [anchor for anchor in anchors if anchor not in empty_anchords]
    for section_num in range(len(use_anchors)):
        start = use_anchors[section_num]
        left_loc = details_str.find(start)
        if section_num != len(use_anchors) - 1:
            end = use_anchors[section_num + 1]
            right_loc = details_str.rfind(end)
            details[start] = details_str[left_loc + len(start): right_loc]
        else:
            details[start] = details_str[left_loc + len(start):]
    details.update(**dict.fromkeys(empty_anchords))
    return details


def test_extract_substrings_after_anchors():
    s = 'anchor1 some string anchor2 another string'
    anchors = ['anchor1', 'anchor2', 'anchor3', 'anchor4']
    substrings = extract_substrings_after_anchors(s, anchors)
    print(substrings)
    # assert all(anchor in substrings.keys() for anchor in anchors)


test_extract_substrings_after_anchors()

{'anchor1': ' some string ', 'anchor2': ' another string'}
{'anchor1': ' some string ', 'anchor2': ' another string', 'anchor3': None, 'anchor4': None}
{'anchor1': ' some string ', 'anchor2': ' another string', 'anchor3': None, 'anchor4': None}
