In [1]:
import os
import pandas as pd

# Directory with the raw review files, expressed relative to the notebook's
# working directory and assembled with os.path.join so the separator matches
# the host platform.
DATA_DIR = os.path.join(
    '..', 'data', 'raw', 'reviews', '1pct_10pct',
)

In [2]:
# Lazily read every file in DATA_DIR, one DataFrame per file.
# os.listdir() returns names in arbitrary order and also lists hidden entries
# (e.g. '.ipynb_checkpoints', '.DS_Store'), so the names are sorted to make the
# row order of `reviews` reproducible and dot-entries are skipped so they are
# never handed to read_csv.
genre_df = (
    pd.read_csv(os.path.join(DATA_DIR, file))
    for file in sorted(os.listdir(DATA_DIR))
    if not file.startswith('.')
)

# Stack the per-file frames into one frame with a fresh 0..n-1 index.
reviews = pd.concat(genre_df, ignore_index=True)

In [3]:
# Preview the first rows of the concatenated reviews frame.
reviews.head()

Unnamed: 0,id,text,rating,date,title,author,helpfulness
0,/title/tt0468569/,Best movie ever. Heath ledger's work is phenom...,10.0,12 January 2021,Perfect combo\n,/user/ur95396995/?ref_=tt_urv,\n 171 out of 185 found thi...
1,/title/tt0468569/,Totally one of the greatest movie titles ever ...,10.0,9 January 2021,The Dark Knight\n,/user/ur109215140/?ref_=tt_urv,\n 144 out of 158 found thi...
2,/title/tt0468569/,This movie is a work of art. The finest sequel...,10.0,17 February 2021,This town deserves a better class of criminal!\n,/user/ur129557514/?ref_=tt_urv,\n 50 out of 54 found this ...
3,/title/tt0468569/,"Confidently directed, dark, brooding, and pack...",10.0,12 February 2020,The Dark Knight\n,/user/ur87850731/?ref_=tt_urv,\n 404 out of 471 found thi...
4,/title/tt0468569/,It is just what you want for the best movie. G...,10.0,7 October 2019,MASTERPIECE\n,/user/ur108519953/?ref_=tt_urv,\n 217 out of 251 found thi...


In [49]:
class Foo:
    """Callable that adds a stored right-hand operand to its argument."""

    def __init__(self, x):
        """Remember ``x``; it is added to every value passed to the instance."""
        self.x = x

    def __call__(self, x):
        """Return ``x + self.x`` (the argument is the left operand)."""
        stored = self.x
        return x + stored

class Bar:
    """Callable that appends the fixed suffix ``'__bar'`` to its argument."""

    def __init__(self):
        """Nothing to configure; the instance holds no state."""

    def __call__(self, x):
        """Return ``x + '__bar'``."""
        suffix = '__bar'
        return x + suffix


from copy import copy
from typing import Iterable, Callable


class Pipeline:
    """
    Ordered chain of named, single-argument callables.

    Every positional argument is one step of the form
    ``(<step_name>, <step_callable>)``. ``compose`` (or calling the pipeline
    with a value) passes the value through each step in order, handing each
    step's return value to the next one. Because a pipeline is itself
    callable with one argument, it can be used as a step of another pipeline.
    """

    # Sentinel that tells "called without data" apart from any real value,
    # including None.
    _NO_DATA = object()

    def __init__(self, *steps: Iterable):
        """
        Validate the steps and store them in call order.

        Args:
            *steps: two-item sequences ``(<step_name>, <step_callable>)``.

        Raises:
            AttributeError: if a step is empty or does not have exactly
                two items.
        """
        self._pipeline = []
        for step_num, step in enumerate(steps, start=1):
            if len(step) == 0:
                raise AttributeError(f'Step #{step_num} is empty!')
            if len(step) != 2:
                raise AttributeError(
                    'Each step must be of length 2'
                    ' and match a form (<step_name>, <step_class>)'
                )
            self._pipeline.append({'step_name': step[0],
                                   'step_func': step[1]})

    @property
    def schema(self):
        """
        Print a numbered listing of the steps to stdout.

        The property only prints; its value is None.
        """
        print('Pipeline schema:')
        for num, step in enumerate(self._pipeline, start=1):
            # Read the fields by key instead of unpacking step.values(),
            # which silently relies on the dict's ordering.
            name = step['step_name']
            func = step['step_func']
            print(f'{num}. Name: {name:<5}',
                  f'Transformer: {func}', sep='\n   ')

    def compose(self, data):
        """
        Run ``data`` through every step and return the final result.

        A shallow copy of ``data`` is taken first, so the first step receives
        the copy rather than the caller's own top-level object.
        """
        result = copy(data)
        for step in self._pipeline:
            result = step['step_func'](result)
        return result

    def __call__(self, data=_NO_DATA):
        """
        Apply the pipeline to ``data``; equivalent to ``compose(data)``.

        Called with no argument, the historical placeholder behaviour is
        kept and ``1`` is returned, so existing zero-argument calls still work.
        """
        if data is self._NO_DATA:
            return 1
        return self.compose(data)

In [50]:
# Demo pipeline: two named steps, applied in the order they are listed.
pipeline = Pipeline(
    ('foo', Foo('__foo')),
    ('bar', Bar()),
)

In [51]:
# Run the demo pipeline on a sample string; each step appends its suffix in turn.
pipeline.compose('string')

'string__foo__bar'

In [52]:
# Sample slash-delimited string with a trailing slash, echoed for inspection.
s = 'abc/cba/123/'
s

'abc/cba/123/'

In [53]:
# Split on '/'; a trailing slash in the string yields an empty final element.
s.split('/')

['abc', 'cba', '123', '']

In [2]:
# Scratch cell: bind an integer and echo it.
a = 123
a

123

In [5]:
# Because the string ends with '/', the last split element is empty, so
# index -2 picks the final non-empty segment.
s = 'abc/cba/123/'
s.split('/')[-2]

'123'

In [2]:
import pandas as pd
# Read the action details CSV (path is relative to the notebook's working
# directory) and preview its first rows.
data_path = '../data/raw/details/1pct/action.csv'
action = pd.read_csv(data_path)
action.head()

Unnamed: 0,original_title,review_summary,agg_rating,actors,imdb_recommendations,storyline,tagline,certificate,details,boxoffice,techspecs
0,Original title: The Dark Knight,"{'n_user_reviews': '7.7KUser reviews', 'n_crit...",9.0/102.5M,{'Christian Bale': '/name/nm0000288?ref_=tt_cl...,"['/title/tt1345836/?ref_=tt_sims_tt_t_1', '/ti...",Set within a year after the events of Batman B...,TaglinesWhy So Serious?,Certificate14+,"Release dateAugust 14, 2008 (Russia)Countries ...","Budget$185,000,000 (estimated)Gross US & Canad...",Runtime2 hours 32 minutesSound mixDolby Digita...
1,Original title: Inception,"{'n_user_reviews': '4.4KUser reviews', 'n_crit...",8.8/102.2M,{'Leonardo DiCaprio': '/name/nm0000138?ref_=tt...,"['/title/tt0816692/?ref_=tt_sims_tt_t_1', '/ti...","Dom Cobb is a skilled thief, the absolute best...",TaglinesYour mind is the scene of the crime,Certificate12+,"Release dateJuly 22, 2010 (Russia)Countries of...","Budget$160,000,000 (estimated)Gross US & Canad...",Runtime2 hours 28 minutesColorColorSound mixDo...
2,Original title: The Matrix,"{'n_user_reviews': '4.6KUser reviews', 'n_crit...",8.7/101.8M,{'Keanu Reeves': '/name/nm0000206?ref_=tt_cl_t...,"['/title/tt1375666/?ref_=tt_sims_tt_t_1', '/ti...",Thomas A. Anderson is a man living two lives. ...,TaglinesFree your mind,Certificate16+,"Release dateOctober 14, 1999 (Russia)Countries...","Budget$63,000,000 (estimated)Gross US & Canada...",Runtime2 hours 16 minutesColorColorSound mixDo...
3,Original title: The Lord of the Rings: The Fel...,"{'n_user_reviews': '5.5KUser reviews', 'n_crit...",8.8/101.7M,{'Elijah Wood': '/name/nm0000704?ref_=tt_cl_t_...,"['/title/tt0167261/?ref_=tt_sims_tt_t_1', '/ti...",An ancient Ring thought lost for centuries has...,TaglinesThe Legend Comes to Life,Certificate12+,"Release dateMarch 1, 2002 (Russia)Countries of...","Budget$93,000,000 (estimated)Gross US & Canada...",Runtime2 hours 58 minutesColorColorSound mixDT...
4,Original title: The Lord of the Rings: The Ret...,"{'n_user_reviews': '3.9KUser reviews', 'n_crit...",8.9/101.7M,{'Elijah Wood': '/name/nm0000704?ref_=tt_cl_t_...,"['/title/tt0167261/?ref_=tt_sims_tt_t_1', '/ti...",The final confrontation between the forces of ...,TaglinesThe eye of the enemy is moving.,Certificate12+,"Release dateJanuary 22, 2004 (Russia)Countries...","Budget$94,000,000 (estimated)Gross US & Canada...",Runtime3 hours 21 minutesColorColorSound mixDT...


In [20]:
import numpy as np


def expand_short_form(string_num: str) -> float:
    """
    Convert a number written with an optional magnitude suffix to a float.

    Recognised suffixes (case-insensitive): ``K`` (thousand), ``M`` (million),
    ``B`` (billion), ``T`` (trillion). Surrounding whitespace is ignored.
    A string without a recognised suffix is parsed with ``float`` as is.

    Examples: ``'2.5M' -> 2500000.0``, ``'168K' -> 168000.0``,
    ``'448' -> 448.0``.

    Raises:
        ValueError: if the numeric part cannot be parsed by ``float``
            (this includes an empty string and a bare suffix such as 'K').
    """
    short_forms = {
        'K': 1_000,
        'M': 1_000_000,
        'B': 1_000_000_000,
        'T': 1_000_000_000_000
    }
    text = string_num.strip()
    # Slicing (not indexing) keeps an empty string from raising IndexError;
    # it falls through to float(''), which raises ValueError instead.
    suffix = text[-1:].upper()
    if suffix not in short_forms:
        return float(text)
    return float(text[:-1]) * short_forms[suffix]


def split_aggregate_rating_col(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Split the string column 'agg_rating' into two numeric columns.

    Each 'agg_rating' value is split on the literal '/10': the text before it
    becomes 'rating' (float32) and the text after it, expanded with
    ``expand_short_form``, becomes 'total_votes' (int32). The two new columns
    are appended and 'agg_rating' is removed. ``df_raw`` itself is not given
    the new columns; a new frame is returned.

    NOTE: 'total_votes' is stored as int32, so counts above the int32 range
    (about 2.1 billion) do not fit that dtype.

    Raises:
        ValueError: if ``df_raw`` has no 'agg_rating' column.
    """
    if 'agg_rating' not in df_raw.columns:
        raise ValueError('No "agg_rating" column in input data')

    # Shallow copy: new columns are attached to the copy, not to df_raw.
    df_ = df_raw.copy(deep=False)
    df_[['rating', 'total_votes']] = (
        df_['agg_rating']
        .str.split('/10', expand=True)
        .values
    )
    # expand_short_form multiplies floats, so a count can come out a hair
    # below the intended whole number; the int cast below truncates, so round
    # to the nearest whole vote first.
    df_['total_votes'] = (
        df_['total_votes']
        .apply(expand_short_form)
        .round()
    )
    return (
        df_
        .astype({'rating': np.float32, 'total_votes': np.int32})
        .drop('agg_rating', axis=1)
    )

# Apply the splitter to the `action` frame and display the resulting frame.
split_aggregate_rating_col(action)

Unnamed: 0,original_title,review_summary,actors,imdb_recommendations,storyline,tagline,certificate,details,boxoffice,techspecs,rating,total_votes
0,Original title: The Dark Knight,"{'n_user_reviews': '7.7KUser reviews', 'n_crit...",{'Christian Bale': '/name/nm0000288?ref_=tt_cl...,"['/title/tt1345836/?ref_=tt_sims_tt_t_1', '/ti...",Set within a year after the events of Batman B...,TaglinesWhy So Serious?,Certificate14+,"Release dateAugust 14, 2008 (Russia)Countries ...","Budget$185,000,000 (estimated)Gross US & Canad...",Runtime2 hours 32 minutesSound mixDolby Digita...,9.0,2500000
1,Original title: Inception,"{'n_user_reviews': '4.4KUser reviews', 'n_crit...",{'Leonardo DiCaprio': '/name/nm0000138?ref_=tt...,"['/title/tt0816692/?ref_=tt_sims_tt_t_1', '/ti...","Dom Cobb is a skilled thief, the absolute best...",TaglinesYour mind is the scene of the crime,Certificate12+,"Release dateJuly 22, 2010 (Russia)Countries of...","Budget$160,000,000 (estimated)Gross US & Canad...",Runtime2 hours 28 minutesColorColorSound mixDo...,8.8,2200000
2,Original title: The Matrix,"{'n_user_reviews': '4.6KUser reviews', 'n_crit...",{'Keanu Reeves': '/name/nm0000206?ref_=tt_cl_t...,"['/title/tt1375666/?ref_=tt_sims_tt_t_1', '/ti...",Thomas A. Anderson is a man living two lives. ...,TaglinesFree your mind,Certificate16+,"Release dateOctober 14, 1999 (Russia)Countries...","Budget$63,000,000 (estimated)Gross US & Canada...",Runtime2 hours 16 minutesColorColorSound mixDo...,8.7,1800000
3,Original title: The Lord of the Rings: The Fel...,"{'n_user_reviews': '5.5KUser reviews', 'n_crit...",{'Elijah Wood': '/name/nm0000704?ref_=tt_cl_t_...,"['/title/tt0167261/?ref_=tt_sims_tt_t_1', '/ti...",An ancient Ring thought lost for centuries has...,TaglinesThe Legend Comes to Life,Certificate12+,"Release dateMarch 1, 2002 (Russia)Countries of...","Budget$93,000,000 (estimated)Gross US & Canada...",Runtime2 hours 58 minutesColorColorSound mixDT...,8.8,1700000
4,Original title: The Lord of the Rings: The Ret...,"{'n_user_reviews': '3.9KUser reviews', 'n_crit...",{'Elijah Wood': '/name/nm0000704?ref_=tt_cl_t_...,"['/title/tt0167261/?ref_=tt_sims_tt_t_1', '/ti...",The final confrontation between the forces of ...,TaglinesThe eye of the enemy is moving.,Certificate12+,"Release dateJanuary 22, 2004 (Russia)Countries...","Budget$94,000,000 (estimated)Gross US & Canada...",Runtime3 hours 21 minutesColorColorSound mixDT...,8.9,1700000
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Original title: Godzilla: King of the Monsters,"{'n_user_reviews': '2.3KUser reviews', 'n_crit...",{'Kyle Chandler': '/name/nm0151419?ref_=tt_cl_...,"['/title/tt0831387/?ref_=tt_sims_tt_t_1', '/ti...",The new story follows the heroic efforts of th...,TaglinesLet them fight!,Certificate16+,"Release dateMay 30, 2019 (Russia)Countries of ...","Budget$170,000,000 (estimated)Gross US & Canad...",Runtime2 hours 12 minutesColorColorSound mixDo...,6.0,170000
496,Original title: Inferno,"{'n_user_reviews': '448User reviews', 'n_criti...",{'Tom Hanks': '/name/nm0000158?ref_=tt_cl_t_1'...,"['/title/tt0808151/?ref_=tt_sims_tt_t_1', '/ti...",Famous symbologist on a trail of clues tied to...,TaglinesEvery clue will take him deeper,Motion Picture Rating (MPAA)Rated PG-13 for se...,"Release dateOctober 13, 2016 (Russia)Countries...","Budget$75,000,000 (estimated)Gross US & Canada...",Runtime2 hours 1 minuteColorColorSound mixDolb...,6.2,169000
497,Original title: Superman,"{'n_user_reviews': '609User reviews', 'n_criti...",{'Christopher Reeve': '/name/nm0001659?ref_=tt...,"['/title/tt0081573/?ref_=tt_sims_tt_t_1', '/ti...",Just before the destruction of the planet Kryp...,TaglinesYou'll believe a man can fly.,Certificate0+,"Release dateDecember 14, 1978 (United Kingdom)...","Budget$55,000,000 (estimated)Gross US & Canada...",Runtime2 hours 23 minutesColorColorSound mixDo...,7.3,168000
498,Original title: Demolition Man,"{'n_user_reviews': '309User reviews', 'n_criti...",{'Sylvester Stallone': '/name/nm0000230?ref_=t...,"['/title/tt0098439/?ref_=tt_sims_tt_t_1', '/ti...","Frozen in 1996, Simon Phoenix, a convicted cri...",TaglinesThe future isn't big enough for the bo...,Certificate16+,"Release dateOctober 8, 1993 (United States)Cou...","Budget$57,000,000 (estimated)Gross US & Canada...",Runtime1 hour 55 minutesColorColorAspect ratio...,6.7,168000


In [16]:
# Row-wise variant of the split: for every row, pair the rating text (left of
# '/10') with the expanded vote count (right of '/10').
(
    action['agg_rating']
    .str.split('/10', expand=True)
    .apply(
        lambda row: (row[0], expand_short_form(row[1])),
        axis=1,
    )
)

0      (9.0, 2500000.0)
1      (8.8, 2200000.0)
2      (8.7, 1800000.0)
3      (8.8, 1700000.0)
4      (8.9, 1700000.0)
             ...       
495     (6.0, 170000.0)
496     (6.2, 169000.0)
497     (7.3, 168000.0)
498     (6.7, 168000.0)
499     (6.6, 168000.0)
Length: 500, dtype: object