In [1]:
import os
import pandas as pd

# Directory the review CSV files are read from, one level above the
# working directory.
DATA_DIR = os.path.join(
    os.pardir, 'data', 'raw', 'reviews', '1pct_10pct'
)

In [2]:
# os.listdir() returns entries in arbitrary order, so sort the paths to make
# the row order of `reviews` reproducible across machines and re-runs.
# Entries that are not regular files (e.g. a `.ipynb_checkpoints` folder)
# are skipped because pd.read_csv cannot read them.
review_paths = sorted(
    path
    for path in (os.path.join(DATA_DIR, name) for name in os.listdir(DATA_DIR))
    if os.path.isfile(path)
)

# One DataFrame per file, read lazily and consumed once by pd.concat below.
genre_df = (pd.read_csv(path) for path in review_paths)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
reviews = pd.concat(genre_df, ignore_index=True)

In [3]:
# Preview the first five rows of the combined frame.
reviews.head(n=5)

Unnamed: 0,id,text,rating,date,title,author,helpfulness
0,/title/tt0468569/,Best movie ever. Heath ledger's work is phenom...,10.0,12 January 2021,Perfect combo\n,/user/ur95396995/?ref_=tt_urv,\n 171 out of 185 found thi...
1,/title/tt0468569/,Totally one of the greatest movie titles ever ...,10.0,9 January 2021,The Dark Knight\n,/user/ur109215140/?ref_=tt_urv,\n 144 out of 158 found thi...
2,/title/tt0468569/,This movie is a work of art. The finest sequel...,10.0,17 February 2021,This town deserves a better class of criminal!\n,/user/ur129557514/?ref_=tt_urv,\n 50 out of 54 found this ...
3,/title/tt0468569/,"Confidently directed, dark, brooding, and pack...",10.0,12 February 2020,The Dark Knight\n,/user/ur87850731/?ref_=tt_urv,\n 404 out of 471 found thi...
4,/title/tt0468569/,It is just what you want for the best movie. G...,10.0,7 October 2019,MASTERPIECE\n,/user/ur108519953/?ref_=tt_urv,\n 217 out of 251 found thi...


In [49]:
class Foo:
    """Callable that adds a stored value to whatever it is called with."""

    def __init__(self, x):
        """Remember ``x``; it becomes the right-hand operand of every call."""
        self.x = x

    def __call__(self, x):
        """Return ``x + self.x`` (the argument is the left operand)."""
        stored = self.x
        return x + stored

class Bar:
    """Callable that adds the literal ``'__bar'`` to its argument."""

    def __init__(self):
        """Nothing to set up; instances carry no state."""

    def __call__(self, x):
        """Return ``x + '__bar'``."""
        suffix = '__bar'
        return x + suffix


from copy import copy
from typing import Callable, Iterable, Tuple


class Pipeline:
    """Apply a fixed sequence of named callables to a piece of data.

    Each step is a ``(<step_name>, <step_func>)`` pair. ``compose`` runs the
    callables in the order the steps were given, passing each step's return
    value on to the next one.
    """

    def __init__(self, *steps: Tuple[str, Callable]):
        """Validate and store the steps.

        Args:
            *steps: ``(<step_name>, <step_func>)`` pairs, in execution order.

        Raises:
            AttributeError: if a step is empty or is not of length 2.
            TypeError: if a step has no length (e.g. a bare callable was
                passed instead of a pair) or if its second element is not
                callable.
        """
        self._pipeline = []
        for step_num, step in enumerate(steps, start=1):
            try:
                step_len = len(step)
            except TypeError:
                # Same exception type len() would raise, but say which step
                # is malformed and what shape is expected.
                raise TypeError(
                    f'Step #{step_num} must match a form'
                    f' (<step_name>, <step_class>),'
                    f' got {type(step).__name__}'
                ) from None
            if step_len == 0:
                raise AttributeError(f'Step #{step_num} is empty!')
            if step_len != 2:
                raise AttributeError(
                    'Each step must be of length 2'
                    ' and match a form (<step_name>, <step_class>)'
                )
            step_name, step_func = step[0], step[1]
            # Fail fast: compose() calls every step, so a non-callable would
            # otherwise only blow up later, in the middle of a run.
            if not callable(step_func):
                raise TypeError(
                    f'Step #{step_num} ({step_name!r}):'
                    f' {step_func!r} is not callable'
                )
            self._pipeline.append({'step_name': step_name,
                                   'step_func': step_func})

    @property
    def schema(self):
        """Print a numbered listing of the steps; the property value is None."""
        print('Pipeline schema:')
        for num, step in enumerate(self._pipeline, start=1):
            # Look the fields up by key instead of unpacking ``.values()``,
            # so the listing does not depend on dict ordering.
            name, func = step['step_name'], step['step_func']
            print(f'{num}. Name: {name:<5}',
                  f'Transformer: {func}', sep='\n   ')

    def compose(self, data):
        """Run ``data`` through every step in order and return the result.

        The steps receive a shallow copy (``copy.copy``) of ``data``, not
        ``data`` itself; nested objects are still shared with the caller.
        With no steps the shallow copy is returned unchanged.
        """
        result = copy(data)
        for step in self._pipeline:
            result = step['step_func'](result)
        return result

    def __call__(self):
        # NOTE(review): looks like a placeholder - it ignores the stored
        # steps and always returns 1. Presumably meant to delegate to
        # compose(); behaviour kept as is - confirm the intent.
        return 1

In [50]:
# Two-step demo pipeline: the 'foo' step runs first, then 'bar'.
pipeline = Pipeline(
    ('foo', Foo('__foo')),
    ('bar', Bar()),
)

In [51]:
# Feed a sample string through the pipeline; the cell output is the result.
pipeline.compose(data='string')

'string__foo__bar'