In [3]:
import os
import json
import requests

In [8]:
MOVIELENS_HOST = os.environ.get("MOVIELENS_HOST", "localhost")
MOVIELENS_SCHEMA = os.environ.get("MOVIELENS_SCHEMA", "http")
MOVIELENS_PORT = os.environ.get("MOVIELENS_PORT", "5000")

MOVIELENS_USER = os.environ.get("MOVIELENS_USER", "airflow")
MOVIELENS_PASSWORD = os.environ.get("MOVIELENS_PASSWORD", "airflow")

In [9]:
def _get_session():
    session = requests.Session()
    session.auth = (MOVIELENS_USER, MOVIELENS_PASSWORD)  # Replace with your credentials

    base_url = f"{MOVIELENS_SCHEMA}://{MOVIELENS_HOST}:{MOVIELENS_PORT}"

    return session, base_url

In [10]:
session, base_url = _get_session()

In [11]:
session

<requests.sessions.Session at 0x7eb0521ff830>

In [12]:
base_url

'http://localhost:5000'

In [18]:
def _get_with_pagination(session, url, params, batch_size=100):
    
    offset = 0
    total = None
    while total is None or offset < total:
        response = session.get(url,
                               params={
                                   **params,
                                   **{"offset": offset, "limit": batch_size}
                               })
        response.raise_for_status()
        response_json = response.json()

        yield from response_json["result"]

        offset += batch_size
        total = response_json["total"]

In [19]:
a1 = _get_with_pagination(session, f"{base_url}/ratings", {}, 10)
a1

<generator object _get_with_pagination at 0x7eb04bffc940>

In [20]:
for x in a1:
    print(x)

{'movieId': 196997, 'rating': 4.0, 'timestamp': 1546301166, 'userId': 27667}
{'movieId': 1213, 'rating': 5.0, 'timestamp': 1546301290, 'userId': 27667}
{'movieId': 55820, 'rating': 4.0, 'timestamp': 1546301706, 'userId': 27667}
{'movieId': 2329, 'rating': 4.0, 'timestamp': 1546301788, 'userId': 27667}
{'movieId': 4878, 'rating': 4.0, 'timestamp': 1546301842, 'userId': 27667}
{'movieId': 589, 'rating': 4.0, 'timestamp': 1546302155, 'userId': 128817}
{'movieId': 119145, 'rating': 2.0, 'timestamp': 1546302225, 'userId': 27667}
{'movieId': 2716, 'rating': 4.0, 'timestamp': 1546302513, 'userId': 27667}
{'movieId': 165, 'rating': 4.0, 'timestamp': 1546302548, 'userId': 27667}
{'movieId': 57669, 'rating': 4.0, 'timestamp': 1546302649, 'userId': 27667}
{'movieId': 5418, 'rating': 4.0, 'timestamp': 1546302687, 'userId': 27667}
{'movieId': 377, 'rating': 4.5, 'timestamp': 1546302763, 'userId': 27667}
{'movieId': 1387, 'rating': 3.5, 'timestamp': 1546303369, 'userId': 27667}
{'movieId': 70286, 'r

KeyboardInterrupt: 

In [40]:
def _get_with_pagination_test1(session, url, params, batch_size=100):

    response = session.get(url,
                            params={
                                **params,
                                **{"offset": 10, "limit": batch_size}
                            })
    response.raise_for_status()
    response_json = response.json()

    return response_json["result"]

In [41]:
a2 = _get_with_pagination_test1(session, f"{base_url}/ratings", {}, 100)
a2

[{'movieId': 5418, 'rating': 4.0, 'timestamp': 1546302687, 'userId': 27667},
 {'movieId': 377, 'rating': 4.5, 'timestamp': 1546302763, 'userId': 27667},
 {'movieId': 1387, 'rating': 3.5, 'timestamp': 1546303369, 'userId': 27667},
 {'movieId': 70286, 'rating': 4.0, 'timestamp': 1546303445, 'userId': 27667},
 {'movieId': 56782, 'rating': 3.0, 'timestamp': 1546303870, 'userId': 27667},
 {'movieId': 116797, 'rating': 4.0, 'timestamp': 1546303982, 'userId': 27667},
 {'movieId': 114662, 'rating': 5.0, 'timestamp': 1546304182, 'userId': 123850},
 {'movieId': 4710, 'rating': 3.5, 'timestamp': 1546305412, 'userId': 101489},
 {'movieId': 122906, 'rating': 2.0, 'timestamp': 1546306524, 'userId': 119993},
 {'movieId': 7346, 'rating': 4.0, 'timestamp': 1546307797, 'userId': 43168},
 {'movieId': 54286, 'rating': 4.5, 'timestamp': 1546307807, 'userId': 43168},
 {'movieId': 168266, 'rating': 5.0, 'timestamp': 1546308239, 'userId': 27931},
 {'movieId': 2858, 'rating': 4.0, 'timestamp': 1546308316, 'use

In [43]:
len(a2)

100

In [45]:
def _get_with_pagination_test2(session, url, params, batch_size=100):

    response = session.get(url,
                            params={
                                **params,
                                **{"offset": 10, "limit": batch_size}
                            })
    response.raise_for_status()
    response_json = response.json()

    yield response_json["result"]

In [46]:
a3 = _get_with_pagination_test2(session, f"{base_url}/ratings", {}, 100)
a3

<generator object _get_with_pagination_test2 at 0x7eb0490f0260>

In [47]:
next(a3)

[{'movieId': 5418, 'rating': 4.0, 'timestamp': 1546302687, 'userId': 27667},
 {'movieId': 377, 'rating': 4.5, 'timestamp': 1546302763, 'userId': 27667},
 {'movieId': 1387, 'rating': 3.5, 'timestamp': 1546303369, 'userId': 27667},
 {'movieId': 70286, 'rating': 4.0, 'timestamp': 1546303445, 'userId': 27667},
 {'movieId': 56782, 'rating': 3.0, 'timestamp': 1546303870, 'userId': 27667},
 {'movieId': 116797, 'rating': 4.0, 'timestamp': 1546303982, 'userId': 27667},
 {'movieId': 114662, 'rating': 5.0, 'timestamp': 1546304182, 'userId': 123850},
 {'movieId': 4710, 'rating': 3.5, 'timestamp': 1546305412, 'userId': 101489},
 {'movieId': 122906, 'rating': 2.0, 'timestamp': 1546306524, 'userId': 119993},
 {'movieId': 7346, 'rating': 4.0, 'timestamp': 1546307797, 'userId': 43168},
 {'movieId': 54286, 'rating': 4.5, 'timestamp': 1546307807, 'userId': 43168},
 {'movieId': 168266, 'rating': 5.0, 'timestamp': 1546308239, 'userId': 27931},
 {'movieId': 2858, 'rating': 4.0, 'timestamp': 1546308316, 'use

In [50]:
def _get_with_pagination_test3(session, url, params, batch_size=100):

    response = session.get(url,
                            params={
                                **params,
                                **{"offset": 10, "limit": batch_size}
                            })
    response.raise_for_status()
    response_json = response.json()

    yield from response_json["result"]

In [51]:
a4 = _get_with_pagination_test3(session, f"{base_url}/ratings", {}, 100)
a4

<generator object _get_with_pagination_test3 at 0x7eb0490f0040>

In [61]:
next(a4)

{'movieId': 7346, 'rating': 4.0, 'timestamp': 1546307797, 'userId': 43168}

In [62]:
next(a4)

{'movieId': 54286, 'rating': 4.5, 'timestamp': 1546307807, 'userId': 43168}