In [1]:
import requests

response = requests.get('https://api.github.com/repos/pytorch/issues', # pytorch가 path parameter이므로 
                                                                       # 별도의 parameter 설정 생략 가능
                        headers={'Accept' : 'application/vnd.github.text-match+json'})
print("Response Code", response.status_code)
print("Number of comments", len(response.json()))

Response Code 403
Number of comments 2


In [2]:
# Pagination
response.links

{}

In [3]:
# 재귀함수
# 파이썬에서 기본적으로 최대 1,000회 Recursion(재귀) 반복
# 설정을 통해 횟수 상한 증가 가능
# import sys
# sts.setrecursionlimit(10**5) # 10의 5제곱으로 반복 횟수 증가

def get_all_pages(url, params = None, headers = None):
    output_json = []
    response = requests.get(url, params = params, headers = headers)
    if response.status_code == 200:
        output_json = response.json()
        if 'next' in response.links:
            next_url = response.links['next']['url']
            if next_url is not None:
                output_json += get_all_pages(next_url, params, headers)
    return output_json

In [4]:
import pandas as pd

out = get_all_pages(
    "https://api.github.com/repos/pytorch/pytorch/issues/comments",
    params={
        'since': '2022-01-01T10:00:01Z',
        'sorted': 'created',
        'direction': 'desc'
    },
    headers = {"Accept": "application/vnd.github+json"}
)

df = pd.DataFrame(out)
print(df['body'].count())
df[['id', 'created_at', 'body']].sample(1)

KeyError: 'body'

In [5]:
# 시간당 속도 제한
response = requests.head("https://api.github.com/repos/pytorch/pytorch/issues/comments")
print('X-Ratelimit_Limit', response.headers['X-Ratelimit-Limit'])
print('X-Ratelimit_Remaining', response.headers['X-Ratelimit-Remaining'])

# UTC 시간을 사람이 읽을 수 잇는 형식으로 변환
import datetime
print('Rate Limit reset at', datetime.datetime.fromtimestamp(int(response.headers['X-RateLimit-Reset'])).strftime('%c'))

X-Ratelimit_Limit 60
X-Ratelimit_Remaining 0
Rate Limit reset at Mon Feb 27 18:14:22 2023


In [6]:
# 분당 속도 제한

from datetime import datetime
import time

def handle_rate_limits(response):
    now = datetime.now()
    reset_time = datetime.fromtimestamp(int(response.headers['X-RateLimit-Reset']))
    remaining_requests = response.headers['X-Ratelimit-Remaining']
    remaining_time = (reset_time - now).total_seconds()
    intervals = remaining_time / (1.0 + int(remaining_requests))
    
    print("Sleeping for", intervals)
    time.sleep(intevals)
    return True

In [7]:
# 네트워크 오류를 감안한 코드 작성
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

retry_strategy = Retry(
    total = 5, # 재시도 횟수
    status_forcelist = [500, 503, 504], # 재시도할 상태코드 목록
    backoff_factor = 1 # 각 시도마다 간격을 늘려주는 수치
    # time_delay = backoff_factor * (2**(총 재시도 횟수 - 1))
)

retry_adapter = HTTPAdapter(max_retries=retry_strategy)

http = requests.Session()
http.mount("https://", retry_adapter)
http.mount("https://", retry_adapter)

response = http.get("https://api.github.com/search/repositories",
                    params={'q':'data_science+language:python'})

for item in response.json()['items'][:5]:
    print(item['name'])

data-science-from-scratch
PySyft
data-science-blogs
galaxy
DataCamp
