In [30]:
import json
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

def clean_date(date_str: str) -> datetime:
    """Parse a timestamp string into a timezone-aware ``datetime``.

    Every ``'Z'`` in the input is rewritten to an explicit ``'+00:00'``
    offset, then the text is parsed with the fixed layout
    ``%Y-%m-%dT%H:%M:%S%z``.

    Args:
        date_str: Timestamp text such as ``'2023-12-31T10:15:30Z'``.

    Returns:
        The parsed, offset-aware ``datetime``.

    Raises:
        ValueError: If the text does not match the expected layout.
    """
    with_explicit_offset = date_str.replace('Z', '+00:00')
    return datetime.strptime(with_explicit_offset, "%Y-%m-%dT%H:%M:%S%z")

def query_url(page: int, API_KEY: str) -> str:
    """Build the Guardian content-search URL for one page of results.

    The query itself is fixed: articles matching ``Crude AND Oil`` published
    between 2003-12-31 and 2023-12-31, newest first, 200 results per page.

    Args:
        page: Page number to request.
        API_KEY: Key inserted verbatim as the ``api-key`` query parameter.

    Returns:
        The complete request URL as a string.
    """
    # Adjacent literals are concatenated at compile time into a single string.
    return (
        'https://content.guardianapis.com/search'
        '?from-date=2003-12-31&to-date=2023-12-31'
        '&order-by=newest'
        f'&page={page}'
        '&page-size=200'
        '&q=Crude%20AND%20Oil'
        f'&api-key={API_KEY}'
    )

# Retrieve the API key from a local file so it is not hardcoded in the notebook.
with open('API_KEY.txt', 'r') as file:
    # strip(): a trailing newline or stray whitespace in the file would
    # otherwise be embedded in the `api-key` query parameter of every request.
    API_KEY = file.read().strip()


## Data retrieval - The Guardian API

In [35]:
# Download every result page of the search and collect the raw article
# records in `archive`, then load them into a DataFrame.

# Seconds to wait on each HTTP request before giving up, so that one stalled
# connection cannot hang the whole download.
REQUEST_TIMEOUT = 30

archive = []
failed_pages = []  # page numbers that were skipped and may need a retry

# The first page is mandatory: it reports how many pages the query spans.
first_reply = requests.get(query_url(1, API_KEY), timeout=REQUEST_TIMEOUT)
first_page = first_reply.json()
if first_page.get('response', {}).get('status') != 'ok':
    # Report only the HTTP status code: the request URL embeds the API key.
    raise RuntimeError(
        f'Guardian API request for page 1 failed (HTTP {first_reply.status_code})'
    )
pages = first_page['response']['pages']
archive.extend(first_page['response']['results'])

for page in tqdm(range(2, pages + 1)):
    URL = query_url(page, API_KEY)
    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT).json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout or a non-JSON body: skip the page like any
        # other failed page. The exception text is deliberately not printed,
        # because it can include the request URL and therefore the API key.
        print(f'Error at page {page}')
        failed_pages.append(page)
        continue

    # NOTE(review): an error payload may lack the 'response' envelope
    # (e.g. a gateway-level rate-limit message) - confirm against the API docs.
    # .get() keeps such a payload from aborting the run with a KeyError.
    if response.get('response', {}).get('status') != 'ok':
        print(f'Error at page {page}')
        failed_pages.append(page)
        continue

    article_list = response['response']['results']
    archive.extend(article_list)

df = pd.DataFrame(archive)

# add datetime column
df['datetime'] = df['webPublicationDate'].apply(clean_date)


100%|██████████| 35/35 [00:29<00:00,  1.21it/s]


### Save dataset to folder

In [36]:
# Write the collected articles to disk. The target directory is created first
# because `to_csv` raises an OSError when the parent folder does not exist.
output_path = Path('data/complete_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)