In [None]:
import pandas as pd
import sys
import requests
import yaml

from tqdm.notebook import tqdm, tnrange
from datetime import datetime

# Import utility functions
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
from main_utils import *


### Functions

In [None]:
def clean_date(date_str: str) -> datetime:
    """Parse a timestamp string into a timezone-aware ``datetime``.

    Every literal ``Z`` in the input is rewritten to ``+00:00`` before
    parsing, so a string such as ``2023-12-31T12:34:56Z`` is returned with a
    zero UTC offset.

    Args:
        date_str: Timestamp laid out as ``YYYY-MM-DDTHH:MM:SS`` followed by a
            UTC offset (or ``Z``).

    Returns:
        The parsed ``datetime`` carrying the offset found in the string.

    Raises:
        ValueError: If the string does not match the expected layout.
    """
    timestamp_layout = "%Y-%m-%dT%H:%M:%S%z"
    normalized = date_str.replace('Z', '+00:00')
    return datetime.strptime(normalized, timestamp_layout)

def query_url(
    page: int,
    API_KEY: str,
    query: str = 'Crude AND Oil',
    from_date: str = '2003-12-31',
    to_date: str = '2023-12-31',
    page_size: int = 200,
) -> str:
    """Build the Guardian content-search URL for one page of results.

    With the default arguments the returned URL is identical to the
    previously hard-coded one (query ``Crude AND Oil``, 2003-12-31 to
    2023-12-31, 200 results per page, newest first).

    Args:
        page: 1-based page number to request.
        API_KEY: Guardian API key. Surrounding whitespace (e.g. the trailing
            newline left by reading the key from a file) is stripped.
        query: Search expression sent as the ``q`` parameter.
        from_date: Value of the ``from-date`` parameter (``YYYY-MM-DD``).
        to_date: Value of the ``to-date`` parameter (``YYYY-MM-DD``).
        page_size: Value of the ``page-size`` parameter.

    Returns:
        The fully assembled, percent-encoded request URL.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from urllib.parse import quote, urlencode

    # Insertion order is preserved, so the parameter order matches the
    # original hand-written URL.
    params = {
        'from-date': from_date,
        'to-date': to_date,
        'order-by': 'newest',
        'page': page,
        'page-size': page_size,
        'q': query,
        'api-key': API_KEY.strip(),
    }
    # quote_via=quote encodes spaces as %20 (the default quote_plus would emit '+').
    return 'https://content.guardianapis.com/search?' + urlencode(params, quote_via=quote)


### Data retrieval - The Guardian API

In [None]:
# Retrieve the API key from file. strip() removes surrounding whitespace such
# as a trailing newline, which would otherwise end up inside the request URL.
with open('API_keys/TG_API_KEY.txt', 'r') as file:
    API_KEY = file.read().strip()

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the cell indefinitely.
REQUEST_TIMEOUT = 30

archive = []

# The first page reports how many pages the query spans, so a failure here is
# fatal: without it the remaining pages cannot be enumerated.
first_page = requests.get(query_url(1, API_KEY), timeout=REQUEST_TIMEOUT).json()
first_body = first_page.get('response', {})
if first_body.get('status') != 'ok':
    # Error payloads may carry a 'message' at either level — surface it if present.
    reason = first_body.get('message') or first_page.get('message') or 'no message in payload'
    raise RuntimeError(f'Request for page 1 failed: {reason}')

pages = first_body['pages']
archive.extend(first_body['results'])

# Pages that could not be fetched; reported after the loop so gaps are visible.
failed_pages = []

for page in tnrange(2, pages + 1):
    URL = query_url(page, API_KEY)

    try:
        response = requests.get(URL, timeout=REQUEST_TIMEOUT).json()
    except (requests.RequestException, ValueError) as exc:
        # Report only the exception type: the exception text can contain the
        # request URL, which includes the API key.
        print(f'Error at page {page} ({type(exc).__name__})')
        failed_pages.append(page)
        continue

    # Use .get() so an error payload without a 'response' object is reported
    # like any other failed page instead of raising KeyError.
    if response.get('response', {}).get('status') != 'ok':
        print(f'Error at page {page}')
        failed_pages.append(page)
        continue

    article_list = response['response']['results']

    archive.extend(article_list)

if failed_pages:
    print(f'{len(failed_pages)} page(s) could not be retrieved: {failed_pages}')

# Build the article table in a single chain. Only the four columns selected at
# the end are kept, so the unused API fields are discarded by that selection
# instead of an explicit drop-by-name (which raises KeyError as soon as one of
# the listed fields is missing from the results).
df = (
    pd.DataFrame(archive)
    # add datetime column parsed from the publication timestamp
    .assign(datetime=lambda frame: frame['webPublicationDate'].apply(clean_date))
    .rename(columns={'id': 'article_id', 'webTitle': 'headline', 'webUrl': 'web_url'})
    .loc[:, ['article_id', 'headline', 'datetime', 'web_url']]
)


### Save dataset to folder

In [None]:
# Name of the CSV file to write (edit to save under a different name):
file_name = 'TG_CrudeANDOil.csv'
# Relative path of the folder the file is saved into:
relative_path = 'data/news'

# save_path is not defined in this notebook — presumably provided by the
# main_utils star import and returning the full destination path; verify there.
output_path = save_path(relative_path, file_name)

# Write the article table without the DataFrame index column.
df.to_csv(output_path, index=False)