In [None]:
import pandas as pd
import sys
from pynytimes import NYTAPI
import datetime as dt

from tqdm import tqdm

# Put the project's `src` folder first on the import path so the shared
# utility module can be imported below.
# NOTE(review): this is an absolute, machine-specific Windows path; on any
# other machine main_utils will not be found until the path is adjusted.
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
# NOTE(review): the wildcard import hides which names come from main_utils
# (save_path, used when writing the CSV, is presumably one of them — confirm).
from main_utils import *


### NY Times Developer Platform API


**API call limit:** 500 requests per day and 5 requests per minute. Sleep 12 seconds between calls to avoid hitting the per-minute rate limit.

**Pagination:** The Article Search API returns a maximum of 10 results at a time. The meta node in the response contains the total number of matches ("hits") and the current offset. Use the page query parameter to paginate through the results (page=0 for results 1-10, page=1 for 11-20, ...). You can paginate through up to 100 pages (1,000 results). If a query returns too many results, narrow it by filtering on a date range.

In [None]:
# Read the NYT API key from a local text file so it is not hard-coded here.
# Strip surrounding whitespace: a trailing newline left in the file by an
# editor would otherwise become part of the key sent to the API.
with open('NYT_API_KEY.txt', 'r') as file:
    API_KEY = file.read().strip()

# parse_dates=True asks pynytimes to return date fields as datetime objects
# (the DataFrame step relies on pub_date being datetime-like).
nyt = NYTAPI(API_KEY, parse_dates=True)


# Publication-date window for the search.
start_date = dt.datetime(2003, 12, 31)
stop_date = dt.datetime(2009, 6, 11)

query_string = 'Crude Oil'

# NOTE(review): the Article Search API serves 10 results per page with a cap
# on the number of pages, so far fewer than the 10000 requested results may
# come back. Check the count printed below and split the date range into
# smaller windows if the result set looks truncated.
articles = nyt.article_search(
    query=query_string,
    results=10000,
    dates={
        'begin': start_date,
        'end': stop_date
    },
    options={
        'sort': 'newest'
    }
)

# Show how many articles were fetched and a small preview instead of dumping
# the whole result list into the cell output.
print(f'Fetched {len(articles)} articles')
display(articles[:3])

In [None]:

# Flatten every article returned by the search into one record per row.
# NOTE(review): the 'webTitle' column is filled from the article's
# 'lead_paragraph' field, not from its headline — confirm this is intended.
res_list = [
    {
        'article_id': article['_id'],
        'pub_date': article['pub_date'],
        'webUrl': article['web_url'],
        'webTitle': article['lead_paragraph'],
    }
    for article in tqdm(articles)
]

df = pd.DataFrame(res_list)
# Turn off the timezone (keep the timestamps, drop the tz information)
df['pub_date'] = df['pub_date'].dt.tz_localize(None)

display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()
display(df.describe())

### Save data to folder

In [None]:
# Name of the CSV file to write:
file_name = 'NYT_CrudeANDOil_2.csv'
# Folder (relative path) the file is saved in:
relative_path = 'data/news'

# Build the destination with save_path (not defined in this notebook;
# presumably provided by main_utils), then write the frame without its index.
output_path = save_path(relative_path, file_name)
df.to_csv(output_path, index=False)