In [None]:
import pandas as pd
import sys
from pynytimes import NYTAPI
import datetime as dt

from tqdm.notebook import tnrange

# Import utility functions
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
from main_utils import *
from db_utils import *


### NY Times Developer Platform API


**API call limit:** 500 requests per day and 5 requests per minute. Sleep 12 seconds between calls to avoid hitting the per-minute rate limit.

**Pagination:** The Article Search API returns a maximum of 10 results at a time. The meta node in the response contains the total number of matches ("hits") and the current offset. Use the page query parameter to paginate through results (page=0 for results 1-10, page=1 for 11-20, ...). You can paginate through up to 100 pages (1,000 results). If you get too many results, try filtering by date range.

In [None]:
# Read the NYT API key from a local text file.
# The key is stripped because text files usually end with a newline, which
# would otherwise become part of the key handed to the API client.
with open('API_keys/NYT_API_KEY.txt', 'r') as file:
    API_KEY = file.read().strip()

nyt = NYTAPI(API_KEY, parse_dates=True)

# Search term and the calendar year to fetch; both are reused further down
# when tagging the rows and naming the output file.
query = 'Crude AND Oil'

year = 2003

# Fetch the articles matching `query` that were published during `year`,
# sorted newest first.
# NOTE(review): results=10000 is an upper bound on what is requested; the
# number actually returned may be lower because of API pagination limits —
# confirm against the pynytimes / NYT Article Search documentation.
articles = nyt.article_search(
    query=query,
    results=10000,
    dates={
        'begin': dt.datetime(year, 1, 1),
        'end': dt.datetime(year, 12, 31)
    },
    options={
        'sort': 'newest'
    }
)

display(articles)

In [None]:
# Flatten every API result into one flat record per article.
# NOTE(review): the 'headline' column is filled from the article's 'abstract'
# field, not from a headline field — confirm this is intended.
res_list = []

for i in tnrange(len(articles)):
    article = articles[i]

    res_list.append(
        {'article_id': article['_id'],
         'headline': article['abstract'],
         'datetime': article['pub_date'],
         'web_url': article['web_url'],
        }
    )

# Passing the column names explicitly keeps the frame's schema stable even
# when the search returned no articles.
df = pd.DataFrame(res_list, columns=['article_id', 'headline', 'datetime', 'web_url'])

# Turn off the timezone (presumably pub_date is timezone-aware because the
# client was created with parse_dates=True — verify). Skipped for an empty
# frame, where the column is not datetime-typed and `.dt` is unavailable.
if not df.empty:
    df['datetime'] = df['datetime'].dt.tz_localize(None)

# add source tag
df['source'] = 'NYT'
# add query tag (the query with its spaces removed)
df['query'] = query.replace(" ", "")

display(df)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in display().
df.info()
display(df.describe())

### Save data to folder

In [None]:
# Destination folder (relative path) for the exported CSV:
relative_path = 'data/news'
# File name built from the query (spaces removed) and the fetched year:
file_name = f'NYT_{query.replace(" ", "")}_{year}.csv'

# Write the frame, including its index, to the location resolved by the
# project helper save_path().
df.to_csv(
    path_or_buf=save_path(relative_path, file_name),
    index=True,
)

### Add to database

In [None]:
# Load the combined article file through the project helper load_df() and
# move its index into a regular column before committing.
# NOTE(review): 'NYT_CrudeANDOil.csv' is the file name written by the combiner
# cell, not by the per-year save cell — it must already exist on disk.
df = load_df('news', 'NYT_CrudeANDOil.csv').reset_index()

# Commit the rows to the 'news' table, then show the database summary.
news_db_commit(df, 'news')
db_info(show_table=True)

### Combiner

In [None]:
# Combine the per-year CSV files into a single CSV file.

# Safety switch: set to True to rebuild the combined file.
run_combiner = False

if run_combiner:
    # Load the yearly files NYT_CrudeANDOil_2004.csv ... NYT_CrudeANDOil_2023.csv.
    # The year suffix is zero-padded with a format spec, and all frames are
    # concatenated in one call instead of growing the frame inside the loop.
    yearly_frames = [
        load_df('news', f'NYT_CrudeANDOil_20{yy:02d}.csv')
        for yy in range(4, 24)
    ]
    df = pd.concat(yearly_frames)

    # Enter filename here:
    file_name = 'NYT_CrudeANDOil.csv'
    # Enter relative path for saving the file:
    relative_path = 'data/news'

    df.to_csv(save_path(relative_path, file_name), index=True)