In [92]:
import datetime
import os
import time
from collections import defaultdict

import dateutil
import numpy as np
import pandas as pd
import requests
import tensorflow_hub as hub

In [93]:
def get_arcticles(times_api_key, year, month):
    """Fetch one month of article records from the NYT Archive API.

    Args:
        times_api_key: API key sent as the ``api-key`` query parameter.
        year: Year of the archive to fetch (string or int).
        month: Month of the archive to fetch, without zero padding (string or int).

    Returns:
        The list found under ``response['response']['docs']`` in the JSON payload.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    base_url = 'https://api.nytimes.com/svc/archive/v1'
    url = f'{base_url}/{year}/{month}.json'
    # Log only the endpoint: the key travels in `params`, so it never ends up
    # in the notebook's saved output.
    print(url)
    response = requests.get(url, params={'api-key': times_api_key}, timeout=60)
    # Fail loudly on rate-limit/auth errors instead of a confusing KeyError below.
    response.raise_for_status()
    return response.json()['response']['docs']

In [94]:
# Read the NYT API key from the environment so the credential is never stored
# in the notebook (or its saved outputs).
times_api_key = os.environ.get('NYT_API_KEY')
if not times_api_key:
    raise RuntimeError('Set the NYT_API_KEY environment variable before running this notebook.')

In [95]:
# Determine all [year, month] string pairs in the time range of the data we would like to collect.
# Months are not zero-padded (e.g. ['2017', '1']), matching the URL format used by get_arcticles.
end = datetime.date(2020, 9, 29)
start = datetime.date(2017, 1, 1)
# Read year/month from each month-start Timestamp rather than strftime("%-m"),
# which is a platform-specific directive and is not available on Windows.
months_in_range = [
    [str(month_start.year), str(month_start.month)]
    for month_start in pd.date_range(start, end, freq='MS')
]

In [96]:
# Preview the first five [year, month] pairs
months_in_range[:5]

[['2017', '1'], ['2017', '2'], ['2017', '3'], ['2017', '4'], ['2017', '5']]

In [98]:
# Pull every month in the range and accumulate the article records in one flat list
all_articles = []
for year, month in months_in_range:
    monthly_articles = get_arcticles(times_api_key, year, month)
    all_articles += monthly_articles
    # Throttle between requests: can only make at most 10 calls per minute
    time.sleep(10)

https://api.nytimes.com/svc/archive/v1/2017/1.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/2.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/3.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/4.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/5.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/6.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/7.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/8.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/9.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/10.json?api-key=Hkwb5XWIFRzVeUuSjdi7MrteLTRRXnkH
https://api.nytimes.com/svc/archive/v1/2017/11.json?api-key=Hkwb5XWIFRzVeUuSjdi

In [99]:
# Report how many article records were collected across all months
n_articles_pulled = len(all_articles)
print('Total Number of Articles Pulled:', n_articles_pulled)

Total Number of Articles Pulled: 319064


In [102]:
# Example article text - the 'snippet' field is the text that gets embedded for each article below
# NOTE(review): the sample output reads like a one-sentence summary rather than a headline;
# confirm that 'snippet' is the intended field.
all_articles[0]['snippet']

'Some residents of Rutland, Vt., see the expected arrival of 100 refugees as an economic boon, while others are not as optimistic.'

In [103]:
# Load the TensorFlow Hub embedding layer - Universal Sentence Encoder V4 - outputs a 512-dimensional embedding vector
# `embed` is called with a list of strings (see embed_articles).
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [108]:
def embed_articles(articles, embed_fn=None, batch_size=256):
    """Average the snippet embeddings of all articles published on each day.

    Args:
        articles: Iterable of article dicts. Articles without a usable
            'pub_date' or 'snippet' (missing or None) are skipped.
        embed_fn: Callable mapping a list of strings to a 2-D array-like with
            one embedding row per string. Defaults to the notebook-level `embed`.
        batch_size: Maximum number of snippets passed to `embed_fn` per call.

    Returns:
        A tuple ``(date_to_embedding_vector, date_to_num_articles)``:
        - date_to_embedding_vector maps each `datetime.date` to a NumPy array
          of shape (1, embedding_dim) holding the mean embedding for that day.
        - date_to_num_articles maps each `datetime.date` to the number of
          articles that contributed to that mean.
        Both dicts are ordered by first appearance of each date in `articles`.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    # Import the submodule explicitly: a bare `import dateutil` does not
    # guarantee that `dateutil.parser` is loaded.
    from dateutil import parser as date_parser

    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')
    if embed_fn is None:
        embed_fn = embed

    # Group the snippets by publication date first so the model can be called
    # on batches instead of once per article.
    snippets_by_date = defaultdict(list)
    for article in articles:
        pub_date = article.get('pub_date')
        snippet = article.get('snippet')
        # Only parse articles with the requisite data
        if pub_date is None or snippet is None:
            continue
        snippets_by_date[date_parser.parse(pub_date).date()].append(snippet)

    date_to_embedding_vector = {}
    date_to_num_articles = {}
    for date, snippets in snippets_by_date.items():
        running_sum = None
        for batch_start in range(0, len(snippets), batch_size):
            batch_vectors = np.asarray(embed_fn(snippets[batch_start:batch_start + batch_size]))
            # keepdims preserves the (1, embedding_dim) shape downstream code expects
            batch_sum = batch_vectors.sum(axis=0, keepdims=True)
            running_sum = batch_sum if running_sum is None else running_sum + batch_sum

        # Divide by the article count so the number of articles published in a
        # day does not influence the magnitude of the daily vector.
        num_articles = len(snippets)
        date_to_embedding_vector[date] = np.divide(running_sum, num_articles)
        date_to_num_articles[date] = num_articles

    return date_to_embedding_vector, date_to_num_articles

In [109]:
# Embed every pulled article and aggregate into per-day average vectors and per-day article counts
date_to_embedding_vector, date_to_num_articles = embed_articles(all_articles)

In [110]:
# Show how the number of articles varies by day (first ten days in chronological order)
sorted_items = sorted(date_to_num_articles.items())
sorted_items[:10]

[(datetime.date(2017, 1, 1), 73),
 (datetime.date(2017, 1, 2), 88),
 (datetime.date(2017, 1, 3), 296),
 (datetime.date(2017, 1, 4), 180),
 (datetime.date(2017, 1, 5), 374),
 (datetime.date(2017, 1, 6), 222),
 (datetime.date(2017, 1, 7), 172),
 (datetime.date(2017, 1, 8), 77),
 (datetime.date(2017, 1, 9), 370),
 (datetime.date(2017, 1, 10), 189)]

In [111]:
# Show an example vector: the averaged embedding for a single day (2017-01-02)
date_to_embedding_vector[datetime.date(2017, 1, 2)]

array([[ 1.23099133e-04, -2.55809352e-02, -3.91115062e-03,
        -1.96585078e-02, -1.69870276e-02,  1.28033496e-02,
         7.52160838e-03, -9.58103407e-03,  9.20997653e-03,
        -2.58964710e-02, -8.85403063e-03,  1.16800163e-02,
        -4.54811798e-03, -3.72533780e-03,  1.14707174e-02,
        -2.96322946e-02, -5.69329271e-03,  5.12225088e-03,
        -1.89315935e-03, -8.89484363e-04, -1.54290320e-02,
        -2.64706486e-03,  1.41014522e-02,  7.34519400e-03,
         6.47503324e-03,  7.65292672e-03, -1.24665536e-02,
         3.65532492e-03, -5.07821608e-03, -8.94751400e-03,
         8.64684815e-04,  8.21917132e-03,  7.04236154e-04,
        -8.89855088e-04,  2.46133259e-03,  2.28818529e-03,
        -1.78483641e-03, -1.97381712e-03, -1.32329464e-02,
        -6.48285309e-03,  9.20303259e-03, -2.45539169e-03,
         7.54154753e-03, -2.69040447e-02, -1.15917679e-02,
        -6.15437655e-03,  4.80350014e-03, -6.44609053e-03,
        -2.87552574e-03, -1.67720318e-02, -5.57934307e-0

In [112]:
# Convert the date_to_embedding_vector dictionary to a pandas dataframe.
# The frame is built once from a stacked matrix: DataFrame.append was removed in
# pandas 2.0, and growing a frame row-by-row in a loop is quadratic.
dates = list(date_to_embedding_vector)
if dates:
    # Each daily vector has shape (1, embedding_dim); stack them into (n_days, embedding_dim)
    embedding_matrix = np.vstack([
        np.asarray(date_to_embedding_vector[date]).reshape(1, -1) for date in dates
    ])
    embedding_vectors_df = pd.DataFrame(embedding_matrix)
else:
    embedding_vectors_df = pd.DataFrame()

# Make date the first column (datetime64), followed by integer-named embedding columns
embedding_vectors_df.insert(0, 'date', pd.to_datetime(dates))

In [113]:
# Preview the first rows: one row per day, a 'date' column followed by the embedding dimensions
embedding_vectors_df.head()

Unnamed: 0,date,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,2017-01-02,0.000123,-0.025581,-0.003911,-0.019659,-0.016987,0.012803,0.007522,-0.009581,0.00921,...,-0.000628,0.020751,0.000568,0.004707,0.005168,-0.004014,-0.0072,0.00386,-0.017291,0.009441
1,2017-01-01,0.005954,-0.003841,-0.001854,-0.024854,-0.020176,0.011049,-0.004546,-0.001006,0.005869,...,0.003684,0.035374,0.016044,-0.000216,0.023061,-0.01214,-0.002986,-0.013999,-0.011632,0.008429
2,2017-01-03,0.007024,-0.021057,0.000409,-0.025842,-0.016621,0.007097,0.000726,-0.00729,0.000116,...,-0.000661,0.030381,0.013288,0.005367,0.010509,-0.006045,-0.008171,0.003855,-0.007096,0.018201
3,2017-01-04,0.004653,-0.012411,-0.001151,-0.026045,-0.011844,0.010583,6.9e-05,-0.003124,0.006726,...,0.004361,0.025416,0.006094,0.008971,0.00811,0.000185,-0.012961,0.004362,-0.004057,0.010306
4,2017-01-05,0.003516,-0.004221,0.00587,-0.024142,-0.009221,0.01148,0.009126,-0.00025,0.004909,...,0.010125,0.020535,0.008477,0.003254,0.008739,-0.000438,-0.007905,0.002326,0.000348,0.014819


In [114]:
# Persist the daily embedding vectors. Create the target directory first so the
# save does not fail on a fresh checkout where ./pickles does not exist yet.
os.makedirs('./pickles', exist_ok=True)
embedding_vectors_df.to_pickle('./pickles/news_vectors.pkl')

In [115]:
# Determine if any dates are missing from the dataset.
# Reuse the `start`/`end` dates that defined the collection range so this check
# cannot drift out of sync with the data that was actually requested.
missing_dates = pd.date_range(start=start, end=end).difference(embedding_vectors_df.date)

In [116]:
# One missing date
missing_dates.tolist()

[Timestamp('2018-11-30 00:00:00')]