In [1]:
import json
import requests
import pandas as pd
from IPython.display import display

Read in unigrams and NYT API key.

In [2]:
# Collect the contents of the six numbered JSON shards (0.json .. 5.json)
# into one flat list. Each shard is expected to hold a JSON array -- TODO confirm.
unigrams = []
for shard in range(6):
    shard_path = "/home/cline/NYT_SPEED/Unigrams/{0}.json".format(shard)
    with open(shard_path) as shard_file:
        unigrams.extend(json.load(shard_file))

In [3]:
# Read the NYT API key from the local "NYT.key" file.
# Lines are stripped so the trailing newline never ends up inside the request
# URL, and blank lines are ignored so a trailing empty line cannot blank the key.
with open("NYT.key") as key:
    key_lines = [line.strip() for line in key if line.strip()]

if not key_lines:
    raise ValueError("NYT.key does not contain an API key")

# As before, the last line of the file wins if there are several.
NYT_KEY = key_lines[-1]

The usual helper functions for fetching Solr and NYT data.

In [4]:
def get_solr_data(uni_id):
    """Query the local Solr ``derived`` core for one unigram entry.

    The ``aid`` stored in ``unigrams[uni_id]`` is substituted into the
    ``aid:<value>`` select query.

    Args:
        uni_id: index into the module-level ``unigrams`` list
            (per the original note, expected in [0, 1, ..., 53580]).

    Returns:
        The ``requests`` response object, not yet parsed.
    """
    solr_url = "http://localhost:8983/solr/derived/select?q=aid%3A{0}&wt=json&indent=true"
    article_id = unigrams[uni_id]['aid']
    return requests.get(solr_url.format(article_id))

In [5]:
def get_NYT_data(r, NYT_KEY):
    """Search the NYT Article Search API for the article described by a Solr response.

    The title and publication date of the first document in ``r`` are used to
    build a ``headline:("<title>") AND pub_date:<YYYY-MM-DD>`` filter query.

    Args:
        r: response object whose ``.json()`` has the shape
            ``{'response': {'docs': [{'title': ..., 'publication_date': ...}]}}``
            (what ``get_geo_data`` passes in from ``get_solr_data``).
        NYT_KEY: NYT API key; surrounding whitespace is ignored.

    Returns:
        The ``requests`` response object from the NYT API, not yet parsed.

    Raises:
        KeyError / IndexError: if the Solr response has no documents or the
            first document lacks ``title`` / ``publication_date``.
    """
    # Parse the Solr payload once instead of once per field.
    doc = r.json()['response']['docs'][0]
    ttl = doc['title']
    date = doc['publication_date'][:10]  # keep only the YYYY-MM-DD prefix

    # Let requests URL-encode the query: titles containing '&', '#' or '+'
    # would otherwise corrupt the query string when interpolated into the URL.
    params = {
        'fq': 'headline:("{0}") AND pub_date:{1}'.format(ttl, date),
        'api-key': NYT_KEY.strip(),
    }
    return requests.get(
        'http://api.nytimes.com/svc/search/v2/articlesearch.json',
        params=params,
    )

This function gathers all the relevant geolocation data from the Solr index and the NYT API.

In [6]:
def get_geo_data(uni_id, NYT_KEY):
    """Collect geolocation information for one article from Solr and the NYT API.

    Args:
        uni_id: index into the unigram list, forwarded to ``get_solr_data``.
        NYT_KEY: NYT API key, forwarded to ``get_NYT_data``.

    Returns:
        A 4-tuple ``(df, extracted, nyt_glocs, snp)``:
          * ``df`` -- DataFrame with columns ``geolocation``,
            ``geolocation_locations`` and ``geolocation_probabilities`` taken
            from the first Solr document (empty if any of those fields is missing);
          * ``extracted`` -- the document's ``extracted_locations`` (``[]`` if absent);
          * ``nyt_glocs`` -- values of the NYT keywords named ``glocations``;
          * ``snp`` -- the NYT snippet text.
        When the NYT search matches no article, ``nyt_glocs`` is ``[]`` and
        ``snp`` is ``''``.

    Raises:
        IndexError: if the Solr response contains no documents.
        KeyError: if the NYT response has no ``response``/``docs`` section
            (e.g. an API error payload) or the matched article lacks
            ``keywords`` / ``snippet``.
    """
    # make dataframe from Solr data
    keys = ['geolocation', 'geolocation_locations', 'geolocation_probabilities']

    r = get_solr_data(uni_id)
    js = r.json()['response']['docs'][0]
    try:
        # One row per key, transposed so that each key becomes a column.
        df = pd.DataFrame([js[key] for key in keys]).T
        df.columns = keys
    except KeyError:
        df = pd.DataFrame(columns=keys)

    # get extracted locs
    extracted = js.get('extracted_locations', [])

    # get nyt location info
    nyt_docs = get_NYT_data(r, NYT_KEY).json()['response']['docs']
    if not nyt_docs:
        # The exact-headline search found nothing: report empty NYT data
        # rather than failing with an IndexError.
        return (df, extracted, [], '')

    nyt_meta = nyt_docs[0]
    nyt_glocs = [x['value'] for x in nyt_meta['keywords'] if x['name'] == 'glocations']
    snp = nyt_meta['snippet']

    return (df, extracted, nyt_glocs, snp)
    

Run the cell below to print the extracted location information for a given article (change the index to inspect a different one).

In [7]:
def get_geo_report(uni_id, NYT_KEY):
    """Print a four-part geolocation report for one article.

    Sections, in order: the five highest-probability Solr geolocations
    (rendered with ``display``), the Solr extracted locations, the NYT
    ``glocations`` keywords, and the NYT snippet.

    Args:
        uni_id: index of the article, forwarded to ``get_geo_data``.
        NYT_KEY: NYT API key, forwarded to ``get_geo_data``.

    Returns:
        None; everything is written to the notebook output.
    """
    geo_df, extracted_locs, nyt_locs, snippet = get_geo_data(uni_id, NYT_KEY)
    rule = "-" * 40

    print('Geolocations from Solr')
    print(rule)
    ranked = geo_df.sort_values('geolocation_probabilities', ascending=False)
    display(ranked.head(5))
    print('\n')

    # The two list sections share the same heading / rule / payload layout.
    list_sections = (
        ('Extracted Locations from Solr', extracted_locs),
        ('Geolocations from NYT API', nyt_locs),
    )
    for heading, payload in list_sections:
        print(heading)
        print(rule)
        print(payload)
        print('\n')

    print('Snippet')
    print(rule)
    print(snippet)

In [None]:
# Example: print the geolocation report for the unigram entry at index 9.
get_geo_report(9, NYT_KEY)