# Query GNI via Solr

Jenna Jordan

22 January 2020

In [1]:
import pandas as pd
import requests
import time

## Create functions

- `query_solr()` will take a plain text query and the desired fields and return the results of the query as a list of dicts
- `to_daily_timeseries()` will then take this list of dicts (with the presumed fields of `aid`, `publication_date`, and `publisher`) and transform it into a timeseries, with article counts for each day. A column name for the query should be specified.

In [2]:
# Solr fields requested for every query; to_daily_timeseries() reads these three columns.
ts_fields = [
    'aid',
    'publication_date',
    'publisher',
]

In [3]:
def query_solr(query:str, fields:list):
    """Run a query against the Solr index, restricted to the BulkLexisNexis source.

    Two requests are made: the first asks for zero rows just to learn how many
    documents match, the second asks for exactly that many rows so that every
    match is returned in a single response.

    Parameters
    ----------
    query : str
        The Solr query text (e.g. ``'(content:"climate change")'``). Leading and
        trailing whitespace is ignored. The corpus filter is appended here, so
        the caller should not include it.
    fields : list
        Names of the Solr fields to return for each document.

    Returns
    -------
    list of dict or None
        One dict per matching document on success. ``None`` (after printing
        "Something went wrong.") if either request does not return HTTP 200 or
        if no documents come back.
    """
    # the url to send the query to
    baseurl = "http://kaboodle.clinecenter.illinois.edu:8983/solr/index/select?"

    # Incorporate parameters. They are handed to requests as a dict so that it
    # URL-encodes them; characters such as '&', '#' or '+' inside the query
    # would otherwise be misread as URL syntax.
    params = {
        "fl": ",".join(fields),
        # The space before AND matters: without it the operator is glued onto
        # the last token of any query that does not end in whitespace.
        "q": query.strip() + " AND source_name:BulkLexisNexis",
        "rows": 0,
    }

    # figure out how many results to get (rows=0 returns only the hit count)
    time.sleep(5)  # pause before each request, presumably to go easy on the server
    init_query_results = requests.get(baseurl, params=params)

    # Stop here if the count request failed: there is no hit count to ask for.
    if init_query_results.status_code != 200:
        print("Something went wrong.")
        return None

    records_found = init_query_results.json()['response']['numFound']

    # now get the actual data
    params["rows"] = records_found

    time.sleep(5)
    final_query_results = requests.get(baseurl, params=params)

    # exception handling in case something goes wrong with the query
    if final_query_results.status_code == 200:
        records = final_query_results.json()['response']['docs']
    else:
        records = []

    if len(records) > 0:
        print("Success! Your query returned " + str(records_found) + " documents.")
        return records
    else:
        print("Something went wrong.")
        return None

In [4]:
# One daily date range per publisher; each becomes that publisher's rows in base_ts.
SWB_daterange = pd.date_range(start='1979-01-01', end='2019-08-18')
NYT_daterange = pd.date_range(start='1980-06-01', end='2019-08-18')
WP_daterange = pd.date_range(start='1977-01-01', end='2019-08-18')
AP_daterange = pd.date_range(start='1977-01-01', end='2019-08-18')
AFP_daterange = pd.date_range(start='1991-05-05', end='2019-08-18')
XGNS_daterange = pd.date_range(start='1977-01-01', end='2019-08-18')
UPI_daterange = pd.date_range(start='1980-09-26', end='2019-08-16')
DPA_daterange = pd.date_range(start='1994-07-03', end='2019-08-18')
IPS_daterange = pd.date_range(start='2010-01-13', end='2019-07-17')

# Full publisher name, short abbreviation, and date range for every source.
publishers = [
    {'name': full_name, 'abbr': short_name, 'dates': date_index}
    for full_name, short_name, date_index in (
        ('BBC Monitoring: International Reports', 'SWB', SWB_daterange),
        ('The New York Times', 'NYT', NYT_daterange),
        ('The Washington Post', 'WP', WP_daterange),
        ('The Associated Press', 'AP', AP_daterange),
        ('Agence France Presse - English', 'AFP', AFP_daterange),
        ('Xinhua General News Service', 'XGNS', XGNS_daterange),
        ('UPI (United Press International)', 'UPI', UPI_daterange),
        ('dpa international (Englischer Dienst)', 'DPA', DPA_daterange),
        ('Inter Press Service', 'IPS', IPS_daterange),
    )
]

# Build one (publication_date, publisher) frame per source and stack them into
# the base table: one row for every publisher on every day of its date range.
all_dfs = []
for pub in publishers:
    df = pd.DataFrame(pub['dates'], columns=['publication_date'])
    df = df.assign(publisher=pub['name']).astype({'publisher': 'category'})
    all_dfs.append(df)
base_ts = pd.concat(all_dfs)

def to_daily_timeseries(results:list, query_name:str):
    """Turn a list of document records into per-day, per-publisher article counts.

    `results` is a list of dicts carrying `aid`, `publication_date` and
    `publisher`. The returned frame is indexed by (publication_date, publisher),
    has one row for every row of `base_ts`, and holds a single column named
    `query_name` with the number of distinct `aid` values (0 where none matched).
    """
    keys = ['publication_date', 'publisher']

    articles = pd.DataFrame(results)

    # Keep only the YYYY-MM-DD prefix of each date value before parsing it.
    day_strings = articles['publication_date'].astype('str').str[:10]
    articles['publication_date'] = pd.to_datetime(day_strings, format='%Y-%m-%d')

    # Distinct article ids per day and publisher, under the query's column name.
    daily_counts = articles.groupby(keys).agg({'aid':'nunique'}).reset_index()
    daily_counts = daily_counts.rename(columns={'aid':query_name})

    # Left-merge onto the base table so days with no matches appear as 0.
    merged = base_ts.merge(daily_counts, on=keys, how='left')
    merged = merged.fillna(0)
    return merged.sort_values(keys).set_index(keys)

In [5]:
def run_all_queries(queries:list):
    """Run each query and collect the daily counts side by side in one table.

    `queries` is a list of dicts with a 'name' (the output column) and a
    'query' (the text passed to query_solr). The result is indexed by
    (publication_date, publisher) and has one column per query.
    """
    index_cols = ['publication_date', 'publisher']

    # Start from the bare index of the base table; each query adds one column.
    combined = base_ts.sort_values(index_cols).set_index(index_cols)

    for spec in queries:
        hits = query_solr(spec['query'], ts_fields)
        counts = to_daily_timeseries(hits, spec['name'])
        combined = combined.merge(counts, left_index=True, right_index=True, how='left')

    return combined

## Queries

- only the content query should be included - the corpus filter (`source_name:BulkLexisNexis`) is appended automatically by `query_solr()`
- enclose the entire query field in triple quotes
- enclose the entire content query in parentheses
- no spaces in the query name

In [6]:
# Each entry pairs an output column name ('name') with the Solr content query ('query') that fills it.
all_queries = [
    dict(name='BLN_total', query="""
    (content:*)
    """),
    dict(name='climate_change', query="""
    (content:"climate change")
    """),
    dict(name='global_warming', query="""
    (content:"global warming")
    """),
    dict(name='pollinator_population', query="""
    (content:(insect* OR pollinator* OR bee* OR honeybee* OR butterfl* OR moth*) AND (population OR *diversity OR biomass OR ecolog* OR ecosystem* OR entomolog*))
    """),
    dict(name='pollinator_crisis', query="""
    (content:(insect* OR pollinator* OR bee* OR honeybee* OR butterfl* OR moth*) AND (population OR *diversity OR biomass OR ecolog* OR ecosystem* OR entomolog*) AND (crisis OR "colony collapse" OR apocalypse OR extinct* OR declin* OR drop OR decreas* OR disappear*))
    """),
]

In [7]:
# Run every query in all_queries and merge the daily counts into one table.
# query_solr() sleeps before each of its two requests, so this cell is slow.
df = run_all_queries(all_queries)

Success! Your query returned 33381218 documents.
Success! Your query returned 157645 documents.
Success! Your query returned 76736 documents.
Success! Your query returned 918284 documents.
Success! Your query returned 304088 documents.


In [8]:
# Turn the (publication_date, publisher) index back into ordinary columns.
df = df.reset_index()

In [9]:
# Lookup from each publisher's full name to its abbreviation.
pubmap = {entry['name']: entry['abbr'] for entry in publishers}

In [10]:
# Replace full publisher names with their abbreviations.
df['publisher'] = df['publisher'].map(pubmap)

In [11]:
# Display the combined table of daily counts.
df

Unnamed: 0,publication_date,publisher,BLN_total,climate_change,global_warming,pollinator_population,pollinator_crisis
0,1977-01-01,AP,9.0,0.0,0.0,3.0,0.0
1,1977-01-01,WP,112.0,0.0,0.0,6.0,4.0
2,1977-01-01,XGNS,39.0,0.0,0.0,1.0,1.0
3,1977-01-02,AP,37.0,0.0,0.0,2.0,1.0
4,1977-01-02,WP,195.0,0.0,0.0,4.0,0.0
...,...,...,...,...,...,...,...
113056,2019-08-18,AP,202.0,1.0,1.0,8.0,2.0
113057,2019-08-18,NYT,372.0,9.0,1.0,47.0,23.0
113058,2019-08-18,WP,349.0,19.0,4.0,30.0,14.0
113059,2019-08-18,XGNS,223.0,3.0,0.0,7.0,2.0


In [12]:
# Save the combined table as CSV, leaving out the row index.
df.to_csv("../Data/queries_4Feb.csv", index=False)