# Query GNI via Solr

Jenna Jordan

22 January 2020 - 12 February 2020

### Purpose

This notebook collects data from the Global News Index (GNI).

In [1]:
import pandas as pd

In [2]:
from query_solr_functions import query_solr, to_daily_timeseries, run_all_queries, filter_results

## Prep for querying BulkLexisNexis corpus

- specify fields to query
- create the base time-series, which the to_daily_timeseries & run_all_queries functions require
- key for converting publishers to acronyms

In [3]:
# Fields requested for the time-series queries; handed to run_all_queries below.
ts_fields = [
    'aid',
    'publication_date',
    'publisher',
]

In [4]:
# Daily date range (pd.date_range's default frequency, both endpoints
# included) used as the time axis for each publisher.
AP_daterange = pd.date_range('1977-01-01', '2019-08-18')
SWB_daterange = pd.date_range('1979-01-01', '2019-08-18')
AFP_daterange = pd.date_range('1991-05-05', '2019-08-18')
XGNS_daterange = pd.date_range('1977-01-01', '2019-08-18')
NYT_daterange = pd.date_range('1980-06-01', '2019-08-18')
WP_daterange = pd.date_range('1977-01-01', '2019-08-18')
UPI_daterange = pd.date_range('1980-09-26', '2019-08-16')
DPA_daterange = pd.date_range('1994-07-03', '2019-08-18')
IPS_daterange = pd.date_range('2010-01-13', '2019-07-17')

# One record per publisher: the name as it appears in the corpus, the
# acronym used in the exported files, and the date range above.
publishers = [
    {'name': full_name, 'abbr': acronym, 'dates': coverage}
    for full_name, acronym, coverage in (
        ('BBC Monitoring: International Reports', 'SWB', SWB_daterange),
        ('The New York Times', 'NYT', NYT_daterange),
        ('The Washington Post', 'WP', WP_daterange),
        ('The Associated Press', 'AP', AP_daterange),
        ('Agence France Presse - English', 'AFP', AFP_daterange),
        ('Xinhua General News Service', 'XGNS', XGNS_daterange),
        ('UPI (United Press International)', 'UPI', UPI_daterange),
        ('dpa international (Englischer Dienst)', 'DPA', DPA_daterange),
        ('Inter Press Service', 'IPS', IPS_daterange),
    )
]

# Base time series: one row per (publisher, day). Each per-publisher frame
# keeps its own 0..n-1 index, so index labels repeat after the concat.
# NOTE(review): the per-frame categorical dtype is probably lost in the
# concat because the categories differ between frames - confirm if the
# dtype matters downstream.
all_dfs = [
    pd.DataFrame(entry['dates'], columns=['publication_date'])
    .assign(publisher=entry['name'])
    .astype({'publisher': 'category'})
    for entry in publishers
]
base_ts = pd.concat(all_dfs)

In [5]:
# Lookup from full publisher name to its acronym, built from `publishers`.
pubmap = {entry['name']: entry['abbr'] for entry in publishers}

### Query to get total article counts within BLN

Note: this query is kept separate so that it doesn't have to be re-run each time a new query is added (it takes the longest to run).

In [None]:
# Single-entry query list matching any content within the BulkLexisNexis
# source. The surrounding newline/indent whitespace is part of the query
# string and is kept exactly.
biggest_query = [
    dict(
        name='BLN_total',
        query='\n    (content:*) AND source_name:BulkLexisNexis\n    ',
    ),
]

In [None]:
# Run the BLN-total query through run_all_queries (imported from
# query_solr_functions), passing the field list and the base time series
# prepared above.
# NOTE(review): presumably returns per-publisher daily counts indexed by
# date/publisher - confirm against query_solr_functions.
bln_df = run_all_queries(biggest_query, ts_fields, base_ts)

In [None]:
# Move the index levels back into ordinary columns (they are kept, not dropped).
bln_df = bln_df.reset_index(drop=False)

In [None]:
# Replace full publisher names with their acronyms via pubmap.
bln_df['publisher'] = bln_df['publisher'].map(pubmap)

In [None]:
# Save the BLN totals; the DataFrame index is not written.
bln_df.to_csv(path_or_buf="../Data/bln_daily_total.csv", index=False)

### All Queries

In [7]:
# Clauses shared by the insect/pollinator queries. Each is spelled out once
# and spliced into the query strings below, so the resulting strings are
# character-for-character the same as writing every query out in full.
INSECT_TERMS = '(insect OR pollinator OR bee OR honeybee OR moth)'
POLLINATOR_TERMS = '((insect AND pollinator) OR (bee OR honeybee OR moth))'
POPULATION_TERMS = '("insect population"~5 OR "pollinator population"~5 OR "bee population"~5 OR "honeybee population"~5 OR "moth population"~5 OR "biological diversity" OR biodiversity OR biomass OR ecolog* OR ecosystem* OR entomolog*)'
RESEARCH_TERMS = '(study OR professor OR experiment OR research OR analysis OR data)'
DECLINE_TERMS = '(crisis OR "colony collapse" OR apocalypse OR armageddon OR extinct OR "insect decline"~5 OR "insect drop"~5 OR "insect decrease"~5 OR "insect disappear"~5 OR "population decline"~5 OR "population drop"~5 OR "population decrease"~5 OR "population disappear"~5 OR "abundance decline"~5 OR "abundance drop"~5 OR "abundance decrease"~5 OR "abundance disappear"~5)'
# NOTE(review): "S?nchez-Bayo" looks like a mis-encoded "Sanchez-Bayo" with an
# accented a (or a deliberate single-character wildcard), and "Chris Thomas"
# is listed twice. Both are kept verbatim so the query itself is unchanged.
STUDY_NAME_TERMS = '("Krefeld" OR "the German study" OR "Hans de Kroon" OR "Martin Sorg" OR "Werner Stenmans" OR "Dave Goulson" OR "Brad Lister" OR "Andres Garcia" OR "the Puerto Rico study" OR "S?nchez-Bayo" OR "Wyckhuys" OR "Rob Dunn" OR "David Wagner" OR "Chris Thomas" OR "Anders Tottrup" OR "Kevin Gaston" OR "Chris Thomas" OR "Roel van Klink" OR "Arthur Shapiro" OR "Aletta Bonn" OR "E.O. Wilson")'


def _content_query(*clauses):
    """AND the clauses together after `content:` and restrict to BulkLexisNexis.

    The leading/trailing newline and four-space indent reproduce the layout
    of a triple-quoted query literal.
    """
    joined = ' AND '.join(clauses)
    return '\n    (content: ' + joined + ') AND (source_name:BulkLexisNexis)\n    '


# NOTE(review): under Solr's standard query parser a `content:` prefix binds
# only to the term/group immediately after it; later groups would be matched
# against the default search field. Confirm this is intended (or that the
# default field is `content`).
all_queries = [
    {'name': 'insect_population',
     'query': _content_query(INSECT_TERMS, POPULATION_TERMS, RESEARCH_TERMS)},
    {'name': 'insect_decline',
     'query': _content_query(INSECT_TERMS, POPULATION_TERMS, RESEARCH_TERMS, DECLINE_TERMS)},
    {'name': 'pollinator_population',
     'query': _content_query(POLLINATOR_TERMS, POPULATION_TERMS, RESEARCH_TERMS)},
    {'name': 'pollinator_decline',
     'query': _content_query(POLLINATOR_TERMS, POPULATION_TERMS, RESEARCH_TERMS, DECLINE_TERMS)},
    {'name': 'insect_apocalypse',
     'query': '\n    (content:"insect apocalypse"~5 OR "insect armageddon"~5 OR "beepocalypse") AND source_name:BulkLexisNexis\n    '},
    {'name': 'colony_collapse',
     'query': '\n    (content:"colony collapse" AND (bee OR honeybee)) AND source_name:BulkLexisNexis\n    '},
    {'name': 'climate_change',
     'query': '\n    (content:"climate change" OR "global warming") AND source_name:BulkLexisNexis\n    '},
    {'name': 'climate_change_IPCCreport',
     'query': '\n    (content:("climate change" OR "global warming") AND ("IPCC" OR "Intergovernmental Panel on Climate Change") AND report) AND source_name:BulkLexisNexis\n    '},
    {'name': 'insect_population_studies',
     'query': _content_query(STUDY_NAME_TERMS, INSECT_TERMS, POPULATION_TERMS, RESEARCH_TERMS)},
]

Note: the list of query dictionaries in the next cell (`all_queries_pruned`) is meant to be used with the filter_results function (and with prune=True for run_all_queries).

After running a statistical analysis to compare pruned vs un-pruned data, we decided to not prune the data because there was no effect on the final analysis, and explaining the pruning process would be too complicated.

In [None]:
def _boosted_query(subject_terms, include_decline):
    """Build one relevance-boosted query string for the pruned variants.

    The subject and population clauses carry a ^5 boost and the source
    restriction a ^0.00001 boost. `include_decline` appends the decline
    clause. The leading/trailing newline and four-space indent reproduce the
    layout of a triple-quoted query literal.
    """
    population_terms = '("insect population"~5 OR "pollinator population"~5 OR "bee population"~5 OR "honeybee population"~5 OR "moth population"~5 OR "biological diversity" OR biodiversity OR biomass OR ecolog* OR ecosystem* OR entomolog*)'
    research_terms = '(study OR professor OR experiment OR research OR analysis OR data)'
    decline_terms = '(crisis OR "colony collapse" OR apocalypse OR armageddon OR extinct OR "insect decline"~5 OR "insect drop"~5 OR "insect decrease"~5 OR "insect disappear"~5 OR "population decline"~5 OR "population drop"~5 OR "population decrease"~5 OR "population disappear"~5 OR "abundance decline"~5 OR "abundance drop"~5 OR "abundance decrease"~5 OR "abundance disappear"~5)'
    clauses = [subject_terms + '^5', population_terms + '^5', research_terms]
    if include_decline:
        clauses.append(decline_terms)
    return ('\n    (content: ' + ' AND '.join(clauses)
            + ') AND (source_name:BulkLexisNexis)^0.00001\n    ')


# Each entry adds 'add_fields' (extra fields to request) and 'filter_method'
# (a (method, threshold) pair) to the usual name/query keys.
# NOTE(review): how the thresholds were chosen is not shown here.
all_queries_pruned = [
    {'name': 'insect_population_pruned',
     'query': _boosted_query('(insect OR pollinator OR bee OR honeybee OR moth)', False),
     'add_fields': ['score'],
     'filter_method': ('score', 35.73716)},
    {'name': 'insect_decline_pruned',
     'query': _boosted_query('(insect OR pollinator OR bee OR honeybee OR moth)', True),
     'add_fields': ['score'],
     'filter_method': ('score', 34.896996)},
    {'name': 'pollinator_population_pruned',
     'query': _boosted_query('((insect AND pollinator) OR (bee OR honeybee OR moth))', False),
     'add_fields': ['score'],
     'filter_method': ('score', 32.83181)},
    {'name': 'pollinator_decline_pruned',
     'query': _boosted_query('((insect AND pollinator) OR (bee OR honeybee OR moth))', True),
     'add_fields': ['score'],
     'filter_method': ('score', 30.817253)},
    {'name': 'climate_change_pruned',
     'query': '\n    (content:"climate change" OR "global warming") AND source_name:BulkLexisNexis\n    ',
     'add_fields': ['termfreq(content, climate)', 'termfreq(content, warming)'],
     'filter_method': ('termfreq', 2)},
]

## Create Time-Series dataset

In [8]:
# Run every entry of all_queries through run_all_queries (imported from
# query_solr_functions) with the shared field list and base time series.
# The helper reports the number of documents returned for each query.
query_df = run_all_queries(all_queries, ts_fields, base_ts)

Success! Your query returned 9948 documents.
Success! Your query returned 2223 documents.
Success! Your query returned 3842 documents.
Success! Your query returned 1063 documents.
Success! Your query returned 34 documents.
Success! Your query returned 481 documents.
Success! Your query returned 196365 documents.
Success! Your query returned 4679 documents.
Success! Your query returned 118 documents.


In [9]:
# Move the index levels back into ordinary columns (they are kept, not dropped).
query_df = query_df.reset_index(drop=False)

In [10]:
# Replace full publisher names with their acronyms via pubmap.
query_df['publisher'] = query_df['publisher'].map(pubmap)

In [11]:
# Save the time-series results; the DataFrame index is not written.
query_df.to_csv(path_or_buf="../Data/query_results_bln-ts_26Feb.csv", index=False)

## Create article-level dataset to compare articles across queries

In [12]:
# Fields requested for the article-level comparison.
article_fields = ['aid', 'publisher', 'publication_date', 'title', 'url']

# Fetch the matching articles for each query and add a column named after
# the query, set to 1 for every returned row.
article_dfs = []
for query_spec in all_queries:
    hits = query_solr(query_spec['query'], article_fields)
    hits[query_spec['name']] = 1
    article_dfs.append(hits)

Success! Your query returned 9948 documents.
Success! Your query returned 2223 documents.
Success! Your query returned 3842 documents.
Success! Your query returned 1063 documents.
Success! Your query returned 34 documents.
Success! Your query returned 481 documents.
Success! Your query returned 196365 documents.
Success! Your query returned 4679 documents.
Success! Your query returned 118 documents.


In [13]:
# Columns shared by every per-query result; together they form the join key.
merge_keys = ['aid', 'publication_date', 'publisher', 'title', 'url']

# Outer-merge the per-query frames so each article appears once, with one
# indicator column per query.
article_df = article_dfs[0]
for df in article_dfs[1:]:
    article_df = article_df.merge(df, on=merge_keys, how='outer')

# An article that a query did not return has NaN in that query's indicator
# column after the outer merge; those become 0. Only the indicator columns
# are filled: a frame-wide fillna(0) would also overwrite missing metadata
# (e.g. a missing title or url) with the integer 0.
indicator_cols = [col for col in article_df.columns if col not in merge_keys]
article_df = article_df.fillna({col: 0 for col in indicator_cols})

In [14]:
# Cast only the per-query indicator columns to int. A frame-wide
# astype(int, errors='ignore') silently skips whatever fails to convert, so
# a genuine problem in an indicator column would go unnoticed.
query_flag_cols = [spec['name'] for spec in all_queries]
article_df = article_df.astype({col: int for col in query_flag_cols})

In [15]:
# Replace full publisher names with their acronyms via pubmap.
article_df['publisher'] = article_df['publisher'].map(pubmap)

In [18]:
# Keep the first ten characters (the YYYY-MM-DD part) of each value as text,
# then parse it; anything that does not match the format becomes NaT.
date_text = article_df['publication_date'].astype('str').str.slice(0, 10)
article_df['publication_date'] = pd.to_datetime(
    date_text,
    format='%Y-%m-%d',
    errors='coerce',
)

In [16]:
# Remove rows from the excluded publishers. Despite its name,
# bad_publisher_mask holds the index labels of those rows, not a boolean mask.
excluded_publishers = ['IPS', 'UPI', 'SWB']
is_excluded = article_df['publisher'].isin(excluded_publishers)
bad_publisher_mask = article_df.loc[is_excluded].index
article_df = article_df.drop(index=bad_publisher_mask)

In [23]:
# Remove rows whose publication_date is missing. bad_date_mask holds the
# index labels of those rows, not a boolean mask.
has_no_date = article_df['publication_date'].isna()
bad_date_mask = article_df.loc[has_no_date].index
article_df = article_df.drop(index=bad_date_mask)

In [25]:
# Order rows by publisher acronym, then by date within each publisher.
sort_keys = ['publisher', 'publication_date']
article_df = article_df.sort_values(by=sort_keys, ascending=True)

In [26]:
# Display article_df as the cell output (pandas abbreviates the rendering).
article_df

Unnamed: 0,title,publisher,publication_date,aid,url,insect_population,insect_decline,pollinator_population,pollinator_decline,insect_apocalypse,colony_collapse,climate_change,climate_change_IPCCreport,insect_population_studies
120020,Forty dead in flooding in eastern Turkey,AFP,1991-05-17,20190301235543664,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
120022,Czechoslovakia lighthouse of reform: minister,AFP,1991-05-31,20190301235551744,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
149257,"Oil consumers to boost stocks, better relation...",AFP,1991-06-03,20190302000102692,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
120021,Rising seas pose major threat to Pacific islands,AFP,1991-06-09,20190301235841326,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
29148,Indonesian group criticizes U.S. over global w...,AFP,1991-06-11,20190301235840927,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55678,Ex-Ethiopian PM urges Africa to embrace tech t...,XGNS,2019-08-16,20190830211535140,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
53822,"Xinhua Asia-Pacific news summary at 1600 GMT, ...",XGNS,2019-08-17,20190831000249345,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
129066,"1st LD: China, France should work together to ...",XGNS,2019-08-18,20190901000258420,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0
147207,Feature: Italy agricultural sector facing setb...,XGNS,2019-08-18,20190901000258414,https://advance.lexis.com/api/document?collect...,0,0,0,0,0,0,1,0,0


In [27]:
# Save the article-level comparison table; the DataFrame index is not written.
article_df.to_csv(
    path_or_buf="../Data/Analyze/BLNqueries_compare_article-level_26Feb.csv",
    index=False,
)

In [22]:
# Save only the article ids (the `aid` column); the index is not written.
article_ids = article_df.loc[:, 'aid']
article_ids.to_csv(path_or_buf="../Data/Metadata/aids_for_metadata_request.csv", index=False)