## Example Python script for downloading articles from the Event Registry database

As a first step, we import all the necessary libraries and define a search query.

In [None]:
import csv, os
from eventregistry import *
import datetime

# Register on eventregistry.org to obtain an API key, then paste it below.
er = EventRegistry(apiKey="PUT YOUR API KEY HERE")

# Two alternative queries follow. Both are assigned to `q`, so the second
# assignment replaces the first and is the one printed at the end.

# Alternative 1: match articles by concept. Each label is resolved to a
# concept URI through the API and the URIs are OR-ed together.
q = QueryArticlesIter(
    conceptUri=QueryItems.OR([
        er.getConceptUri(label)
        for label in (
            "Politics",
            "Election",
            "Referendum",
            "Elections in Italy",
            "Political party",
            "Political campaign",
            "Referendums in Italy",
            "Politician",
            "General election",
        )
    ]),
    dateStart=datetime.date(2016, 10, 3),
    dateEnd=datetime.date(2016, 12, 4),
    isDuplicateFilter="skipDuplicates",
    lang="ita",
)

# Alternative 2: match articles whose text contains all of the given keywords.
q = QueryArticlesIter(
    keywords=QueryItems.AND(["salvini", "carola"]),
    dateStart=datetime.date(2019, 7, 1),
    dateEnd=datetime.date(2019, 7, 19),
    isDuplicateFilter="skipDuplicates",
    lang="ita",
)
print(q)

Now we can run the search query and save the data to a CSV file.

In [None]:
#Replace chars that might generate problems
import re
# Characters that might cause problems in the output, each mapped to a single
# space. The table is built once instead of on every call.
_REPLACECHAR_TABLE = str.maketrans({
    "\n": " ",
    ",": " ",
    ";": " ",
    "|": " ",
    "\r": " ",
})


def replacechar(text):
    """Return *text* with problematic characters replaced by spaces.

    Newlines, carriage returns, commas, semicolons and pipes are each
    replaced by one space; every other character is left untouched.

    Args:
        text: The string to sanitise.

    Returns:
        A new string of the same length as *text*.

    Raises:
        TypeError: If *text* is not a string.
    """
    if not isinstance(text, str):
        # Keep failing with TypeError for non-string input (e.g. None).
        raise TypeError("expected a string, got %s" % type(text).__name__)
    return text.translate(_REPLACECHAR_TABLE)

def _clean_text(art, *keys):
    """Return the sanitised value found at art[keys[0]][keys[1]]..., or "".

    The lookup is best-effort: a missing key or a value that replacechar()
    rejects yields an empty string instead of aborting the export.
    """
    try:
        value = art
        for key in keys:
            value = value[key]
        return replacechar(value)
    except Exception:
        return ""


def _flatten_field(art, key):
    """Return art[key] with commas turned into "//" and unsafe chars blanked.

    Returns "" when the key is missing or the value has no str.replace().
    NOTE(review): this only yields data when the value is a string; if the
    API returns a list here, the cell is always empty -- confirm.
    """
    try:
        return art[key].replace(",", "//").replace('"', " ").replace('\r', ' ').replace('\n', ' ').replace('|', ' ')
    except Exception:
        return ""


# Create the CSV file. The context manager closes the file even if the
# query or a row write fails part-way through.
with open("XXX.csv", 'w', encoding='utf8', newline='') as ff:
    wr = csv.writer(ff, delimiter='|', quotechar='^')
    wr.writerow(['Title', 'Body', 'Date', 'Language', 'Source', 'Category', 'Concepts', 'Sentiment', 'IsDuplicate', 'Url'])

    # Sort articles by date and define their max number (multiples of 100)
    for art in q.execQuery(er, sortBy="date", maxItems=10000):
        title = _clean_text(art, 'title')
        body = _clean_text(art, 'body')
        date = _clean_text(art, 'date')
        source = _clean_text(art, 'source', 'title')
        lang = _clean_text(art, 'lang')

        # Sentiment is written as-is (not sanitised); empty when absent.
        try:
            sent = art['sentiment']
        except Exception:
            sent = ""

        cat = _flatten_field(art, 'categories')
        concept = _flatten_field(art, 'concepts')

        # 'isDuplicate' and 'url' are read without a fallback, so a missing
        # key still raises here, as it did before.
        newline = [title, body, date, lang, source, cat, concept, sent, str(art['isDuplicate']), art['url']]
        wr.writerow(newline)
        # Push each row out of the Python buffer right after it is written.
        ff.flush()

print("Done!")