In [1]:
# Install the gdelt client package with pip.
# NOTE(review): no version is pinned, so re-running this cell later may install a different release.
!pip install gdelt

Collecting gdelt
  Downloading https://files.pythonhosted.org/packages/65/f9/a3d5111c8f17334b1752c32aedaab0d01ab4324bf26417bd41890d5b25d0/gdelt-0.1.10.6.1-py2.py3-none-any.whl (773kB)
[K    100% |████████████████████████████████| 778kB 1.7MB/s eta 0:00:01
[?25hCollecting pandas>=0.20.3 (from gdelt)
  Downloading https://files.pythonhosted.org/packages/74/24/0cdbf8907e1e3bc5a8da03345c23cbed7044330bb8f73bb12e711a640a00/pandas-0.24.2-cp35-cp35m-manylinux1_x86_64.whl (10.0MB)
[K    100% |████████████████████████████████| 10.0MB 139kB/s eta 0:00:01
Installing collected packages: pandas, gdelt
  Found existing installation: pandas 0.20.1
    Uninstalling pandas-0.20.1:
      Successfully uninstalled pandas-0.20.1
Successfully installed gdelt-0.1.10.6 pandas-0.24.2
[33mYou are using pip version 9.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Upgrade numpy to the newest release pip can find.
# NOTE(review): no version is pinned, so re-running this cell later may install a different release.
!pip install --upgrade numpy

Collecting numpy
  Downloading https://files.pythonhosted.org/packages/f6/f3/cc6c6745347c1e997cc3e58390584a250b8e22b6dfc45414a7d69a3df016/numpy-1.16.3-cp35-cp35m-manylinux1_x86_64.whl (17.2MB)
[K    100% |████████████████████████████████| 17.2MB 82kB/s  eta 0:00:01
[?25hInstalling collected packages: numpy
  Found existing installation: numpy 1.11.3
    Uninstalling numpy-1.11.3:
      Successfully uninstalled numpy-1.11.3
Successfully installed numpy-1.16.3
[33mYou are using pip version 9.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import os
import datetime
import dateutil

import pandas as pd
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

import gdelt

In [2]:
def mkdirs(year, month, day):
    """
    Make Hive-style partitioned directories for data.

    The directories are created relative to the current working directory,
    in the form: year=2019/month=4/day=21/

    year, month and day are converted with str() and are not zero-padded.
    Calling this again for a partition that already exists does nothing.
    """
    partition_dir = os.path.join(
        'year=' + str(year),
        'month=' + str(month),
        'day=' + str(day),
    )

    # makedirs creates every missing level in one call; exist_ok=True avoids
    # the race between testing os.path.isdir() and calling os.mkdir().
    os.makedirs(partition_dir, exist_ok=True)


def fetch_gdelt_events(year, month, day):
    """
    Download every GDELT event for one day and return the Pandas dataframe.

    The query date is built as YYYY-MM-DD with month and day zero-padded.
    """
    iso_day = '-'.join([str(year), str(month).zfill(2), str(day).zfill(2)])

    client = gdelt.gdelt(version=2)
    events = client.Search(date=iso_day, table='events', coverage=True, translation=False)

    # Progress line; uses the raw (unpadded) month and day values
    print("Fetched %s events for %s-%s-%s" % (len(events), year, month, day))

    return events


def pandas_to_json_file(df, filepath):
    """
    Convert the dataframe df row-by-row to JSON and save to filepath.

    Each row is written as one JSON object followed by a newline, so the
    result is a newline-delimited JSON file. The file is opened in 'w' mode,
    so any existing content at filepath is replaced.
    """
    # The with-block closes the file even if serialising a row raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filepath, 'w') as fp:
        for _, row in df.iterrows():
            row.to_json(fp)
            fp.write('\n')
    
    
def fetch_and_store(datetime_day):
    """
    Download one day of GDELT events and write them to a local JSON file.

    datetime_day is a Python datetime object; its year, month and day pick
    both the data to fetch and the partition directory to write into.
    """
    year, month, day = datetime_day.year, datetime_day.month, datetime_day.day

    # Partition directories must exist before the file is written
    mkdirs(year=year, month=month, day=day)

    # Download the day's events
    events = fetch_gdelt_events(year=year, month=month, day=day)

    # The file name zero-pads month and day; the directory names do not
    filename = 'gdelt_{}{}{}.json'.format(year, str(month).zfill(2), str(day).zfill(2))
    filepath = 'year={}/month={}/day={}/{}'.format(year, month, day, filename)
    pandas_to_json_file(df=events, filepath=filepath)
    
    
def fetch_and_store_range(start_date, end_date):
    """
    Fetch an interval of GDELT data, one day at a time.

    start_date and end_date are text, something like '2019-01-01'.
    Both ends are inclusive; if start_date is after end_date nothing is fetched.
    """
    # Import the parser submodule explicitly. With older python-dateutil
    # releases a bare `import dateutil` does not load `dateutil.parser`, so
    # `dateutil.parser.parse` only works when some other import has already
    # pulled the submodule in.
    from dateutil import parser as date_parser

    current_date = date_parser.parse(start_date)
    last_date = date_parser.parse(end_date)
    one_day = datetime.timedelta(days=1)

    while current_date <= last_date:
        fetch_and_store(datetime_day=current_date)
        current_date += one_day

In [3]:
# Fetch and store every day from 2018-12-01 through 2018-12-31 (both ends inclusive).
fetch_and_store_range(start_date='2018-12-01', end_date='2018-12-31')

Fetched 129609 events for 2018-12-1
Fetched 104158 events for 2018-12-2
Fetched 170433 events for 2018-12-3
Fetched 195093 events for 2018-12-4
Fetched 195348 events for 2018-12-5
Fetched 195534 events for 2018-12-6
Fetched 180675 events for 2018-12-7
Fetched 108704 events for 2018-12-8
Fetched 93221 events for 2018-12-9
Fetched 156294 events for 2018-12-10
Fetched 178492 events for 2018-12-11
Fetched 187741 events for 2018-12-12
Fetched 178775 events for 2018-12-13
Fetched 178755 events for 2018-12-14
Fetched 109223 events for 2018-12-15
Fetched 93859 events for 2018-12-16
Fetched 152293 events for 2018-12-17
Fetched 174002 events for 2018-12-18
Fetched 180182 events for 2018-12-19
Fetched 183135 events for 2018-12-20
Fetched 173225 events for 2018-12-21
Fetched 103245 events for 2018-12-22
Fetched 86031 events for 2018-12-23
Fetched 108406 events for 2018-12-24
Fetched 79953 events for 2018-12-25
Fetched 104000 events for 2018-12-26
Fetched 130108 events for 2018-12-27
Fetched 134533