# Scraping the GDELT dataset

What we need is a df with:
- date
- source country of article
- country article mentions
- theme(s) of article OR article text

OR
- date
- percentage of articles from source country x mentioning country y that have theme z
    - could be smoothed over time, but not backwards in time

In [2]:
import pandas as pd
import urllib.parse
import urllib.error
import pickle
import time
import sys
from IPython.display import clear_output

# Directory holding the auxiliary lookup files (themes, countries, queries)
data_files_path = "../../data/auxiliary_data/"
# Directory the scraped GDELT output is written to
gdelt_path = "../../data/gdelt/"

# The auxiliary files all live under data_files_path. The original cell
# interpolated an undefined `data_path` here, which raised a NameError.
countries_path = f'{data_files_path}countries_dict.pickle'
themes_path = f'{data_files_path}themes_list.pickle'
countries_capitals_path = f'{data_files_path}countries_capitals.csv'
countries_queries_path = f'{data_files_path}country_queries.csv'

Then we import the auxiliary datasets

In [3]:
# Load the pickled list of themes.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open(themes_path, mode='rb') as f:
    themes = pickle.load(f)

# Countries/capitals table, turned into a {FIPS: Country} lookup
countries_capitals = pd.read_csv(countries_capitals_path)
countries = (
    countries_capitals
    .set_index('FIPS')
    .loc[:, 'Country']
    .to_dict()
)

# Per-country query strings, turned into a {FIPS: Query} lookup
countries_queries = pd.read_csv(countries_queries_path)
query_dict = (
    countries_queries
    .set_index('FIPS')
    .loc[:, 'Query']
    .to_dict()
)

# Functions to scrape

In [18]:
def querybuilder(dict):
    """Build a GDELT DOC API request URL from a mapping of parameters.

    A ``"theme"`` entry is not sent as its own parameter: it is appended to
    the ``"query"`` value as `` theme:<value>`` (the space is percent-encoded).
    This happens wherever ``"theme"`` sits in the mapping, not only when it
    directly follows another key.

    Parameters
    ----------
    dict : mapping
        Parameter names and values. Values are converted with ``str`` and
        percent-encoded one by one, so ``&``, ``=`` or ``?`` inside a value
        can no longer break the query string. The mapping is not modified.
        (The name shadows the builtin; it is kept so existing callers work.)

    Returns
    -------
    str
        The complete request URL.
    """
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc?"

    # Work on a copy so the caller's mapping keeps its "theme" entry.
    params = {key: value for key, value in dict.items()}

    theme = params.pop("theme", None)
    if theme is not None:
        # Assigning to an existing "query" key keeps its original position.
        params["query"] = f"{params.get('query', '')} theme:{theme}"

    def _encode(part):
        # Keep the characters GDELT query syntax uses literally: ( ) : /
        return urllib.parse.quote(str(part), safe="():/")

    return base_url + "&".join(
        f"{_encode(key)}={_encode(value)}" for key, value in params.items()
    )

def get_gdelt_data(theme, country, start_date, end_date, verbose=0):
    """Fetch one TimelineSourceCountry CSV from GDELT for a country query.

    Parameters
    ----------
    theme : str
        Theme to filter on; the special value "ALL" sends no theme filter.
    country : key of ``query_dict``
        Selects the query string sent to the API.
    start_date, end_date : str
        Sent as ``startdatetime`` / ``enddatetime``.
    verbose : int
        At 2 or above, prints the request URL (with "csv" replaced by
        "html") and prints "passed" when the response is empty.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV, or None when the response contained no data.
    """
    # Insertion order is preserved: query, then (optionally) theme, then the rest.
    params = {"query": query_dict[country]}
    if theme != "ALL":
        params["theme"] = theme
    params.update({
        "mode": "TimelineSourceCountry",
        "startdatetime": start_date,
        "enddatetime": end_date,
        "format": "csv",
        "timezoom": "yes",
    })

    request_url = querybuilder(params)
    chatty = verbose >= 2

    if chatty:
        print(request_url.replace("csv", "html"))

    try:
        return pd.read_csv(request_url)
    except pd.errors.EmptyDataError:
        # An empty response means there is nothing to report for this pair.
        if chatty:
            print("passed")
        return None

def format_seconds(seconds):
    """Format a duration in seconds as a human-readable string.

    Durations that display as less than a minute keep two decimals
    ("12.5 seconds"). Longer durations are rounded to whole seconds *before*
    being split into hours/minutes/seconds, so the seconds field can never
    show 60 (the original produced e.g. "1 minutes, 60 seconds" for 119.6).

    Parameters
    ----------
    seconds : int or float
        Duration in seconds.

    Returns
    -------
    str
        "<h> hours, <m> minutes, <s> seconds", "<m> minutes, <s> seconds"
        or "<s> seconds", depending on the magnitude.
    """
    short = round(seconds, 2)
    if short < 60:
        return f"{short} seconds"

    # Round once, then split, so carries propagate into minutes and hours.
    total = int(round(seconds))
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours} hours, {minutes} minutes, {secs} seconds"
    return f"{minutes} minutes, {secs} seconds"

def scrape_gdelt(themes, countries, start_date, end_date, save_int=None, all=False, verbose=0):
    """Query GDELT for every (theme, country) pair, pausing between requests.

    Parameters
    ----------
    themes : list of str
        Themes to query. The list is copied; the caller's list is not modified.
    countries : sized iterable
        Keys of ``query_dict``; each is passed to ``get_gdelt_data``.
    start_date, end_date : str
        Passed through to ``get_gdelt_data``.
    save_int : int or None, default None
        If given, the collected results are written to
        ``{gdelt_path}scraped_all/gdelt_data_{first}_to_{last}.csv`` every
        ``save_int`` queries (plus one file for a trailing partial chunk) and
        the function returns None. If None (or 0), nothing is written and the
        combined result is returned instead.
    all : bool, default False
        Also query the pseudo-theme "ALL" (no theme filter).
    verbose : int, default 0
        1 or above prints the total number of queries; 2 or above also prints
        one line per query.

    Returns
    -------
    pandas.DataFrame or None
        With ``save_int`` unset: all non-empty responses concatenated, with
        ``theme`` and ``country`` columns added, or None if every response was
        empty. With ``save_int`` set: always None.
    """
    # Copy so that appending "ALL" does not leak into the caller's list.
    themes = list(themes)
    if all:
        themes.append("ALL")

    min_interval = 5  # minimum seconds between two API calls (rate limit)
    total_items = len(themes) * len(countries)
    count = 0
    chunk_start = 0  # index of the first query held in df_list
    saved = 0
    passed_total = 0  # queries that returned no data
    api_call_times = []
    df_list = []

    if verbose >= 1:
        print(f"Total queries: {total_items}")
    start_time = time.time()

    for theme in themes:
        for country in countries:
            if verbose >= 2:
                print(f"Scraping {theme} in {country}")

            last_api_call_time = time.time()
            df = get_gdelt_data(theme, country, start_date, end_date, verbose=verbose)
            api_call_times.append(time.time() - last_api_call_time)

            if df is not None:
                df['theme'] = theme
                df['country'] = country
                df_list.append(df)
            else:
                passed_total += 1

            count += 1

            if save_int and count % save_int == 0:
                # Skip the write when the whole chunk was empty: pd.concat([]) raises.
                if df_list:
                    pd.concat(df_list).to_csv(
                        f'{gdelt_path}scraped_all/gdelt_data_{chunk_start}_to_{count}.csv'
                    )
                    saved += 1
                df_list = []
                chunk_start = count

            # Guard against a zero elapsed time on coarse clocks.
            elapsed_time = max(time.time() - start_time, 1e-9)
            items_per_second = count / elapsed_time
            seconds_left = (total_items - count) / items_per_second
            avg_api_time = sum(api_call_times) / len(api_call_times)

            # Trailing padding overwrites leftovers of a longer previous line.
            progress_str = (
                f"Processed {count}/{total_items} queries. {round(items_per_second, 2)} Query/s. "
                f"Average api time: {round(avg_api_time, 2)}s. Passed: {passed_total}. "
                f"On theme {theme} for {country}" + " " * 55 +
                f"\nElapsed time: {elapsed_time:.2f} seconds. "
                f"Estimated time left: {format_seconds(seconds_left)}. Saved: {saved}"
            )
            sys.stdout.write('\x1b[A\r' + progress_str)
            sys.stdout.flush()

            # Avoid hitting the API rate limit. The wait is computed once so it
            # cannot turn negative between the check and the sleep.
            remaining = min_interval - (time.time() - last_api_call_time)
            if remaining > 0:
                time.sleep(remaining)

    if save_int:
        # Flush the trailing partial chunk, named after the queries it really holds.
        if df_list:
            pd.concat(df_list).to_csv(
                f'{gdelt_path}scraped_all/gdelt_data_{chunk_start}_to_{count}.csv'
            )
            saved += 1
        return None

    return pd.concat(df_list) if df_list else None

# Testing methods

Test for a few countries and a few themes, to see how long it takes per query:

In [42]:
# Every FIPS key of the countries lookup, as strings
countries_subset = list(map(str, countries))
# No explicit themes: with all=True the scraper appends the "ALL" pseudo-theme
themes_subset = []

# Presumably YYYYMMDDHHMMSS timestamps - confirm against the GDELT API docs
start_date_test, end_date_test = "20170101010000", "20240301010000"

# NOTE(review): this call passes no save_int and the next cells treat the
# result as a DataFrame - confirm scrape_gdelt's signature and return value
# support that.
scarped_test = scrape_gdelt(
    themes_subset,
    countries_subset,
    start_date_test,
    end_date_test,
    all=True,
    verbose=1,
)

Processed 27/27 queries. 0.66 Query/s. Passed: 15. On theme ALL                                                                            
Elapsed time: 40.73 seconds. Estimated time left: 0.0 seconds.

We find that it takes approximately 1 second per query. Our final scraping operation will have `236` countries times `4254` themes, which is `1003944` queries. At 1 second per query this will take approximately `280` hours. Hence we will have to either limit the number of themes or perform the scraping in chunks.

Furthermore, we find that many themes are not present in the dataset. We will have to filter out these themes. For this we will run a scrape operation across all themes with the `TimelineVol` option, to quickly determine which themes actually contain information. This will also help bring down the time it takes to scrape the entire dataset.

## Pivoting the table:

Nevertheless, we can transform the data we just retrieved into a pivot table to get it into the final format that we need.

We rename the columns and clean up the table:

In [43]:
# Give the five columns readable names
scarped_test.columns = ["Date", "Source country", "Intensity", "Theme", "Target country"]

# Strip the " Volume Intensity" suffix from the source country, translate the
# target country through the countries dictionary, then order the rows
scarped_test = (
    scarped_test
    .assign(**{
        "Source country": lambda frame: frame["Source country"].str.replace(" Volume Intensity", ""),
        "Target country": lambda frame: frame["Target country"].map(countries),
    })
    .sort_values(by=["Date", "Source country", "Target country", "Theme"])
)

# Drop rows whose source country is an empty string
scarped_clean = scarped_test.loc[scarped_test["Source country"] != ""]

scarped_clean.head()

Unnamed: 0,Date,Source country,Intensity,Theme,Target country
6077,2017-01-02,Afghanistan,0.6557,ALL,Austria
4897,2017-01-02,Afghanistan,0.3279,CRISISLEX_CRISISLEXREC,Austria
590,2017-01-02,Afghanistan,0.0,LEADER,Austria
1180,2017-01-02,Afghanistan,0.0,USPEC_POLITICS_GENERAL1,Austria
649,2017-01-02,Afghanistan,0.0,ALL,Azerbaijan


Then we pivot the table so that the themes move to the columns:

In [44]:
# One row per (Date, Source country, Target country), one column per theme;
# afterwards only "Date" is kept as the index
scarped_test_pivot = (
    scarped_clean
    .pivot_table(
        index=["Date", "Source country", "Target country"],
        columns=["Theme"],
        values="Intensity",
    )
    .reset_index()
    .set_index("Date")
)

scarped_test_pivot.head()

Theme,Source country,Target country,ALL,CRISISLEX_CRISISLEXREC,LEADER,USPEC_POLITICS_GENERAL1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-02,Afghanistan,Austria,0.6557,0.3279,0.0,0.0
2017-01-02,Afghanistan,Azerbaijan,0.0,0.0,0.0,0.0
2017-01-02,Afghanistan,The Bahamas,0.0,0.0,,
2017-01-02,Albania,Austria,1.4085,0.7042,0.3521,0.7042
2017-01-02,Albania,Azerbaijan,0.0,0.0,0.0,0.0


Using this method we can create a dataframe that contains the volume of coverage in country x talking about country y, and that have theme z. This is the data that we want.

First we run the scrape operation to get the themes that are actually present in the dataset.

## Filtering out empty themes

To filter out the empty themes, we have to customize the scrape function to scrape the `TimelineVol` option without a query specification. To reduce time we remove all items in the function that we are not interested in.

In [4]:
def check_gdelt_theme(theme, start_date, end_date, verbose=0):
    """Check whether a theme returns any TimelineVol data from GDELT.

    Parameters
    ----------
    theme : str
        Theme to look up; sent with an empty query.
    start_date, end_date : str
        Sent as ``startdatetime`` / ``enddatetime``.
    verbose : int
        Accepted for signature compatibility; not used.

    Returns
    -------
    tuple
        Always a 2-tuple, so callers can unpack it safely:
        ``(True, url)`` when data came back, ``(False, url)`` when the
        response was empty, ``("TIME ERROR", url)`` on HTTP 429, and
        ``("ERROR", exception)`` for any other failure. Other HTTP status
        codes are included in the last case; the original fell through
        there and returned None.
    """
    params = {
        "query": "",
        "theme": theme,
        "mode": "TimelineVol",
        "timelinesmooth": 0,
        "startdatetime": start_date,
        "enddatetime": end_date,
        "format": "csv",
        "timezoom": "yes",
    }
    # Turn any remaining "theme=" parameter into the "theme:" query operator.
    url = querybuilder(params).replace("theme=", "theme:")

    try:
        # Only success or failure of the parse matters; the frame is discarded.
        pd.read_csv(url)
    except pd.errors.EmptyDataError:
        return False, url
    except urllib.error.HTTPError as e:
        if e.code == 429:
            return "TIME ERROR", url
        return "ERROR", e
    except Exception as e:
        return "ERROR", e
    return True, url

def _save_recovery_log(passed_logs, index):
    """Write the log gathered so far to scraped/passed_logs_crashed_recovered_{index}.csv."""
    import os  # local import: only this helper needs it

    # Create the folder first so a missing directory cannot abort the run.
    os.makedirs("scraped", exist_ok=True)
    passed_logs_df = pd.DataFrame(passed_logs, columns=["Theme", "Passed", "URL"])
    passed_logs_df.to_csv(f"scraped/passed_logs_crashed_recovered_{index}.csv")


def verify_themes_gdelt(themes, start_date, end_date, verbose=0):
    """Check each theme against GDELT and log whether it returned data.

    Parameters
    ----------
    themes : sized iterable of str
        Themes to check, one request each.
    start_date, end_date : str
        Passed through to ``check_gdelt_theme``.
    verbose : int
        1 or above prints the total number of queries.

    Returns
    -------
    list of tuple
        One ``(theme, passed, url)`` entry per theme. ``passed`` is True only
        when the theme returned no data. ``url`` is the request URL with
        "csv" replaced by "html", or the error text when the check failed.
        Themes that errored or stayed rate-limited are logged with
        ``passed=False``.

    Notes
    -----
    On HTTP 429 the log so far is saved, the function waits, lengthens the
    pause between requests by one second and retries the same theme, up to
    ``max_retries`` times.
    """
    total_items = len(themes)
    count = 0
    passed_total = 0
    passed_logs = []
    crashed = 0

    time_out = 5     # pause before every request, in seconds; grows after each 429
    backoff = 60     # extra pause after a 429, in seconds
    max_retries = 3  # retries per theme after a 429

    if verbose >= 1:
        print(f"Total queries: {total_items}")

    start_time = time.time()

    for theme in themes:
        passed = False
        retries = 0

        while True:
            time.sleep(time_out)
            result = check_gdelt_theme(theme, start_date, end_date, verbose=verbose)
            # Tolerate a check that returns nothing instead of a (status, detail) pair.
            if result is None:
                value, url = "ERROR", "check_gdelt_theme returned no result"
            else:
                value, url = result

            if value != "TIME ERROR":
                break

            print(f"TOO MANY REQUESTS after {count} queries. Sleeping {backoff} seconds")
            _save_recovery_log(passed_logs, crashed + 1)
            time.sleep(backoff)
            time_out += 1
            crashed += 1

            if retries >= max_retries:
                break  # give up on this theme; it is logged with passed=False
            retries += 1

        if value == "ERROR":
            print(f"SOME ERROR, stopped at {theme}. Error message\n {url}")
            _save_recovery_log(passed_logs, crashed + 1)
            crashed += 1
        elif not value:
            passed_total += 1
            passed = True

        # On errors `url` holds an exception, so convert to str before replacing.
        passed_logs.append((theme, passed, str(url).replace("csv", "html")))

        # calculate time left
        count += 1
        elapsed_time = time.time() - start_time
        items_per_second = count / elapsed_time
        seconds_left = (total_items - count) / items_per_second

        # Trailing padding overwrites leftovers of a longer previous line.
        progress_str = (
            f"Processed {count}/{total_items} queries. {round(items_per_second, 2)} Query/s. "
            f"Passed: {passed_total}." + " " * 48 +
            f"\nTimeout: {time_out} seconds. Elapsed time: {elapsed_time:.2f} seconds. "
            f"Estimated time left: {format_seconds(seconds_left)}."
        )
        sys.stdout.write('\x1b[A\r' + progress_str)
        sys.stdout.flush()

    return passed_logs

Now we go through all themes and check if they are present in the GDELT 2.0 data. I put a 5 second timeout between each query to avoid hitting the rate limit of the API. Since we want to be able to interrupt the process without losing too much data, we will save the data to a file after each batch of themes (`batch_size`, set to 60 below).

In [1]:
# Time window for the verification queries (same values as the test cell above)
start_date = "20170101010000"
end_date = "20240301010000"

# NOTE(review): offset is hard-coded to 1311, presumably the resume point of an
# interrupted run - set it to 0 to verify the themes from the beginning.
offset = 1311
# Number of themes verified, and written to one log file, per loop iteration
batch_size = 60

while offset < len(themes):
    print(f"Scraping themes {offset} to {offset + batch_size}\n")
    themes_subset = themes[offset:offset + batch_size]

    passed_logs = verify_themes_gdelt(themes_subset, start_date, end_date, verbose=2)

    # Persist this batch's log. The file name uses offset+batch_size even when
    # the final slice holds fewer themes.
    passed_logs_df = pd.DataFrame(passed_logs, columns=["Theme", "Passed", "URL"])
    passed_logs_df.to_csv(f"{gdelt_path}scraped_all/passed_logs_{offset}_to_{offset+batch_size}.csv")

    offset += batch_size
    # Clear the cell output before the next batch prints its progress
    clear_output(wait=True)

In [35]:
# Tabulate the log of the most recent batch, one row per theme
passed_logs_df = pd.DataFrame(
    data=passed_logs,
    columns=["Theme", "Passed", "URL"],
)
passed_logs_df.head()

Unnamed: 0,Theme,Passed,URL
0,TAX_FNCACT,True,https://api.gdeltproject.org/api/v2/doc/doc?qu...
1,TAX_ETHNICITY,True,https://api.gdeltproject.org/api/v2/doc/doc?qu...
2,EPU_POLICY,True,https://api.gdeltproject.org/api/v2/doc/doc?qu...
3,CRISISLEX_CRISISLEXREC,False,https://api.gdeltproject.org/api/v2/doc/doc?qu...
4,TAX_WORLDLANGUAGES,True,https://api.gdeltproject.org/api/v2/doc/doc?qu...



Now we are ready to perform the scraping operation on all countries and all themes.

# Scraping all data [DEMO]

First we automate the functions we wrote above to clean and pivot the table:

In [22]:
def clean_df(df):
    """Return a cleaned, sorted copy of a scraped GDELT frame.

    The input is left untouched. The original renamed and remapped columns
    on the caller's frame, so calling it twice on the same frame mapped
    already-translated country names to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly five columns, in the order date, source series,
        intensity, theme, target country key.

    Returns
    -------
    pandas.DataFrame
        Copy with the columns renamed to "Date", "Source country",
        "Intensity", "Theme" and "Target country"; the " Volume Intensity"
        suffix removed from the source country; the target country
        translated through the ``countries`` dictionary; rows sorted; and
        rows with an empty source country dropped.
    """
    cleaned = df.copy()

    # rename columns
    cleaned.columns = ["Date", "Source country", "Intensity", "Theme", "Target country"]

    # clean up source country column (plain substring, not a regular expression)
    cleaned["Source country"] = cleaned["Source country"].str.replace(
        " Volume Intensity", "", regex=False
    )

    # map the key in the target country column to a country name
    cleaned["Target country"] = cleaned["Target country"].map(countries)

    # sort on date, source country, target country and theme
    cleaned = cleaned.sort_values(by=["Date", "Source country", "Target country", "Theme"])

    # remove rows with a missing (empty) source country
    return cleaned[cleaned["Source country"] != ""]

def pivot_df(df):
    """Spread the themes of a cleaned frame into columns.

    Each (Date, Source country, Target country) combination becomes one row,
    each theme becomes a column holding the "Intensity" values, and "Date"
    ends up as the index of the returned frame.
    """
    row_keys = ["Date", "Source country", "Target country"]
    wide = df.pivot_table(index=row_keys, columns=["Theme"], values="Intensity")

    # Flatten the row keys back into columns, then keep only "Date" as index.
    return wide.reset_index().set_index("Date")

Then we make a function to scrape the data in chunks, and save the chunks in intermediate files. We will make the chunks using the themes, as it is easier to prevent overlap in that way.

In [28]:
# Parameters for the full run: every key of the countries lookup (as strings)
# against the complete theme list
countries_final_set = list(map(str, countries))
themes_final_set = themes

# Same time window as the earlier cells
start_date, end_date = "20170101010000", "20240301010000"

# Number of themes scraped per chunk
chunk_size = 10

In [31]:
# Index of the first theme in the current chunk
offset = 0
# Slice relative to offset, so the themes scraped match the
# themes_{offset}_to_{offset+chunk_size}.csv file name the chunk is saved under.
# The original always took the first chunk_size themes, whatever offset was.
chunk = themes_final_set[offset:offset + chunk_size]

In [4]:
# NOTE(review): called without save_int, and the next cell treats the result
# as a DataFrame - confirm scrape_gdelt's signature and return value support that.
scraped_chunk = scrape_gdelt(
    chunk,
    countries_final_set,
    start_date,
    end_date,
    verbose=1,
)

In [23]:
cl_df = clean_df(scraped_chunk)
# Keep the result under its own name. Assigning it to `pivot_df` replaced the
# pivot_df function with a DataFrame, so re-running this cell failed with
# "'DataFrame' object is not callable".
pivoted_chunk = pivot_df(cl_df)
pivoted_chunk.to_csv(f"{gdelt_path}scraped_all/themes_{offset}_to_{offset+chunk_size}.csv")

Theme,Source country,Target country,ALL,CRISISLEX_CRISISLEXREC,LEADER,USPEC_POLITICS_GENERAL1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-02,Afghanistan,Austria,0.6557,0.3279,0.0,0.0
2017-01-02,Afghanistan,Azerbaijan,0.0,0.0,0.0,0.0
2017-01-02,Afghanistan,The Bahamas,0.0,0.0,,
2017-01-02,Albania,Austria,1.4085,0.7042,0.3521,0.7042
2017-01-02,Albania,Azerbaijan,0.0,0.0,0.0,0.0


# Filtering out empty themes

In [35]:
import glob

# Collect only the theme-verification logs. The scraped_all folder also
# receives the scraped data CSVs, which have different columns and must not
# be mixed in. Sorting makes the read order deterministic.
csv_files = sorted(glob.glob(f'{gdelt_path}scraped_all/passed_logs_*.csv'))
if not csv_files:
    raise FileNotFoundError(f"No passed_logs_*.csv files found in {gdelt_path}scraped_all/")

# Read every log and stack them into one frame
dfs = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

# make into a dictionary with theme as key and passed as value
passed_dict = dict(zip(combined_df["Theme"], combined_df["Passed"]))

# Keep the verified themes whose check returned data (Passed is False).
# Themes without a log entry are skipped instead of raising a KeyError; the
# original assumed the logs covered exactly the first len(passed_dict) themes.
themes_final_set = [
    theme for theme in themes
    if theme in passed_dict and not passed_dict[theme]
]

# Save themes_final_set next to the logs. The original path was misspelled
# "scarped_all", a folder nothing else in the notebook uses.
with open(f'{gdelt_path}scraped_all/themes_final_set.pickle', 'wb') as f:
    pickle.dump(themes_final_set, f)