# Read in country to country coverage intensity for all themes combined

### 1. Importing libraries, functions and data

In [2]:
import pandas as pd
import urllib.parse
import urllib.error
import pickle
import time
import sys
import glob
from IPython.display import clear_output

# Relative paths used throughout this notebook.
# CSV that is loaded below to build the FIPS -> 'Country' lookup.
countries_capitals_path = '../../data/auxilary_data/countries_capitals.csv'
# Root folder for the GDELT data. The trailing slash matters: file names are
# appended to this string directly inside f-strings.
data_folder_path = '../../data/GDELT/'

In [4]:
# Load the country table; the lookup below relies on its 'FIPS' and 'Country' columns.
countries_capitals = pd.read_csv(countries_capitals_path)

# make a countries dictionary with FIPS as key
# (each FIPS code maps to the matching value of the 'Country' column)
countries = countries_capitals.set_index('FIPS')['Country'].to_dict()

We reuse the functions from our scraping_gdelt notebook by redefining them here:

In [5]:
def querybuilder(dict):
    """Build a percent-encoded GDELT DOC API url from a parameter mapping.

    Args:
        dict: Mapping of query-string parameter names to values. The name
            shadows the builtin but is kept so existing callers still work.

    Returns:
        The request url as a string. Every "&theme=" separator is rewritten
        to "%20theme:", so a theme parameter ends up appended to the value
        of the parameter that precedes it instead of being a parameter of
        its own.
    """
    pairs = []
    for key, value in dict.items():
        pairs.append(f"{key}={value}")
    raw_url = "https://api.gdeltproject.org/api/v2/doc/doc?" + "&".join(pairs)

    # Keep the url structure characters intact and escape everything else.
    encoded = urllib.parse.quote(raw_url, safe='():/?&=')
    return encoded.replace("&theme=", "%20theme:")

def get_gdelt_data(theme, country, start_date, end_date, verbose=0):
    """Fetch one source-country timeline from the GDELT DOC API.

    Args:
        theme: Theme to restrict the query to, or "ALL" for no theme filter.
        country: Key into the module-level ``query_dict`` (not defined in
            this section of the notebook; it must exist before calling).
        start_date: Value sent as the ``startdatetime`` parameter.
        end_date: Value sent as the ``enddatetime`` parameter.
        verbose: At 2 or higher, print the request url (with "csv" replaced
            by "html") and a note when the response is empty.

    Returns:
        The response parsed by ``pd.read_csv``, or None when pandas reports
        an empty response.
    """
    # "theme" must directly follow "query": querybuilder rewrites the
    # "&theme=" separator so the theme is appended to the preceding value.
    params = {"query": query_dict[country]}
    if theme != "ALL":
        params["theme"] = theme
    params.update({
        "mode": "TimelineSourceCountry",
        "startdatetime": start_date,
        "enddatetime": end_date,
        "format": "csv",
        "timezoom": "yes",
    })

    url = querybuilder(params)
    chatty = verbose >= 2
    if chatty:
        print(url.replace("csv", "html"))

    try:
        return pd.read_csv(url)
    except pd.errors.EmptyDataError:
        if chatty:
            print("passed")
        return None

def format_seconds(seconds):
    """Format a duration given in seconds as a human-readable string.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        "<s> seconds" (rounded to 2 decimals) below one minute,
        "<m> minutes, <s> seconds" below one hour, otherwise
        "<h> hours, <m> minutes, <s> seconds".
    """
    if seconds < 60:
        return f"{round(seconds, 2)} seconds"

    # Round the total once, up front. Rounding only the leftover seconds
    # could produce "60 seconds" (e.g. 119.6 -> "1 minutes, 60 seconds").
    total = round(seconds)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours > 0:
        return f"{hours} hours, {minutes} minutes, {secs} seconds"
    return f"{minutes} minutes, {secs} seconds"

def _save_batch(df_list, first, last):
    """Write the collected frames to one csv; return True if a file was written."""
    if not df_list:
        # pd.concat raises ValueError on an empty list, so skip empty batches.
        return False
    batch = pd.concat(df_list)
    batch.to_csv(f'{data_folder_path}scraped_all/gdelt_data_{first}_to_{last}.csv')
    return True


def scrape_gdelt(themes, countries, start_date, end_date, save_int, all=False, verbose=0):
    """Query GDELT for every theme/country pair and save the results in batches.

    Args:
        themes: Themes to query. The list is copied, not modified.
        countries: Sized iterable of country keys passed to get_gdelt_data.
        start_date: Start of the query window, forwarded to get_gdelt_data.
        end_date: End of the query window, forwarded to get_gdelt_data.
        save_int: Number of queries after which the collected results are
            written to a csv file under ``{data_folder_path}scraped_all/``.
        all: If True, additionally query the special theme "ALL"
            (no theme filter). Name kept for backward compatibility.
        verbose: 1 prints the total number of queries; 2 also prints each
            theme/country pair and is forwarded to get_gdelt_data.

    Returns:
        None. Results are written to disk, progress is written to stdout,
        and consecutive API calls are spaced at least 5 seconds apart.
    """
    # Work on a copy so the caller's list is not extended with "ALL".
    themes = list(themes)
    if all:
        themes.append("ALL")

    # track time left
    total_items = len(themes) * len(countries)
    count = 0
    api_call_times = []
    if verbose >= 1:
        print(f"Total queries: {total_items}")
    # set current time
    start_time = time.time()

    saved = 0
    df_list = []

    for theme in themes:
        for country in countries:
            if verbose >= 2:
                print(f"Scraping {theme} in {country}")

            # set time when api was called last
            last_api_call_time = time.time()

            df = get_gdelt_data(theme, country, start_date, end_date, verbose=verbose)

            api_call_times.append(time.time() - last_api_call_time)

            # get_gdelt_data returns None for an empty response; skip those.
            if df is not None:
                df['theme'] = theme
                df['country'] = country
                df_list.append(df)

            count += 1

            if count % save_int == 0:
                if _save_batch(df_list, count - save_int, count):
                    saved += 1
                df_list = []

            elapsed_time = time.time() - start_time
            items_per_second = count / elapsed_time
            seconds_left = (total_items - count) / items_per_second

            progress_str = f"Processed {count}/{total_items} queries. {round(items_per_second, 2)} Query/s. Average api time: {round(sum(api_call_times)/len(api_call_times),2)}s. On theme {theme} for {country}                           \
                            \nElapsed time: {elapsed_time:.2f} seconds. Estimated time left: {format_seconds(seconds_left)}. Saved: {saved}"
            sys.stdout.write('\x1b[A\r' + progress_str)
            sys.stdout.flush()

            # avoid hitting API rate limit; compute the wait once so the
            # value passed to sleep can never have turned negative.
            remaining = 5 - (time.time() - last_api_call_time)
            if remaining > 0:
                time.sleep(remaining)

    # Flush the trailing partial batch. Its range starts at the last multiple
    # of save_int; nothing is written when no frames are left over.
    _save_batch(df_list, count - count % save_int, count)

### 2. Scraping the data

First we set up the parameters for our scraping operation.

In [6]:
# Query window, forwarded to the API as startdatetime / enddatetime
# (presumably YYYYMMDDHHMMSS - confirm against the GDELT documentation).
start_date = "20170101010000"
end_date = "20240301010000"

# No individual themes are requested in this notebook.
themes_all = []
# Every key of the FIPS lookup, converted to a string.
countries_all = list(map(str, countries))

We then call the scraping function, which saves a CSV file every 20 queries:

In [None]:
# all=True adds the combined "ALL" theme to the (empty) theme list, so one
# query is issued per country; verbose=1 prints the total number of queries.
scrape_gdelt(themes_all, countries_all, start_date, end_date, save_int=20, all=True, verbose=1)

### 3. Read the saved csv files back in

Since the data is too large for pandas (and our kernel) to handle at once, we have to read in and save the csv files in 2 batches. We first make a function to wrangle the data into the desired format:

In [14]:
def wrangle_batch(batch):
    """Reshape one scraped csv into a wide table with one column per source country.

    Args:
        batch: DataFrame read from a scraped csv. After the optional
            "Unnamed: 0" column is removed it must have exactly five
            columns, which are renamed positionally to Date, Source country,
            Intensity, Theme and Target country. The input is not modified.

    Returns:
        A DataFrame with the columns "Date" and "Target country" followed by
        one column per source country holding the "Intensity" values
        (pivot_table's default aggregation applies to duplicates).
    """
    # errors="ignore" covers files without the saved index column. drop()
    # returns a new frame, so the caller's DataFrame is left untouched.
    batch = batch.drop(columns=["Unnamed: 0"], errors="ignore")
    batch.columns = ["Date", "Source country", "Intensity", "Theme", "Target country"]
    # Plain substring removal; regex=False makes that explicit.
    batch["Source country"] = batch["Source country"].str.replace(" Volume Intensity", "", regex=False)
    # Translate the target-country key into the value stored in the module-level lookup.
    batch["Target country"] = batch["Target country"].map(countries)
    return batch.pivot_table(
        index=["Date", "Target country"],
        columns=["Source country"],
        values="Intensity",
    ).reset_index()

And then read in all the csv files in the folder:

In [4]:
# Collect only the per-batch files written by scrape_gdelt. The pattern skips
# any combined_*.csv left in the same folder by an earlier run of this cell,
# and sorting makes the split into two batches reproducible.
csv_files = sorted(glob.glob(f'{data_folder_path}scraped_all/gdelt_data_*.csv'))

# Number of files in the first batch; all remaining files form the second.
split_at = 6
batches = {
    "combined_1.csv": csv_files[:split_at],
    "combined_2.csv": csv_files[split_at:],
}

# Each batch is wrangled and written to its own file, so the two halves never
# have to be held in memory at the same time.
for out_name, batch_files in batches.items():
    dataframes = []
    for i, file in enumerate(batch_files):
        print(f"Opening df {i} from {file}...", end="\r")
        df = pd.read_csv(file)
        batch = wrangle_batch(df)
        dataframes.append(batch)

    data = pd.concat(dataframes, ignore_index=True)
    data.to_csv(f"{data_folder_path}/scraped_all/{out_name}", index=False)

Opening df 5 from scraped_all/gdelt_data_80_to_100.csv....

Finally, to combine the two, we read them in again and save them to one combined csv file. We then delete the old csv files to save space (hence they are no longer in the repository).

In [8]:
# Read the two intermediate batch files back in.
part_1 = pd.read_csv(f"{data_folder_path}/scraped_all/combined_1.csv")
part_2 = pd.read_csv(f"{data_folder_path}/scraped_all/combined_2.csv")

# Stack them row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
combined = pd.concat([part_1, part_2], ignore_index=True)

In [11]:
# Persist the combined table; index=False keeps the row index out of the file.
combined.to_csv(f"{data_folder_path}/saved_data/country_to_country_all.csv", index=False)

### 4. Plot relationships between countries

In [7]:
# Load the combined country-to-country table from disk.
combined = pd.read_csv(f"{data_folder_path}/saved_data/country_to_country_all.csv")

In [23]:
# Drop the columns that should not take part in the per-country averaging below.
# NOTE(review): the origin of "Unnamed: 2" is not visible in this notebook;
# presumably a stray column in the saved csv - confirm.
combined_f = combined.drop(columns = ["Date", "Unnamed: 2"])

In [35]:
# Average every remaining column per target country (i.e. over all rows of
# that country), with the result ordered by the "Target country" index.
combined_f = combined_f.groupby(["Target country"]).mean().sort_values(by = "Target country")
# Show the first rows as a quick sanity check.
combined_f.head()

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
Target country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,61.153794,0.722672,0.407648,0.23393,0.123163,1.084218,0.690159,0.186505,3.598944,0.778591,...,1.125398,0.818044,0.913777,0.240189,4.074993,0.431421,0.033245,0.58967,0.06032,0.812661
Albania,0.097081,29.539962,0.046415,0.041323,0.016827,0.186553,0.033172,0.086125,0.395863,0.060133,...,0.098717,0.104755,0.070581,0.027824,0.053563,0.037084,0.002799,0.023954,0.019373,0.039245
Algeria,0.260947,0.084421,39.857927,0.356128,0.073552,0.08239,0.042973,0.097514,0.22864,0.656803,...,0.693201,0.115633,0.097019,0.135504,0.097806,0.259118,0.021097,0.518425,0.622939,0.670658
American Samoa,0.002766,,0.001862,0.001263,0.000168,0.000647,0.010499,0.002519,0.000508,0.000562,...,0.003072,0.007657,0.022266,,,0.000184,0.000111,0.00085,,0.003364
Andorra,0.011695,0.033549,0.012754,0.008019,0.051678,0.036054,0.005952,0.041431,0.01654,0.007418,...,0.011919,0.023211,0.009281,0.055879,0.01152,0.104778,0.000376,0.007867,0.021544,0.005676


In [39]:
# Sum the "Albania" column over all target-country rows of the averaged table.
combined_f["Albania"].sum()

120.6178757656968