# Read in country to country coverage intensity for all themes combined

### 1. Importing libraries, functions and data

In [7]:
import pandas as pd
import urllib.parse
import urllib.error
import pickle
import time
import sys
import glob
from IPython.display import clear_output

# set relative path
countries_capitals_path = '../../data/auxilary_data/countries_capitals.csv'
data_folder_path = '../../data/GDELT/'
themes_path = '../../data/auxilary_data/themes_final_set.pickle'

In [2]:
countries_capitals = pd.read_csv(countries_capitals_path)

# make a countries dictionary with FIPS as key
countries = countries_capitals.set_index('FIPS')['Country'].to_dict()

# read in themes
with open(themes_path, 'rb') as f:
    themes_all = pickle.load(f)

We import the functions from our scraping_gdelt notebook:

In [3]:
def querybuilder(dict):
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc?"
    url = base_url + "&".join([f"{key}={value}" for key, value in dict.items()])
    url = urllib.parse.quote(url, safe='():/?&=').replace("&theme=", "%20theme:")
    return url

def get_gdelt_data(theme, country, country_query, start_date, end_date, verbose=0):
    
    if theme == "ALL":
        dict = {
            "query": country_query,
            "mode": "TimelineSourceCountry",
            "startdatetime": start_date,
            "enddatetime": end_date,
            "format": "csv",
            "timezoom" : "yes",
        }
    else:
        dict = {
            "query": country_query,
            "theme": theme,
            "mode": "TimelineSourceCountry",
            "startdatetime": start_date,
            "enddatetime": end_date,
            "format": "csv",
            "timezoom" : "yes",
        }  
        
    url = querybuilder(dict)
    
    if verbose >= 2:
        print(url.replace("csv", "html"))
    try:
        df = pd.read_csv(url)
        return df
    except pd.errors.EmptyDataError:
        if verbose >= 2:
            print("passed")
        pass

def format_seconds(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    seconds = seconds % 60
    if hours > 0:
        return f"{hours} hours, {minutes} minutes, {round(seconds)} seconds"
    elif minutes > 0:
        return f"{minutes} minutes, {round(seconds)} seconds"
    else:
        return f"{round(seconds, 2)} seconds"

def scrape_gdelt(themes, countries, country_query, start_date, end_date, save_int, all=False, verbose=0):
    df_list = []
    
    if all:
        themes.append("ALL")

    # track time left
    total_items = len(themes) * len(countries)
    count = 0
    api_call_times = []
    passed_total = 0
    if verbose >= 1:
        print(f"Total queries: {total_items}")
    # set current time
    start_time = time.time()
                
    saved = 0

    for theme in themes:
        for country in countries:
            if verbose >= 2:
                print(f"Scraping {theme} in {country}")
            
            # set time when api was called last
            last_api_call_time = time.time()

            df = get_gdelt_data(theme, country, country_query, start_date, end_date, verbose=verbose)
            
            api_call_times.append(time.time() - last_api_call_time)

            if df is not None:
                df["Date"] = pd.to_datetime(df["Date"])
                df["Series"] = df["Series"].str.replace(" Volume Intensity", "")
                df_m = df.groupby([pd.Grouper(key='Series'), pd.Grouper(key='Date', freq='M')])['Value'].mean().reset_index()
                df_m.columns = ["Source country", "Month", "Intensity"]
                df_m['Theme'] = theme
                df_m['Target country'] = country
                
                df_list.append(df_m)
            else:
                passed_total += 1

            count += 1

            if count % save_int == 0:
                df = pd.concat(df_list)
                df.to_csv(f'{data_folder_path}scraped_all/gdelt_data_{count-save_int}_to_{count}.csv')
                df_list = []
                saved += 1
            
            elapsed_time = time.time() - start_time
            items_per_second = count / elapsed_time
            seconds_left = (total_items - count) / items_per_second
            
            progress_str = f"Processed {count}/{total_items} queries. {round(items_per_second, 2)} Query/s. Average api time: {round(sum(api_call_times)/len(api_call_times),2)}s. On theme {theme} for {country}                           \
                            \nElapsed time: {elapsed_time:.2f} seconds. Estimated time left: {format_seconds(seconds_left)}. Saved: {saved}"
            sys.stdout.write('\x1b[A\r' + progress_str)
            sys.stdout.flush()
            
            # avoid hitting API rate limit
            if time.time() - last_api_call_time < 5:
                time.sleep(5 - (time.time() - last_api_call_time))
                
    df = pd.concat(df_list)
    df.to_csv(f'{data_folder_path}scraped_all/gdelt_data_{count-save_int}_to_{count}.csv')
    df_list = []
    saved += 1

### 2. Scraping the data

First we set up the parameters for our scraping operation.

In [4]:
countries_all = [str(item) for item in countries.keys()]
themes_all = []

start_date = "20170101010000"
end_date = "20240301010000"

And then we call the scraping function, which saves csv's every 20 queries

In [None]:
scrape_gdelt(themes_all, countries_all, start_date, end_date, save_int=20, all=True, verbose=1)

### 3. Read the saves csv files back in

Since the data is too large for pandas (and our kernel) to handle at once, we have to read in and save the csv in 2 batches. We first make a function to wrangle the data in the desired format:

In [4]:
def wrangle_batch(batch):
    if "Unnamed: 0" in batch.columns:
        batch.drop(columns=["Unnamed: 0"], inplace=True)
    batch.columns = ["Date", "Source country", "Intensity", "Theme", "Target country"]
    batch["Source country"] = batch["Source country"].str.replace(" Volume Intensity", "")
    batch["Target country"] = batch["Target country"].map(countries)
    batch_pivot = batch.pivot_table(index=["Date", "Target country"], columns=["Source country"], values="Intensity").reset_index()
    return batch_pivot

And then read in all the csv's in folder

In [4]:
# for csv file in scraped_all folder
csv_files = glob.glob(f'{data_folder_path}scraped_all/*.csv')

# first batch
dataframes = []
for i, file in enumerate(csv_files[6:]):
    print(f"Opening df {i} from {file}...", end="\r")
    df = pd.read_csv(file)
    batch = wrangle_batch(df)
    dataframes.append(batch)

data = pd.concat(dataframes, ignore_index=True)
data.to_csv(f"{data_folder_path}/scraped_all/combined_1.csv", index=False)

# second batch
dataframes = []
for i, file in enumerate(csv_files[6:]):
    print(f"Opening df {i} from {file}...", end="\r")
    df = pd.read_csv(file)
    batch = wrangle_batch(df)
    dataframes.append(batch)

data = pd.concat(dataframes, ignore_index=True)
data.to_csv(f"{data_folder_path}/scraped_all/combined_1.csv", index=False)

Opening df 5 from scraped_all/gdelt_data_80_to_100.csv....

Finally to combine the two we read them in again, and save them to one combined csv file. We then delete the old csv's to save space (hence they are not in the repository anymore)

In [8]:
part_1 = pd.read_csv(f"{data_folder_path}/scraped_all/combined_1.csv")
part_2 = pd.read_csv(f"{data_folder_path}/scraped_all/combined_2.csv")

combined = pd.concat([part_1, part_2], ignore_index=True)

In [11]:
combined.to_csv(f"{data_folder_path}/saved_data/country_to_country_all.csv", index=False)

## 4. Plot relationships between countries

In [7]:
combined = pd.read_csv(f"{data_folder_path}/saved_data/country_to_country_all.csv")

In [23]:
combined_f = combined.drop(columns = ["Date", "Unnamed: 2"])

In [35]:
combined_f = combined_f.groupby(["Target country"]).mean().sort_values(by = "Target country")
combined_f.head()

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
Target country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,61.153794,0.722672,0.407648,0.23393,0.123163,1.084218,0.690159,0.186505,3.598944,0.778591,...,1.125398,0.818044,0.913777,0.240189,4.074993,0.431421,0.033245,0.58967,0.06032,0.812661
Albania,0.097081,29.539962,0.046415,0.041323,0.016827,0.186553,0.033172,0.086125,0.395863,0.060133,...,0.098717,0.104755,0.070581,0.027824,0.053563,0.037084,0.002799,0.023954,0.019373,0.039245
Algeria,0.260947,0.084421,39.857927,0.356128,0.073552,0.08239,0.042973,0.097514,0.22864,0.656803,...,0.693201,0.115633,0.097019,0.135504,0.097806,0.259118,0.021097,0.518425,0.622939,0.670658
American Samoa,0.002766,,0.001862,0.001263,0.000168,0.000647,0.010499,0.002519,0.000508,0.000562,...,0.003072,0.007657,0.022266,,,0.000184,0.000111,0.00085,,0.003364
Andorra,0.011695,0.033549,0.012754,0.008019,0.051678,0.036054,0.005952,0.041431,0.01654,0.007418,...,0.011919,0.023211,0.009281,0.055879,0.01152,0.104778,0.000376,0.007867,0.021544,0.005676


In [39]:
combined_f["Albania"].sum()

120.6178757656968

## Scraping for real

In [44]:
countries_all = ["AF"]
country_query = "(Afghanistan OR Kabul)"
themes = themes_all[:1000]

start_date = "20170101010000"
end_date = "20240301010000"

In [45]:
scrape_gdelt(themes, countries_all, country_query, start_date, end_date, save_int=20, all=True, verbose=1)

Processed 1001/1001 queries. 0.17 Query/s. Average api time: 5.33s. On theme ALL for AF                                                                                                              
Elapsed time: 6058.23 seconds. Estimated time left: 0.0 seconds. Saved: 50

### Reading the saved files back in:

In [6]:
# for csv file in scraped_all folder
csv_files = glob.glob(f'{data_folder_path}scraped_all/*.csv')

# first batch
dataframes = []
for i, file in enumerate(csv_files):
    print(f"Opening df {i} from {file}...", end="\r")
    df = pd.read_csv(file)
    dataframes.append(df)

data = pd.concat(dataframes, ignore_index=True)

Opening df 50 from ../../data/GDELT/scraped_all/gdelt_data_80_to_100.csv.....

In [18]:
# drop rows where "source country" is nan
data_test = data.dropna(subset=["Source country"])

# drop "Unnamed: 0" colum
data_test = data_test.drop(columns=["Unnamed: 0"])

# pivot table by source country and date, and move theme to the columns
data_pivot = data_test.pivot_table(index=["Source country", "Month"], columns="Theme", values="Intensity").reset_index()

data_pivot.head()

Theme,Source country,Month,ACT_MAKESTATEMENT,AFFECT,AGRICULTURE,ALL,ALLIANCE,APPOINTMENT,ARMEDCONFLICT,ARREST,...,WB_862_GROWTH_POLES_AND_ECONOMIC_ZONES,WB_866_CONNECTIVITY_AND_LAGGING_REGIONS,WB_895_MINING_SYSTEMS,WB_936_ALTERNATIVE_DISPUTE_RESOLUTION,WB_938_MEDIATION,WB_939_NEGOTIATION,WB_962_INTERNATIONAL_LAW,WB_990_DRAINAGE,WMD,WOUND
0,Afghanistan,2017-01-31,4.577977,12.485877,3.965973,81.27809,4.71588,2.67661,39.791767,12.224207,...,0.137977,0.170183,1.13594,3.20704,0.173097,2.968733,2.17245,0.01239,1.31433,15.00537
1,Afghanistan,2017-02-28,2.486725,13.703154,4.244625,83.369568,3.918643,3.197586,40.634196,9.270464,...,0.140729,0.257861,1.113689,2.359482,0.188971,2.11765,2.591007,0.068307,0.413957,14.862957
2,Afghanistan,2017-03-31,3.565023,11.328845,2.708561,81.810281,2.570168,3.566481,40.308161,7.192197,...,0.241387,0.234971,1.159519,1.989203,0.201635,1.693845,1.299484,0.0,0.482413,13.378529
3,Afghanistan,2017-04-30,0.836593,4.97662,2.680933,76.024897,2.370847,3.09509,31.743073,5.682147,...,0.068263,0.172043,0.80793,2.175577,0.266987,1.94572,1.602663,0.0,0.669583,8.98629
4,Afghanistan,2017-05-31,1.158506,4.694287,1.436145,75.717387,3.297613,2.588465,32.410197,6.139355,...,0.047552,0.295735,1.097581,2.263526,0.138181,2.125339,1.296877,0.0,0.188823,10.708432


In [21]:
# save data_pivot to csv
data_pivot.to_csv(f"{data_folder_path}/saved_data/afghanistan_full.csv", index=False)

### Collecting article counts data for normalization of the data

In order to weigh the contribution of different countries, we need to normalize the data. For this we scrape the number of articles for each country that mention afghanistan, and then use this to normalize the data.

## Auxiliary functions

In [58]:
def querybuilder(dict):
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc?"
    url = base_url + "&".join([f"{key}={value}" for key, value in dict.items()])
    url = urllib.parse.quote(url, safe='():/?&=').replace("&theme=", "%20theme:")
    return url

def get_gdelt_data_raw(query, sourcecountry, start_date, end_date, verbose=0):
    
    dict = {
        "query": query,
        "sourcecountry" : sourcecountry,
        "mode": "TimelineVolRaw",
        "timelinesmooth": 0,
        "startdatetime": start_date,
        "enddatetime": end_date,
        "format": "csv",
        "timezoom" : "yes",
    }  
        
    url = querybuilder(dict).replace("&sourcecountry=", "%20sourcecountry:")

    # print(url.replace("csv", "html"))

    try:
        df = pd.read_csv(url)
        return df
    except pd.errors.EmptyDataError:
        if verbose >= 2:
            print("passed")
        pass

    return False

In [50]:
country_codes = pd.read_csv("../../data/auxilary_data/countries_codes_full.csv")
country_codes.head()

fips = country_codes["FIPS"].tolist()

In [60]:
df_list = []

start_date = "20170101010000"
end_date = "20240301010000"

count = 0

for i, fip in enumerate(fips):
    print(f"On {i}/{len(fips)}", end="\r")
    df = get_gdelt_data_raw("Afghanistan", fip, start_date, end_date, verbose=1)
    if df is not False:
        df["FIPS"] = fip
        df_list.append(df)
    # time.sleep(4)

On 0/236

On 235/236

In [61]:
# append al df's to one
df_full = pd.concat(df_list)

In [70]:
df_full.head()

Unnamed: 0,Date,Series,Value,FIPS
0,2017-01-02,Article Count,243,AF
1,2017-01-02,Total Monitored Articles,592774,AF
2,2017-01-03,Article Count,208,AF
3,2017-01-03,Total Monitored Articles,757916,AF
4,2017-01-04,Article Count,204,AF


In [71]:
df_full["Date"] = pd.to_datetime(df_full["Date"])
df_full_pivot = df_full.pivot_table(index=["FIPS", "Date"], columns="Series", values="Value").reset_index()

df_full_m = df_full_pivot.groupby([pd.Grouper(key='FIPS'), pd.Grouper(key='Date', freq='M')])['Value'].mean().reset_index()
df_full_m.head()

Unnamed: 0,FIPS,Date,Value
0,AA,2017-01-31,373268.683333
1,AA,2017-02-28,381315.178571
2,AA,2017-03-31,370052.951613
3,AA,2017-04-30,334374.966667
4,AA,2017-05-31,321396.33871


In [72]:
df_full.head()

Unnamed: 0,Date,Series,Value,FIPS
0,2017-01-02,Article Count,243,AF
1,2017-01-02,Total Monitored Articles,592774,AF
2,2017-01-03,Article Count,208,AF
3,2017-01-03,Total Monitored Articles,757916,AF
4,2017-01-04,Article Count,204,AF


In [73]:
# pivot df_full so that entry in "Series" columns become their own columns
df_full_pivot = df_full.pivot_table(index=["FIPS", "Date"], columns="Series", values="Value").reset_index()
df_full_pivot.head()

Series,FIPS,Date,Article Count,Total Monitored Articles
0,AA,2017-01-02,0.0,592774.0
1,AA,2017-01-03,0.0,757916.0
2,AA,2017-01-04,0.0,822862.0
3,AA,2017-01-05,0.0,830028.0
4,AA,2017-01-06,0.0,766969.0


In [86]:
df_full_m = df_full_pivot.groupby([pd.Grouper(key='FIPS'), pd.Grouper(key='Date', freq='M')])[['Article Count', 'Total Monitored Articles']].sum().reset_index()
df_full_m.columns = ["FIPS", "Month", "Article Count", "Total Monitored Articles"]
df_full_m.head()

Unnamed: 0,FIPS,Month,Article Count,Total Monitored Articles
0,AA,2017-01-31,0.0,22396121.0
1,AA,2017-02-28,0.0,21353650.0
2,AA,2017-03-31,0.0,22943283.0
3,AA,2017-04-30,0.0,20062498.0
4,AA,2017-05-31,0.0,19926573.0


In [89]:
df_full_m[df_full_m["Month"] == "2021-01-31"]

Unnamed: 0,FIPS,Month,Article Count,Total Monitored Articles
48,AA,2021-01-31,0.0,10883203.0
135,AC,2021-01-31,0.0,10883203.0
222,AE,2021-01-31,174.0,10883203.0
309,AF,2021-01-31,1925.0,10883203.0
396,AG,2021-01-31,17.0,10883203.0
...,...,...,...,...
18318,WS,2021-01-31,3.0,10883203.0
18405,WZ,2021-01-31,0.0,10883203.0
18492,YM,2021-01-31,25.0,10883203.0
18579,ZA,2021-01-31,3.0,10883203.0


In [91]:
df_full_m.to_csv(f"../../data/GDELT/saved_data/afghanistan_full_counts.csv", index=False)