In [20]:
import pandas as pd
import urllib.parse
import urllib.error
import pickle
import time
import sys
import glob
from IPython.display import clear_output
import geopandas as gpd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import plotly.express as px
import pycountry
import numpy as np
import kaleido
import plotly.graph_objects as go

countries_codes_path = '../../data/auxilary_data/countries_iso_fips_capitals.csv'

In [2]:
country_to_country_all = pd.read_csv('../../data/GDELT/saved_data/country_to_country_all.csv')

In [3]:
countries_codes = pd.read_csv(countries_codes_path)
# strip trailing whitespaces

countries_codes['GDELT_name'] = countries_codes['GDELT_name'].str.strip()
gdelt_name_to_ISO_dict = dict(zip(countries_codes['GDELT_name'], countries_codes['ISO_alpha3']))

In [4]:
combined_f = country_to_country_all.drop(columns = ["Date", "Unnamed: 2"])
final_df = combined_f.groupby(["Target country"]).mean().sort_values(by = "Target country")
final_df.head(20)

Unnamed: 0_level_0,Afghanistan,Albania,Algeria,Angola,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahrain,...,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
Target country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,61.153794,0.722672,0.407648,0.23393,0.123163,1.084218,0.690159,0.186505,3.598944,0.778591,...,1.125398,0.818044,0.913777,0.240189,4.074993,0.431421,0.033245,0.58967,0.06032,0.812661
Albania,0.097081,29.539962,0.046415,0.041323,0.016827,0.186553,0.033172,0.086125,0.395863,0.060133,...,0.098717,0.104755,0.070581,0.027824,0.053563,0.037084,0.002799,0.023954,0.019373,0.039245
Algeria,0.260947,0.084421,39.857927,0.356128,0.073552,0.08239,0.042973,0.097514,0.22864,0.656803,...,0.693201,0.115633,0.097019,0.135504,0.097806,0.259118,0.021097,0.518425,0.622939,0.670658
American Samoa,0.002766,,0.001862,0.001263,0.000168,0.000647,0.010499,0.002519,0.000508,0.000562,...,0.003072,0.007657,0.022266,,,0.000184,0.000111,0.00085,,0.003364
Andorra,0.011695,0.033549,0.012754,0.008019,0.051678,0.036054,0.005952,0.041431,0.01654,0.007418,...,0.011919,0.023211,0.009281,0.055879,0.01152,0.104778,0.000376,0.007867,0.021544,0.005676
Angola,0.053214,9.4e-05,0.354817,56.21332,0.006143,0.013168,0.034119,0.021003,0.04081,0.043889,...,0.149222,0.069689,0.064619,0.010566,0.020739,0.021818,0.005245,0.043649,1.465438,1.06115
Anguilla,0.305449,0.02114,1.073368,0.011574,0.425975,0.172058,1.17379,0.197281,0.128708,0.271839,...,0.382953,0.509876,0.947376,0.160218,0.331401,0.482181,0.227008,1.271225,0.193495,0.237012
Antigua and Barbuda,0.005622,0.000539,8.1e-05,0.000316,0.000407,0.000476,0.006967,0.002088,0.001488,0.002784,...,0.011168,0.016603,0.016947,0.000334,,0.007617,0.000933,0.000215,0.015652,0.006852
Argentina,0.31353,0.317176,0.672496,0.436595,31.8709,0.50866,0.458107,0.340584,0.272012,0.463743,...,0.811664,0.394575,0.644129,12.450069,0.158808,5.24257,0.052392,0.719066,0.150011,0.408991
Armenia,0.75221,0.151478,0.039898,0.0182,0.0877,58.90414,0.038121,0.116099,12.053696,0.107673,...,0.172629,0.064859,0.067983,0.063246,0.840127,0.033517,0.005488,0.058877,0.027316,0.043923


In [5]:
# make plot of countries that the a country talks about the most
def covers_most(df, country, log_scale=True):
    df_copy = df.copy()

    df_copy["ISO"] = df_copy.index.map(gdelt_name_to_ISO_dict)
    df_copy = df_copy.reset_index()

    if log_scale:
        df_copy[country] = np.log(df_copy[country])

    fig = px.choropleth(df_copy, locations='ISO', color=country, hover_name='Target country',
                        projection='natural earth', title=f'Countries that {country} covers the most')
    fig.show()

In [None]:
covers_most(final_df, "United States", log_scale=False)

In [6]:
# make function to show what countries a country is covered by most
def most_covered_by(df, country, log_scale=True):
    df_copy = df.copy()
    
    # switch colums and rows
    transpose = df_copy.T

    transpose["ISO"] = transpose.index.map(gdelt_name_to_ISO_dict)
    transpose = transpose.reset_index()

    if log_scale:
        transpose[country] = np.log(transpose[country])
    
    fig = px.choropleth(transpose, locations='ISO', color=country,
                        projection='natural earth')
    # set legend title
    fig.update_layout(coloraxis_colorbar=dict(title=f'% of coverage that mentions {country}'))
    # set subtitle
    fig.update_layout(title=f'Percentage of news coverage that explicitly mentions {country}, per country')
 
    fig.show()


In [None]:
most_covered_by(final_df, "Spain", log_scale=False)

Finally we make a map of which countries are covered most by other countries

In [7]:
copied_df = final_df.copy()

# set the entry of afghanistan in the afghanistan column to 0
for country in copied_df.columns:
    copied_df.loc[country, country] = None
copied_df["average"] = copied_df.mean(axis=1)

copied_df["ISO"] = copied_df.index.map(gdelt_name_to_ISO_dict)
cp_reset = copied_df.reset_index()

In [None]:
fig = px.choropleth(cp_reset, locations='ISO', color="average", 
                    color_continuous_scale='RdBu_r', hover_name='Target country',
                    projection='natural earth', title=f'Countries most covered in global news, as percent of global news coverage')

fig.update_layout(
    template='plotly_dark',
    width=1200,
    height=600,
    coloraxis_colorbar=dict(title=f'%'),
    margin={"r":0,"t":70,"l":0,"b":10},
    title={
    'text': f'<b>Countries most covered in global news, as percent of global news coverage</b>',
    'y':0.93,  # Adjust the vertical position of the title
    'x':0.5,  # Center the title horizontally
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 18}},  # Adjust font size as needed,
)



fig.write_image("../../figs/global_news_coverage.png", width=1200, height=600)

fig.show()

Then we make a graph like the one above, but we draw lines from each country to each country, making them thicker as the coverage increases. For that we need the average coordinates of each country:

In [8]:
country_coordinates = pd.read_csv("https://gist.githubusercontent.com/tadast/8827699/raw/61b2107766d6fd51e2bd02d9f78f6be081340efc/countries_codes_and_coordinates.csv")

country_coordinates.head()

# remove quotes from both sides of all strings in the dataframe
country_coordinates = country_coordinates.map(lambda x: x.replace('"', "").strip())

We can then make a function to find the coordinates from the alpha_3 ISO code:

In [9]:
def get_coordinates(alpha_3):
    try:
        return country_coordinates[country_coordinates['Alpha-3 code'] == alpha_3][['Latitude (average)', 'Longitude (average)']].values[0]
    except:
        return [None, None]

And then we can make a plot:

In [28]:
def plot_coverage_lines_country(cp_reset, source_country):

    fig = go.Figure()

    # don't display the legend
    fig.update_layout(showlegend=False)

    fig.update_layout(
        template='plotly_dark',
        width=1200,
        height=600,
        coloraxis_colorbar=dict(title=f'%'),
        margin={"r":0,"t":70,"l":0,"b":10},
        title={
        'text': f'<b>News coverage network of {source_country}</b>',
        'y':0.93,  # Adjust the vertical position of the title
        'x':0.5,  # Center the title horizontally
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 18}},  # Adjust font size as needed,
    )

    lines = []

    source_country_coords = get_coordinates(gdelt_name_to_ISO_dict[source_country])

    for i, target_country in enumerate(cp_reset['Target country']):
        if target_country != source_country:
            intensity = 1.5 * float(cp_reset[cp_reset["Target country"] == target_country][source_country].iloc[0])

            print(f"Processing {i+1}/{len(cp_reset)}", end="\r")
            try:
                target_country_coords = get_coordinates(gdelt_name_to_ISO_dict[target_country])
            except:
                pass

            if target_country_coords[0]:
                try:
                    lines.append(
                        go.Scattergeo(
                            lon = [source_country_coords[1], target_country_coords[1]],
                            lat = [source_country_coords[0], target_country_coords[0]],
                            mode = 'lines',
                            line = dict(width = intensity, color = 'white'),
                            opacity = 0.2,
                        )
                    )
                except:
                    pass
    # Add lines to the figure
    for i, line in enumerate(lines):
        print(f"Adding lines {i+1}/{len(lines)}", end="\r")
        fig.add_trace(line)

    # fig.write_image("../../figs/global_news_lines_USA.png", width=1200, height=600)

    fig.show()

In [29]:
def plot_coverage_lines_world(cp_reset, color):
    fig = go.Figure()

    if color == "black":
        line_color = "white"
    else:
        line_color = "black"

    if color == "black":
        fig.update_layout(template='plotly_dark')

    fig.update_layout(
        showlegend=False,
        width=1200,
        height=600,
        coloraxis_colorbar=dict(title=f'%'),
        margin={"r":0,"t":70,"l":0,"b":10},
        title={
        'text': f'<b>Global news coverage network</b>',
        'y':0.93,  # Adjust the vertical position of the title
        'x':0.5,  # Center the title horizontally
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 18}},  # Adjust font size as needed,
    )

    lines = []

    for i, target_country in enumerate(cp_reset['Target country']):
        print(f"Processing {i+1}/{len(cp_reset)}", end="\r")
        try:
            target_country_coords = get_coordinates(gdelt_name_to_ISO_dict[target_country])
        except:
            pass
        if target_country_coords[0]:
            for source_country in cp_reset.columns[1:]:
                if source_country != target_country and source_country != "ISO":
                    try:
                        source_country_coords = get_coordinates(gdelt_name_to_ISO_dict[source_country])
                    except:
                        pass
                    if source_country_coords[0]:
                        #print(target_country, source_country)
                        intensity = float(cp_reset[cp_reset["Target country"] == target_country][source_country].iloc[0])

                        try:
                            if intensity > 10:
                                lines.append(
                                    go.Scattergeo(
                                        lon = [source_country_coords[1], target_country_coords[1]],
                                        lat = [source_country_coords[0], target_country_coords[0]],
                                        mode = 'lines',
                                        line = dict(width = intensity,color = line_color),
                                        opacity = 0.2,
                                    ))
                            elif intensity > 5:
                                lines.append(
                                    go.Scattergeo(
                                        lon = [source_country_coords[1], target_country_coords[1]],
                                        lat = [source_country_coords[0], target_country_coords[0]],
                                        mode = 'lines',
                                        line = dict(width = intensity / 3,color = line_color),
                                        opacity = 0.2,
                                    ))
                            elif intensity > 1:
                                lines.append(
                                    go.Scattergeo(
                                        lon = [source_country_coords[1], target_country_coords[1]],
                                        lat = [source_country_coords[0], target_country_coords[0]],
                                        mode = 'lines',
                                        line = dict(width = 0.1,color = line_color),
                                        opacity = 0.15,
                                    ))
                            elif intensity > 0.5:
                                lines.append(
                                    go.Scattergeo(
                                        lon = [source_country_coords[1], target_country_coords[1]],
                                        lat = [source_country_coords[0], target_country_coords[0]],
                                        mode = 'lines',
                                        line = dict(width = 0.05,color = line_color),
                                        opacity = 0.1,
                                    )
                            )
                        except:
                            pass

    # Add lines to the figure
    for i, line in enumerate(lines):
        print(f"Adding lines {i+1}/{len(lines)}", end="\r")
        fig.add_trace(line)

    # fig.write_image("../../figs/global_news_lines.png", width=1200, height=600)

    fig.show()

In [30]:
plot_coverage_lines_world(cp_reset, "black")

Adding lines 7635/7635

In [34]:
plot_coverage_lines_country(cp_reset, "India")

Adding lines 232/232