In [165]:
import os
import pandas as pd
import pprint as pprint 
import seaborn as sns
import matplotlib as plt


## sample GDELT Global Knowledge Graph .csv file

In [166]:
sample_gkg_file = '~/gdelt_data/notebook_test_data/20210401001500.gkg.csv'

base_path = os.path.dirname(os.path.abspath(''))
file_path = os.path.join(base_path, sample_gkg_file)

with open(sample_gkg_file, 'rb') as f:
    df = pd.read_csv(f, sep='\t', encoding='utf-8')
df.info()
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 27 columns):
 #   Column                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

GDELT GKK files consists of tab delimited columns, containing entries where info has been successfully parsed from source. 

### This particular .csv consists of 1752 rows across 27 columns

In [167]:
df.shape

(1752, 27)

## Columns correspond to each of the extracted fields contained within GKG 2.1 format:

A full definition for each extracted field, value type, and formatting is found in the associated codebook:

http://data.gdeltproject.org/documentation/GDELT-Global_Knowledge_Graph_Codebook-V2.1.pdf

A handy cheat sheet for quick reference made available by Ralf:

https://mind-node.net/gdelt-gkg-2-1-cheat-sheet/


In [168]:
gkg_fields = ['GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER', 'V2SOURCECOMMONNAME', 
              'V2DOCUMENTIDENTIFIER', 'V1COUNTS', 'V2.1COUNTS', 'V1THEMES', 'V2ENHANCEDTHEMES', 
              'V1LOCATIONS', 'V2ENHANCEDLOCATIONS', 'V1PERSONS', 'V2ENHANCEDPERSONS', 'V1ORGANIZATIONS',  
              'V2ENHANCEDORGANIZATIONS', 'V1.5TONE', 'V2.1ENHANCEDDATES', 'V2GCAM', 'V2.1SHARINGIMAGE', 
              'V2.1RELATEDIMAGES', 'V2.1SOCIALIMAGEEMBEDS', 'V2.1SOCIALVIDEOEMBEDS', 'V2.1QUOTATIONS', 
              'V2.1ALLNAMES', 'V2.1AMOUNTS', 'V2.1TRANSLATIONINFO', 'V2EXTRASXML']

## The output below shows the values for each column in the first row of this particular .csv  

'Unnamed: x' values indicate that the GDELT web crawler was unable to derive a value for the specific field from the source document being parsed and as such returns a Null value. 

In [169]:
df.loc[0]

20210401001500-0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

## Column 7 reveals some of the V1THEMES that GDELT was able to parse from this record:
    • semi-colon-delimited

In [170]:
df.iat[0,7]

'TAX_ECON_PRICE;TAX_ETHNICITY;TAX_ETHNICITY_BLACK;WB_678_DIGITAL_GOVERNMENT;WB_694_BROADCAST_AND_MEDIA;WB_133_INFORMATION_AND_COMMUNICATION_TECHNOLOGIES;GENERAL_GOVERNMENT;EPU_POLICY;EPU_POLICY_GOVERNMENT;ECON_INFLATION;WB_1104_MACROECONOMIC_VULNERABILITY_AND_DEBT;WB_442_INFLATION;MEDIA_MSM;USPEC_POLITICS_GENERAL1;TAX_FNCACT;TAX_FNCACT_CHAIRMAN;TAX_FNCACT_KNIGHT;MARITIME_INCIDENT;MARITIME;MANMADE_DISASTER_IMPLIED;EPU_POLICY_REFORM;'

## Column 8 reveals V2ENHANCEDTHEMES
    • Semicolon-delimited blocks, with comma-delimited fields.
    • Contains all GKG themes referenced in the document with character offsets of approximate location 
      where found within the document.
    • Over 300 themes currently recognized. 

In [171]:
df.iat[0,8]

'EPU_POLICY_GOVERNMENT,299;EPU_POLICY_GOVERNMENT,795;EPU_POLICY_GOVERNMENT,1715;EPU_POLICY_GOVERNMENT,2213;TAX_FNCACT_CHAIRMAN,1607;TAX_FNCACT_CHAIRMAN,2071;USPEC_POLITICS_GENERAL1,1593;USPEC_POLITICS_GENERAL1,2062;ECON_INFLATION,353;WB_1104_MACROECONOMIC_VULNERABILITY_AND_DEBT,353;WB_442_INFLATION,353;WB_678_DIGITAL_GOVERNMENT,144;WB_678_DIGITAL_GOVERNMENT,1568;WB_678_DIGITAL_GOVERNMENT,2177;WB_694_BROADCAST_AND_MEDIA,144;WB_694_BROADCAST_AND_MEDIA,1568;WB_694_BROADCAST_AND_MEDIA,2177;WB_133_INFORMATION_AND_COMMUNICATION_TECHNOLOGIES,144;WB_133_INFORMATION_AND_COMMUNICATION_TECHNOLOGIES,1568;WB_133_INFORMATION_AND_COMMUNICATION_TECHNOLOGIES,2177;MEDIA_MSM,464;MEDIA_MSM,535;MEDIA_MSM,641;MEDIA_MSM,1381;MEDIA_MSM,1479;MEDIA_MSM,2121;TAX_ECON_PRICE,19;TAX_ECON_PRICE,486;TAX_FNCACT_KNIGHT,2087;GENERAL_GOVERNMENT,197;MARITIME_INCIDENT,2233;MARITIME,2233;MANMADE_DISASTER_IMPLIED,2233;EPU_POLICY_REFORM,2243;TAX_ETHNICITY_BLACK,38;'

## V2GCAM field (Global Content Analysis Measures) shown below:

    • Details regarding each of the codes can be found in the GCAM master codebook 
      
   [GDELT GCAM Master Codebook](http://data.gdeltproject.org/documentation/GCAM-MASTER-CODEBOOK.TXT)

      
    • According to the GDELT V2 GKG Codebook, the following string, when paired with it's definition, would 
      translate to a table looking similar as that seen below.
      
      'wc:125,c2.21:4,c10.1:40,v10.1:3.21111111'

        Type						Count
        ----------------------------------
        WordCount:				   125
        General Inquirer Bodypt      4
        SentiWordNet				 40
        SentiWordNet average		 3.21111111


In [172]:
df.iat[0,17]

'wc:391,c12.1:25,c12.10:32,c12.12:12,c12.13:12,c12.14:10,c12.3:13,c12.4:4,c12.5:16,c12.7:24,c12.8:16,c12.9:25,c13.11:1,c13.12:3,c13.6:1,c13.7:1,c14.1:24,c14.10:19,c14.11:41,c14.2:27,c14.3:44,c14.4:1,c14.5:50,c14.6:2,c14.7:5,c14.8:1,c14.9:5,c15.132:1,c15.148:1,c15.15:1,c15.175:1,c15.241:1,c15.76:1,c15.93:1,c16.1:4,c16.10:2,c16.100:6,c16.101:9,c16.102:1,c16.105:1,c16.106:10,c16.109:12,c16.11:3,c16.110:52,c16.111:1,c16.113:1,c16.114:14,c16.115:1,c16.116:17,c16.117:11,c16.118:28,c16.12:24,c16.120:16,c16.121:22,c16.122:5,c16.125:15,c16.126:18,c16.127:33,c16.128:5,c16.129:44,c16.130:5,c16.131:14,c16.134:25,c16.136:2,c16.138:8,c16.139:9,c16.140:10,c16.141:2,c16.142:2,c16.144:1,c16.145:23,c16.146:16,c16.147:1,c16.15:1,c16.152:14,c16.153:21,c16.155:2,c16.156:2,c16.157:5,c16.158:2,c16.159:29,c16.16:10,c16.161:22,c16.162:10,c16.163:18,c16.164:5,c16.168:2,c16.17:2,c16.19:6,c16.2:43,c16.21:5,c16.22:4,c16.23:2,c16.24:3,c16.26:44,c16.28:4,c16.29:3,c16.3:11,c16.30:2,c16.31:20,c16.32:2,c16.33:24,c16.34