# GDELT News Data Extraction for the Greater Toronto Area (GTA)

In [1]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# Fetch the GDELT events index page, which links to every downloadable export file.
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(gdelt_base_url + 'index.html', timeout=60)

# Fail fast on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would leave file_list empty or filled with unrelated links.
page.raise_for_status()

# Collect the href of every <a> element that sits inside a <ul>/<li> list.
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href")

# Keep only the links that begin with four digits, e.g. '20190407.export.CSV.zip'.
# The length check stops a short, all-digit link from passing the test.
file_list = [x for x in link_list if len(x) >= 4 and x[:4].isdigit()]

# preview the list
file_list[0:5]

['20190407.export.CSV.zip',
 '20190406.export.CSV.zip',
 '20190405.export.CSV.zip',
 '20190404.export.CSV.zip',
 '20190403.export.CSV.zip']

In [2]:
# Geo-fence: a latitude/longitude bounding box. Only events whose action
# coordinates fall inside it (bounds inclusive) are kept by the filter step.
# Latitude bounds: lt1 is the lower bound, lt2 the upper bound.
lt1, lt2 = 43.403221, 43.855401
# Longitude bounds: lg1 is the lower bound, lg2 the upper bound.
lg1, lg2 = -79.639319, -78.905820

# Number of files from the head of file_list to process ("days back").
days_back = 1

In [7]:
import os.path
import urllib.request  # a bare "import urllib" does not guarantee the request submodule is loaded
import zipfile
import glob
import operator

# Download, extract and geo-filter GDELT export archives.
#
# For each archive name in file_list (until days_back archives are processed):
#   1. download the archive next to the notebook unless it is already there,
#   2. extract it into the 'tmp' working directory,
#   3. copy every row whose ActionGeo coordinates fall inside the geo-fence
#      (lt1/lt2/lg1/lg2) into gta_data/gtaNNNN.tsv,
#   4. delete the extracted file.
#
# Reads from the enclosing notebook namespace: file_list, gdelt_base_url,
# lt1, lt2, lg1, lg2, days_back.

# Upper bound on download attempts per archive before the error is re-raised.
max_download_attempts = 3

infilecounter = 0
outfilecounter = 0

dir_path = os.getcwd()
print(dir_path)

# local_path keeps a trailing separator because other cells build paths by
# plain string concatenation onto it. os.path.join(dir, '') appends the
# separator that is correct for the current platform.
local_path = os.path.join(dir_path, '')
tmp_dir = os.path.join(local_path, 'tmp')
gta_dir = os.path.join(local_path, 'gta_data')

# The output directory must exist before the first outfile is opened for writing.
os.makedirs(gta_dir, exist_ok=True)

for compressed_file in file_list[infilecounter:]:
    print(compressed_file, end=' ')
    archive_path = os.path.join(local_path, compressed_file)

    # If we don't have the compressed file stored locally, go get it.
    # The download goes to a '.part' file that is renamed only on success, so
    # an interrupted transfer is never mistaken for a complete archive.
    if not os.path.isfile(archive_path):
        print('...downloading,', end=' ')
        partial_path = archive_path + '.part'
        for attempt in range(1, max_download_attempts + 1):
            try:
                urllib.request.urlretrieve(url=gdelt_base_url + compressed_file,
                                           filename=partial_path)
                os.replace(partial_path, archive_path)
                break
            except OSError:
                # urllib's URLError/HTTPError/ContentTooShortError are OSError subclasses.
                if os.path.isfile(partial_path):
                    os.remove(partial_path)
                if attempt == max_download_attempts:
                    raise
                print('...retrying,', end=' ')

    # extract the contents of the compressed file to a temporary directory;
    # the context manager closes the archive handle even if extraction fails
    print('...extracting,', end=' ')
    with zipfile.ZipFile(file=archive_path, mode='r') as z:
        z.extractall(path=tmp_dir)

    # parse each of the csv files in the working directory
    print('...parsing,', end=' ')
    for infile_name in glob.glob(os.path.join(tmp_dir, '*')):
        outfile_name = os.path.join(gta_dir, 'gta%04i.tsv' % outfilecounter)

        # open the infile and outfile
        with open(infile_name, mode='r', encoding="utf8") as infile, \
             open(outfile_name, mode='w', encoding="utf8") as outfile:

            for line in infile:
                vals = line.split('\t')

                # extract geo-coordinates
                try:
                    lat = float(vals[53])   # ActionGeo_Lat
                    long = float(vals[54])  # ActionGeo_Long
                except (IndexError, ValueError):
                    # row too short or coordinate field empty/non-numeric:
                    # no usable coordinates, skipping
                    continue

                # only use events inside geo-fence (bounds inclusive)
                if lg1 <= long <= lg2 and lt1 <= lat <= lt2:
                    outfile.write(line)

        outfilecounter += 1

        # delete the temporary file
        os.remove(infile_name)

    infilecounter += 1
    if infilecounter >= days_back:
        print('done')
        break

C:\Users\ibaranov\Downloads\York\group-projects\CSDA1050-CAP\GDELT2
20190407.export.CSV.zip
...extracting,
...parsing,
done


In [8]:
import glob
import os  # used below for path handling and file removal; do not rely on another cell's import
import pandas as pd

# Build one DataFrame from the intermediary gta_data files, pickle it, then
# delete the intermediary files. Reads local_path from the notebook namespace.

# Get the GDELT field names from an external helper file.
# usecols is an explicit list of the first two columns: the integer shorthand
# (usecols=1, meaning "columns 0 through 1") was deprecated and later removed
# from pandas.read_excel.
colnames = pd.read_excel('CSV.header.fieldids.xlsx', sheet_name='Sheet1',
                         index_col='Column ID', usecols=[0, 1])['Field Name']

# Build DataFrames from each of the intermediary files.
# sorted() makes the concatenation order deterministic (glob order is arbitrary).
files = sorted(glob.glob(os.path.join(local_path, 'gta_data', '*')))
DFlist = []
for active_file in files:
    print(active_file)
    if os.path.getsize(active_file) == 0:
        # A source file with no events inside the geo-fence leaves an empty
        # intermediary file, which read_csv refuses to parse; skip it.
        continue
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID']))

if not DFlist:
    # Same exception type pd.concat would raise on an empty list, with a clearer message.
    raise ValueError('no non-empty files found in ' + os.path.join(local_path, 'gta_data'))

# Merge the file-based dataframes and serialize the dataframe
DF = pd.concat(DFlist)
DF.to_pickle(os.path.join(local_path, 'backup-gta.pickle'))

# remove the temporary files (only reached once the pickle was written)
for active_file in files:
    os.remove(active_file)

C:\Users\ibaranov\Downloads\York\group-projects\CSDA1050-CAP\GDELT2\gta_data\gta0000.tsv


In [9]:
# Alternative quick check, left disabled: list every column name of DF.
#list(DF)
# Show the first row of the merged DataFrame, printed one field per line.
print(DF.iloc[0])

SQLDATE                                                           20190407
MonthYear                                                           201904
Year                                                                  2019
FractionDate                                                     2019.2658
Actor1Code                                                             NaN
Actor1Name                                                             NaN
Actor1CountryCode                                                      NaN
Actor1KnownGroupCode                                                   NaN
Actor1EthnicCode                                                       NaN
Actor1Religion1Code                                                    NaN
Actor1Religion2Code                                                    NaN
Actor1Type1Code                                                        NaN
Actor1Type2Code                                                        NaN
Actor1Type3Code          