In [1]:
#######################################################
# Fetching links from the GDELT Version 2 link      #
#######################################################

import requests
import re
import os
import lxml.html as lh
import urllib.request
import os.path
import urllib.request
import zipfile
import glob
import operator
import pandas as pd
from datetime import datetime
now = datetime.now()

# Get the three links which gets updated in the version 2 of gdelt project
gdelt_base_url = 'http://data.gdeltproject.org/gdeltv2/lastupdate.txt'
file_list = []
data = urllib.request.urlopen(gdelt_base_url)
for line in data:
    lines = line.decode("utf-8")
    zip = re.search("(?P<url>https?://[^\s]+)", lines).group("url")
    file_list.append(zip)

print("The three updated csv files for the current 15 minutes batch: \n")
print(file_list)

The three updated csv files for the current 15 minutes batch: 

['http://data.gdeltproject.org/gdeltv2/20200222173000.export.CSV.zip', 'http://data.gdeltproject.org/gdeltv2/20200222173000.mentions.CSV.zip', 'http://data.gdeltproject.org/gdeltv2/20200222173000.gkg.csv.zip']


In [3]:
#######################################################
# Data Extraction, Parsing and Pre processing is done in this cell     #
#######################################################


gdelt = []
infilecounter = 0
outfilecounter = 0
local_path  = os.getcwd()
for compressed_file in file_list[infilecounter:]:
    if "export" in compressed_file: # we only deal with the exports file
        files = compressed_file.split("/")
        filenames = files[-1]
        while not os.path.isfile(local_path+filenames): 
            print("download")
            urllib.request.urlretrieve(url=compressed_file, 
                               filename=local_path+filenames)

        print("extracting")
        z = zipfile.ZipFile(file=local_path+filenames, mode='r')    
        z.extractall(path=local_path+'tmp/')

        print("parsing")
        for infile_name in glob.glob(local_path+'tmp/*'):
            outfile_name = local_path+'\\process\\'+'%04i.tsv'%outfilecounter

            with open(infile_name, mode='r', encoding="utf-8") as infile, open(outfile_name, mode='w',encoding="utf-8") as outfile:
                for line in infile:
                    gdelt.append(line)
                    outfile.write(line)                      
                outfilecounter +=1
            #Deleting the file after we are finished with the preprocessing of dataset
            os.remove(infile_name) 
        infilecounter +=1
        print("Done")

extracting
parsing
Done


In [3]:
#######################################################
# Data can be extracted by using this Bigquery       # 
#######################################################

# def get_current_data():
#     #run query using Google BigQuery and get results with pandas
#     querytext = """SELECT
#     ActionGeo_CountryCode AS country, FIRST(ActionGeo_Lat) AS lat, FIRST(ActionGeo_Long) AS long, AVG(ABS(FLOAT(GoldsteinScale))*FLOAT(AvgTone)) AS mean_goldstein_x_tone, COUNT(GoldsteinScale) AS event_count, INTEGER(SQLDATE) AS date
#     FROM
#     [gdelt-bq:full.events]
#     WHERE
#     (ActionGeo_CountryCode IS NOT NULL AND ActionGeo_Lat IS NOT NULL AND ActionGeo_Long IS NOT NULL AND GoldsteinScale IS NOT NULL AND AvgTone IS NOT NULL AND SQLDATE IS NOT NULL)
#     GROUP BY
#     (date), (country)
#     ORDER BY
#     date ASC"""
#     project_id = "sp-global"
#     df = pd.io.gbq.read_gbq(querytext, project_id)
#     return df

In [4]:
####################################################################################
# Creating a Dataframe of the preprocessed dataset
# for efficient handling of input stream   #
####################################################################################

df1 = pd.read_csv(local_path+'\\process\\0000.tsv', sep='\t', header=0)
df1.columns = ['GLOBALEVENTID','SQLDATE','MonthYear','Year','FractionDate','Actor1Code','Actor1Name',
'Actor1CountryCode','Actor1KnownGroupCode','Actor1EthnicCode','Actor1Religion1Code','Actor1Religion2Code',
'Actor1Type1Code','Actor1Type2Code','Actor1Type3Code','Actor2Code','Actor2Name','Actor2CountryCode',
'Actor2KnownGroupCode','Actor2EthnicCode','Actor2Religion1Code','Actor2Religion2Code','Actor2Type1Code',
'Actor2Type2Code','Actor2Type3Code','IsRootEvent','EventCode','EventBaseCode','EventRootCode','QuadClass',
'GoldsteinScale','NumMentions','NumSources','NumArticles','AvgTone','Actor1Geo_Type','Actor1Geo_FullName',
'Actor1Geo_CountryCode','Actor1Geo_ADM1Code','Actor1Geo_ADM','Actor1Geo_Lat','Actor1Geo_Long',
'Actor1Geo_FeatureID','Actor2Geo_Type','Actor2Geo_FullName','Actor2Geo_CountryCode',
'Actor2Geo_ADM1Code','Actor1Geo_ADM2','Actor2Geo_Lat','Actor2Geo_Long','Actor2Geo_FeatureID',
'ActionGeo_Type','ActionGeo_FullName','ActionGeo_CountryCode','ActionGeo_ADM1Code',
'ActionGeo_ADM2','ActionGeo_L at','ActionGeo_Long','ActionGeo_FeatureID','DATEADDED','SOURCEURL']

pd.set_option('display.max_columns', None)
df1.head()


Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,Actor1Religion2Code,Actor1Type1Code,Actor1Type2Code,Actor1Type3Code,Actor2Code,Actor2Name,Actor2CountryCode,Actor2KnownGroupCode,Actor2EthnicCode,Actor2Religion1Code,Actor2Religion2Code,Actor2Type1Code,Actor2Type2Code,Actor2Type3Code,IsRootEvent,EventCode,EventBaseCode,EventRootCode,QuadClass,GoldsteinScale,NumMentions,NumSources,NumArticles,AvgTone,Actor1Geo_Type,Actor1Geo_FullName,Actor1Geo_CountryCode,Actor1Geo_ADM1Code,Actor1Geo_ADM,Actor1Geo_Lat,Actor1Geo_Long,Actor1Geo_FeatureID,Actor2Geo_Type,Actor2Geo_FullName,Actor2Geo_CountryCode,Actor2Geo_ADM1Code,Actor1Geo_ADM2,Actor2Geo_Lat,Actor2Geo_Long,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_ADM2,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,907754195,20190222,201902,2019,2019.1425,IGOUNO,UNITED NATIONS,,UNO,,,,IGO,,,AFG,AFGHANISTAN,AFG,,,,,,,,1,190,190,19,4,-10.0,5,1,5,-6.666667,1,Afghanistan,AF,AF,,33.0,66.0,AF,1,Afghanistan,AF,AF,,33.0,66.0,AF,1,Afghanistan,AF,AF,,33.0,66.0,AF,20200222173000,https://www.efe.com/efe/english/world/us-talib...
1,907754196,20190222,201902,2019,2019.1425,JUD,JUDGE,,,,,,JUD,,,,,,,,,,,,,0,20,20,2,1,3.0,10,1,10,-1.95599,2,"Connecticut, United States",US,USCT,,41.5834,-72.7622,CT,0,,,,,,,,2,"Connecticut, United States",US,USCT,,41.5834,-72.7622,CT,20200222173000,https://qctimes.com/business/former-iowa-gover...
2,907754197,20190222,201902,2019,2019.1425,PAK,PAKISTANI,PAK,,,,,,,,USA,THE US,USA,,,,,,,,0,45,45,4,1,5.0,2,1,2,-2.279202,4,"New Delhi, Delhi, India",IN,IN07,17911.0,28.6,77.2,-2106102,3,"Washington, District of Columbia, United States",US,USDC,DC001,38.8951,-77.0364,531871,3,"Washington, District of Columbia, United States",US,USDC,DC001,38.8951,-77.0364,531871,20200222173000,https://www.dnaindia.com/world/report-islamaba...
3,907754198,20190222,201902,2019,2019.1425,PAK,PAKISTANI,PAK,,,,,,,,USA,THE US,USA,,,,,,,,0,45,45,4,1,5.0,2,1,2,-2.279202,4,"Lahore, Punjab, Pakistan",PK,PK04,23677.0,31.5497,74.3436,-2767043,4,"New Delhi, Delhi, India",IN,IN07,17911,28.6,77.2,-2106102,4,"New Delhi, Delhi, India",IN,IN07,17911,28.6,77.2,-2106102,20200222173000,https://www.dnaindia.com/world/report-islamaba...
4,907754199,20190222,201902,2019,2019.1425,PAKGOV,PAKISTANI,PAK,,,,,GOV,,,USA,THE US,USA,,,,,,,,0,45,45,4,1,5.0,4,1,4,-2.279202,4,"Lahore, Punjab, Pakistan",PK,PK04,23677.0,31.5497,74.3436,-2767043,4,"New Delhi, Delhi, India",IN,IN07,17911,28.6,77.2,-2106102,4,"New Delhi, Delhi, India",IN,IN07,17911,28.6,77.2,-2106102,20200222173000,https://www.dnaindia.com/world/report-islamaba...


In [5]:
####################################################################################
# Changing the format of the date in the current dataframe
####################################################################################

import datetime
year = filenames[:4]
month = filenames[4:6]
date = filenames[6:8]
hour = filenames[8:10]
minutes = filenames[10:12]
seconds = "00"
date_time_str = year+ "-" + month + "-" + date + " " + hour + ":" + minutes + ":" + seconds
count = df1['DATEADDED'].count() 
dates = [date_time_str] * count

In [6]:
####################################################################################
# Creating a subset of the dataframe 
####################################################################################


df2 = df1[[ 'AvgTone','GoldsteinScale',  'Actor1Code','ActionGeo_Lat','ActionGeo_Long'  ]]
df2.insert(5, "DATEADDED", dates, True) 
df2.fillna("",inplace=True)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,AvgTone,GoldsteinScale,Actor1Code,ActionGeo_Lat,ActionGeo_Long,DATEADDED
0,-6.666667,-10.0,IGOUNO,33.0,66.0,2020-02-22 17:30:00
1,-1.95599,3.0,JUD,41.5834,-72.7622,2020-02-22 17:30:00
2,-2.279202,5.0,PAK,38.8951,-77.0364,2020-02-22 17:30:00
3,-2.279202,5.0,PAK,28.6,77.2,2020-02-22 17:30:00
4,-2.279202,5.0,PAKGOV,28.6,77.2,2020-02-22 17:30:00


In [10]:
####################################################################################
# As latitudes and longitudes are features which has to be given as inputs
# to the endpoint, Visualizing the locations from where the news are generated
# This map updates for every 15 minutes as the batch of our input updates every 15 mins
####################################################################################

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from itertools import chain

lat = df1['ActionGeo_Lat'].values
long = df1['ActionGeo_Long'].values

margin = .2 
lat_min = min(lat) - margin
lat_max = max(lat) + margin
long_min = min(long) - margin
long_max = max(long) + margin

def draw_map(m, scale=0.2):
    # draw a shaded-relief image
    m.shadedrelief(scale=scale)
    
    
    lats = m.drawparallels(np.linspace(-90, 90, 13))
    lons = m.drawmeridians(np.linspace(-180, 180, 13))

    
    lat_lines = chain(*(tup[1][0] for tup in lats.items()))
    lon_lines = chain(*(tup[1][0] for tup in lons.items()))
    all_lines = chain(lat_lines, lon_lines)
    
    lons, lats = m(long, lat)
    m.scatter(lons, lats, marker = 'o', color='b')
    
    
    for line in all_lines:
        line.set(linestyle='-', alpha=0.3, color='w')
        
fig = plt.figure(figsize=(16, 12), edgecolor='w')
m = Basemap(projection='cyl', resolution=None,
            llcrnrlat=-90, urcrnrlat=90,
            llcrnrlon=-180, urcrnrlon=180, )


draw_map(m)
plt.savefig('map1.png')


KeyError: 'PROJ_LIB'