In [46]:
import json
import os
import re
import requests
import urllib.parse
import random
import time

import numpy as np
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from fuzzywuzzy import fuzz

## Tweets were collected with center point in several cities and a varying radius, dependant on the size of the country

### See file: get_tweets_countries.py

### Now we merge all of the tweets per country and save it with the respected ISO-3 codes, needed to later get the correct geojson polygons for each country, to visualize on the map

In [71]:
countries = {'BEL':{'cities':['Antwerp','Brussels','Liege','Mons'],'name':'Belgium'},
            'NLD':{'cities':['Amsterdam','Eindhoven','Groningen','Rotterdam','Utrecht','Zwolle'],'name':'The Netherlands'},
            'ESP':{'cities':['Barcelona','Bilbao','Madrid','Sevilla','Valencia','Zaragoza'],'name':'Spain'},
            'USA':{'cities':['Atlanta','Austin','Chicago','NewYork','Sanfrancisco','Seattle'],'name':'The United States of America'},
            'DEU':{'cities':['Berlin','Frankfurt','Hamburg','Munich'],'name':'Germany'},
            'CHN':{'cities':['Beijing','Wuhan'],'name':'China'},
            'ITA':{'cities':['Milan','Rome'],'name':'Italy'},
            'GBR':{'cities':['Birmingham','Glasgow','Leeds','London','Manchester'],'name':'The United Kingdom'},
            'IND':{'cities':['Delhi','Mumbai'],'name':'India'},
            'SWE':{'cities':['Stockholm'],'name':'Sweden'},
            'FIN':{'cities':['Helsinki'],'name':'Finland'},
            'DNK':{'cities':['Copenhagen'],'name':'Denmark'},
            'PAK':{'cities':['Karachi','Islamabad'],'name':'Pakistan'},
            'AUS':{'cities':['Perth','Sydney'],'name':'Australia'},
            'ROU':{'cities':['Bucharest'],'name':'Romania'},
            'HUN':{'cities':['Budapest'],'name':'Hungary'},
            'EGY':{'cities':['Cairo'],'name':'Egypt'},
            'UKR':{'cities':['Kiev'],'name':'Ukraine'},
            'IRN':{'cities':['Tehran'],'name':'Iran'},
            'AUT':{'cities':['Vienna'],'name':'Austria'},
            'POL':{'cities':['Warsaw'],'name':'Poland'},
            'NZL':{'cities':['Auckland'],'name':'New Zealand'},
            'NGA':{'cities':['Kano'],'name':'Nigeria'}
            }

cities = set()
country_names = []
for k,v in countries.items():
    cities |= set(v['cities'])
    country_names.append(v['name'])
print(sorted(country_names))

['Australia', 'Austria', 'Belgium', 'China', 'Denmark', 'Egypt', 'Finland', 'Germany', 'Hungary', 'India', 'Iran', 'Italy', 'New Zealand', 'Nigeria', 'Pakistan', 'Poland', 'Romania', 'Spain', 'Sweden', 'The Netherlands', 'The United Kingdom', 'The United States of America', 'Ukraine']


In [72]:
# Checks if we have manually put in all of the cities, should print nothing
n_total = 0
for f in os.listdir('tweets_new'):
    if f.endswith('.json'):
        city = f.split('tweet')[-1].replace('.json','')
        tweets = json.load(open('tweets_new/'+f,'r'))['tweets']
        n_total += len(tweets)
        if city not in cities:
            print(city)
print('Total tweets:',n_total)

s_AUS
s_AUT
s_BEL
s_CHN
s_DEU
s_DNK
s_EGY
s_ESP
s_FIN
s_GBR
s_HUN
s_IND
s_IRN
s_ITA
s_NGA
s_NLD
s_NZL
s_PAK
s_POL
s_ROU
s_SWE
s_UKR
s_USA
Total tweets: 56532


## Geocode tweets based on user location

### Almost none of the tweets have coordinates attached, so we use geocoders and fuzzy matching to get them

In [80]:
def geocode_request(tweet):    
    if tweet['coordinates'] is not None:
        return tweet['coordinates']
    elif tweet['place_coords'] is not None:
        return np.array(tweet['place_coords'][0]).mean(axis=0).tolist()
    
    # Order: Geonames => TomTom => Google Maps 
    # If we get matches, return the best based on string similarity
    scores = []
    query = tweet['user_location'].lower()
    if len(query) <= 3:
        return None
    
    for geocoder in [geonames_request, tomtom_request, google_geocode_request]:
        result = geocoder(query)
        
        for r in result:
            name = r['name']
            try:
                scores.append(fuzz.partial_ratio(query, name.lower()))
            # When user location is not in our alfabet (e.g. russian), this will give an error, so we return None
            except ValueError:
                return None
        
        if len(result) > 0:
            result = result[np.argmax(scores)]
            return [result['lng'],result['lat']]

    return None


# This code was reused from another project
def geonames_request(query, lat=None, lng=None, radius=5000, max_rows=40):
    usernames = ['XXX', 'XXX', 'XXX']
    idx = random.randint(0, 2)
    username = usernames[idx]
    query = urllib.parse.quote(query, safe='')
    url = 'http://api.geonames.org/searchJSON?q=' + query + '&lang=local&&country=BE&fuzzy=0.3&maxRows=' + str(max_rows) + '&username=' + username
    if radius is not None:
        radius = '&radius=' + str(radius / 1000)
        url += radius
    if lat is not None and lng is not None:
        se, nw = get_bbox_from_radius(lat, lng, radius / 1000)
        s = str(se[0])
        e = str(se[1])
        n = str(nw[0])
        w = str(nw[1])
        bounds = '&south=' + s + '&north=' + n + '&west=' + w + '&east=' + e
        url += bounds

    response = requests.get(url)

    geonames_result = response.json()

    result = []

    duplicate = False
    unique = {}

    for i, g in enumerate(geonames_result['geonames']):

        obj = {'lat': '', 'lng': '', 'name': '', 'type': ''}
        type_ = ''
        name = ''

        if 'lat' in g:
            obj['lat'] = float(g['lat'])
        if 'lng' in g:
            obj['lng'] = float(g['lng'])
        # Prefer to use name for local
        if 'name' in g:
            name = g['name']
            obj['name'] = name
        if 'fcl' in g:
            fcl = g['fcl']
            if fcl == 'P':
                type_ = 'locality'
            elif fcl == 'A':
                type_ = 'administrative_area'
            elif fcl == 'H':
                type_ = 'stream,lake'
            elif fcl == 'R':
                type_ = 'road'
            elif fcl == 'L':
                type_ = 'parks,area'
            elif fcl == 'V':
                type_ = 'forest'
            elif fcl == 'T':
                type_ = 'mountain,hill'
            elif fcl == 'S':
                type_ = 'spot,building,farm'
            else:
                type_ = fcl

            obj['type'] = type_

        if name in unique:
            
            idx = unique[name]
            duplicate = result[idx]

            if duplicate['type'] == 'administrative_area' and type_ == 'locality':
                result[idx] = obj
            '''
            elif duplicate['type'] == 'locality' and type_ == 'administrative_area':
                continue
            '''
        else:
            unique[obj['name']] = i

        result.append(obj)

    return result

# This code was reused from another project
def tomtom_request(query, lat=None, lng=None, radius=5000, idx_set=None):
    """
    Fuzzy geocode query in a specified area
    :param query:
    :param idx_set:
    :param lat:
    :param lng:
    :param radius:
    :return:
    """
    query = urllib.parse.quote(query, safe='')

    url = 'https://api.tomtom.com/search/2/search/' + query + \
          '.json?typeahead=true&countrySet=BE&limit=100&minFuzzyLevel=1&maxFuzzyLevel=4&language=nl-BE&key' \
          '=XXX'

    if idx_set is None:
        idx_set = 'Geo,Addr,PAD,Str'

    url += '&idxSet=' + idx_set

    if lat is not None and lng is not None:
        url += '&lat=' + str(lat) + '&lon=' + str(lng)
    if radius is not None:
        url += '&radius=' + str(radius)
    try:
        response = requests.get(url)
        tomtom_result = response.json()
    except BaseException as e:
        print('ERROR TOMTOM REQUEST')
        print('url', url)
        print(response)
        print(e)
        return []

    res = []
    results = tomtom_result['results']

    def lower_first_letter(s):
        return s[:1].lower() + s[1:] if s else ''

    for r in results:
        obj = {}
        type_ = r['type']
        addr = r['address']
        p = r['position']

        if type_ == 'Geography':
            entity_type = lower_first_letter(r['entityType'])
            if entity_type in addr:
                name = addr[entity_type]
            elif 'municipalitySubdivision' in addr:
                name = addr['municipalitySubdivision']
            else:
                name = addr['freeformAddress']
                print('freeformaddres')

        elif type_ == 'Street' or type_ == 'Point Address' or type_ == 'Address Range':
            name = addr['streetName']

        else:
            print('\nTYPEEE', type_, '\n')
            name = 'Unknown Type - TomTom'
            print(r)

        if 'dist' in r:
            obj['dist'] = r['dist']

        obj['lat'] = float(p['lat'])
        obj['lng'] = float(p['lon'])
        obj['name'] = name
        obj['type'] = type_

        res.append(obj)

    return res

# This code was reused from another project
def google_geocode_request(query, lat=None, lng=None, radius=5000):
    """
    Fuzzy geocoding query in a specified area
    :param query:
    :param lat:
    :param lng:
    :param radius:
    :return:
    """

    query = urllib.parse.quote(query, safe='')
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address=' + query + \
          '&language=nl&region=be&key=XXX'

    if lat is not None and lng is not None:
        se, nw = get_bbox_from_radius(lat, lng, radius / 1000)
        bounds = str(se[0]) + ',' + str(se[1]) + '|' + str(nw[0]) + ',' + str(nw[1])
        url += '&bounds=' + bounds

    response = requests.get(url)
    results = []
    google_result = response.json()

    def check_int(n):
        try:
            n = int(n)
            return True
        except BaseException as e:
            return False

    for r in google_result['results']:
        addr = r['address_components']
        name = ''
        type_ = ''
        for a in addr:

            # ignore house numbers as they are listed first
            if a['types'] == ['street_number'] or check_int(a['long_name']):
                continue

            name = a['long_name']
            type_ = ','.join(a['types'])
            break

        loc = r['geometry']['location']

        obj = {'lat': loc['lat'], 'lng': loc['lng'], 'name': name, 'type': type_}
        results.append(obj)

    filtered = []
    if lat is not None and lng is not None:
        for r in results:
            lat = r['lat']
            lng = r['lng']

            if lat < se[0] or lat > nw[0] or lng < nw[1] or lng > se[1]:
                pass
            else:
                filtered.append(r)

        return filtered
    else:
        return results


## Aggregate per country and perform sentiment analysis + text processing

In [89]:
countries_geojson = json.load(open('countries.geojson','r'))
try:
    geocode_dict = json.load(open('geocode_dict.json','r'))
except FileNotFoundError:
    geocode_dict = {}
    
# lat,lng pairs    
city_coordinates = {
    'Madrid':[40.4168,-3.7038],
    'Barcelona':[41.398371,2.1741],
    'Sevilla':[37.382826,-5.973167],
    'Zaragoza':[41.64531,-0.884861],
    'Bilbao':[43.260919,-2.938764],
    'Valencia':[39.466667,-0.366667],
    'Brussels':[50.833333,4.33333],
    'Antwerp':[51.213886,4.401514],
    'Liege':[50.638674,5.570228],
    'Mons':[50.45527,3.951623],
    'Rome':[41.9,12.48333],
    'Milan':[45.466667,9.2],
    'Amsterdam':[52.35,4.916667],
    'Rotterdam':[51.916667,4.5],
    'Utrecht':[52.093813,5.119095],
    'Eindhoven':[51.45,5.466667],
    'Groningen':[53.216667,6.55],
    'Zwolle':[52.505751,6.085822],
    'London':[51.514248,-0.093145],
    'Birmingham':[52.466667,-1.9166667],
    'Manchester':[53.5,-2.216667],
    'Leeds':[53.8,-1.583333],
    'Glasgow':[55.833333,-4.25],
    'Atlanta':[33.753746,-84.386330],
    'Austin':[30.266666,-97.733330],
    'NewYork':[40.730610,-73.935242],
    'Chicago':[41.881832,-87.623177],
    'Sanfrancisco':[37.773972,-122.431297],
    'Seattle':[47.608013,-122.335167],
    'Vienna':[48.2,16.366667],
    'Copenhagen':[55.666667,12.583333],
    'Stockholm':[59.333333,18.05],
    'Berlin':[52.516667,13.4],
    'Frankfurt':[50.11552,8.684167],
    'Hamburg':[53.575323,10.01534],
    'Munich':[48.15,11.583333],
    'Budapest':[47.5,19.083333],
    'Warsaw':[52.25,21.0],
    'Kiev':[50.433333,30.516667],
    'Bucharest':[44.433333,26.1],
    'Helsinki':[60.175556,24.934167],
    'Karachi':[24.9056,67.0822],
    'Islamabad':[33.69,73.0551],
    'Delhi':[28.651952,77.231495],
    'Mumbai':[18.987807,72.836447],
    'Beijing':[39.928819,116.388869],
    'Wuhan':[30.583333,114.266667],
    'Cairo':[30.07708,31.285909],
    'Kano':[12.002381,8.51316],
    'Tehran':[35.705,51.4216],
    'Sydney':[-33.861481,151.205475],
    'Perth':[-31.95224,115.861397],
    'Auckland':[-36.866667,174.766667]
}

In [91]:
pattern = re.compile('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)')
lemmatizer = WordNetLemmatizer()
#source: https://github.com/trinker/lexicon/issues/2
pronouns = {"he", "her", "hers", "herself", "him", "himself", "his", "i", 
"it", "its", "me", "mine", "my", "myself", "our", "ours", "ourselves", 
"she", "thee", "their", "them", "themselves", "they", "thou", 
"thy", "thyself", "us", "we", "ye", "you", "your", "yours", "yourself",
"we"}
# some additional 'junk' words we want to avoid in the wordclouds
other = {'amp','like','also','day','year','month','even','want','would','one','two','three','four','five','really',
        'get','let','ever'}
stop_words = set(stopwords.words('english'))
ignored_words = stop_words | pronouns | other

unique_loc = set()

for country,obj in countries.items():
    print('Current:',country)
    merged = []
    words = {}
    avg_polarity = 0
    n = 0
    geojson = None
    # get correct geojson
    for f in countries_geojson['features']:
        if f['properties']['ISO_A3'] == country:
            geojson = f
            break
    assert geojson is not None,'NO GEOJSON FOUND FOR '+country
        
    t1 = time.time()
    for city in obj['cities']:
        print('city',city)
        fname = 'tweets_new/tweet'+city+'.json'
        tweets = json.load(open(fname,'r'))['tweets']
        for tweet in tweets:
            # get geolocation, save results to speed up duplicate locations
            loc = tweet['user_location'].lower()
            if len(loc) > 3:
                if loc in geocode_dict:
                    coordinates = geocode_dict[loc]
                else:
                    coordinates = geocode_request(tweet)
                    geocode_dict[loc] = coordinates
                    if coordinates is None:
                        # change to lng,lat
                        coordinates = city_coordinates[city][::-1]

                    # save after each api request, so when we have an error it is still saved
                    json.dump(geocode_dict,open('geocode_dict.json','w'))
            else:
                coordinates = None
                
            # preprocessing
            processed_text = re.sub(pattern, ' ', tweet['text'].lower())
            processed_text = processed_text.replace('  ',' ').strip()
        
            # sentiment
            polarity = TextBlob(processed_text).sentiment.polarity
            # remove stopwords and lemmatize
            tokens = word_tokenize(processed_text) 
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
            tokens = [t for t in tokens if t not in ignored_words] 
            tokens = [t for t in tokens if len(t) > 2 and not t.isnumeric()]
            processed_text = ' '.join(t for t in tokens)
            for t in tokens:
                if t in words:
                    words[t] += 1
                else:
                    words[t] = 1
            
            tweet['polarity'] = polarity
            tweet['city_name'] = city
            tweet['processed_text'] = processed_text
            tweet['coordinates'] = coordinates
            tweet['country'] = country
            unique_loc.add(tweet['user_location'].lower())
            avg_polarity += polarity
            n += 1
         
        merged.extend(tweets)
    print('Time elapsed:',time.time()-t1)
    print('Total unique locations:',len(unique_loc))
    print()
    avg_polarity /= n
    # Get top 100 words
    top_words = dict(sorted(words.items(),reverse=True,key=lambda x:x[1])[:100])
    #print('Top words:',top_words)
    json.dump({'words':top_words,'polarity':avg_polarity,'tweets':merged,'geojson':geojson},open('tweets_new/tweets_'+country+'.json','w'))
    

Current: BEL
city Antwerp
city Brussels
city Liege
city Mons
Time elapsed: 0.768923282623291
Total unique locations: 236

Current: NLD
city Amsterdam
city Eindhoven
city Groningen
city Rotterdam
city Utrecht
city Zwolle
Time elapsed: 1.5333583354949951
Total unique locations: 556

Current: ESP
city Barcelona
city Bilbao
city Madrid
city Sevilla
city Valencia
city Zaragoza
Time elapsed: 1.0839056968688965
Total unique locations: 928

Current: USA
city Atlanta
city Austin
city Chicago
city NewYork
city Sanfrancisco
city Seattle
Time elapsed: 3.4042372703552246
Total unique locations: 2161

Current: DEU
city Berlin
city Frankfurt
city Hamburg
city Munich
Time elapsed: 1.2055394649505615
Total unique locations: 2370

Current: CHN
city Beijing
city Wuhan
Time elapsed: 0.27649521827697754
Total unique locations: 2408

Current: ITA
city Milan
city Rome
Time elapsed: 0.3682861328125
Total unique locations: 2506

Current: GBR
city Birmingham
city Glasgow
city Leeds
city London
city Manchester
T

## For the other dataset of tweets, we determine the country by checking if the coordinates are inside of any of the country polygons

### For each tweet, perform the same sentiment analysis and preprocessing, check in which of the selected countries it was made, if any, and add the country name to the object. This will ensure both datasets are uniform, making further processing much easier

#### Note: This was also possible to do more efficiently after importing the tweets into the database and by using the django GEOS operations, however, this way it was much easier to code and understand.

In [108]:
from shapely.geometry import shape,Point 

shapes = []
country_names = list(countries.keys())
print(country_names)

# Get shape of all the countries we have selected
for country in country_names:
    for f in countries_geojson['features']:
        if f['properties']['ISO_A3'] == country:
            geojson = f
            s = shape(f['geometry'])
            shapes.append(s)


# This may take a while
tweets = json.load(open('tweets.json','r'))['tweets']
print('Total tweets:',len(tweets))
counter = 0
t1 = time.time()
for tweet in tweets:
    
    processed_text = re.sub(pattern, ' ', tweet['text'].lower())
    processed_text = processed_text.replace('  ',' ').strip()
    # sentiment
    polarity = TextBlob(processed_text).sentiment.polarity
    # remove stopwords and lemmatize
    tokens = word_tokenize(processed_text) 
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in ignored_words] 
    tokens = [t for t in tokens if len(t) > 2 and not t.isnumeric()]
    processed_text = ' '.join(t for t in tokens)
  
    tweet['polarity'] = polarity
    tweet['processed_text'] = processed_text
    
    if 'country' not in tweet:
        coords = tweet['coordinates']['coordinates']
        tweet['coordinates'] = coords
        point = Point(coords)
        for i,s in enumerate(shapes):
            if s.contains(point):
                tweet['country'] = country_names[i]
                break
                
    counter += 1
    
    if counter % 5000 == 0:
        print('Finished',counter,'/',len(tweets),'time elapsed:',round(time.time()-t1,3),'s')
        
# save to a different file to avoid any errors that might have mutated the original data
json.dump({'tweets':tweets},open('tweets2.json','w'))

['BEL', 'NLD', 'ESP', 'USA', 'DEU', 'CHN', 'ITA', 'GBR', 'IND', 'SWE', 'FIN', 'DNK', 'PAK', 'AUS', 'ROU', 'HUN', 'EGY', 'UKR', 'IRN', 'AUT', 'POL', 'NZL', 'NGA']
Total tweets: 41367
Finished 5000 / 41367 time elapsed: 93.266 s
Finished 10000 / 41367 time elapsed: 186.814 s
Finished 15000 / 41367 time elapsed: 280.111 s
Finished 20000 / 41367 time elapsed: 373.772 s
Finished 25000 / 41367 time elapsed: 467.022 s
Finished 30000 / 41367 time elapsed: 561.505 s
Finished 35000 / 41367 time elapsed: 656.008 s
Finished 40000 / 41367 time elapsed: 751.309 s


### Note: It wasn't necessary to generate word aggregations, as we can do it in real time on the server, it was needed to do the text processing however, as that takes too long for a realtime app


In [109]:
def add_tweet(tweet):
    polarity = tweet['polarity']
    tokens = tweet['processed_text'].split()
    # pos or neg
    if polarity >= .2:
            index = 1
    elif polarity <= -.2:
        index = 2
    else:
        index = -1

    for t in tokens:
        amounts[0] += 1
        avg_polarities[0] += polarity
        if t in words[0]:
            words[0][t] += 1
        else:
            words[0][t] = 1

        if index > 0:
            amounts[index] += 1
            avg_polarities[index] += polarity
            if t in words[index]:
                words[index][t] += 1

            else:
                words[index][t] = 1

# Need global vars for easy manipulation
# all,pos,neg
words = [{},{},{}]
avg_polarities = [0,0,0]
amounts = [0,0,0]

for country,obj in countries.items():
    t1 = time.time()
    print('Current:',country)
    
    # Need to redeclare for each country
    words = [{},{},{}]
    avg_polarities = [0,0,0]
    amounts = [0,0,0]

    obj_country = json.load(open('tweets_new/tweets_'+country+'.json','r'))
    tweets_country = obj_country['tweets']  
    
    for tweet in tweets_country:
        add_tweet(tweet)
    # Here, only take the tweets in the respective country
    for tweet in tweets:
        if 'country' in tweet and tweet['country'] == country:
            add_tweet(tweet)
            tweets_country.append(tweet)
            
    avg_polarities = [avg_polarities[i]/amounts[i] for i in range(len(amounts))]
     # Get top 100 words for each sentiment
    top_words = [dict(sorted(w.items(),reverse=True,key=lambda x:x[1])[:100]) for w in words]
    
    obj_country['name_iso'] = country
    obj_country['name'] = countries[country]['name']
    obj_country['words'] = top_words
    obj_country['amounts'] = amounts
    obj_country['polarity'] = avg_polarities
    obj_country['tweets'] = tweets_country
        
    print('Time elapsed:',time.time()-t1)
    print()
    # Save in different file just to make sure
    json.dump(obj_country,open('tweets_final/tweets_'+country+'.json','w'))
    


Current: BEL
Time elapsed: 0.03705906867980957

Current: NLD
Time elapsed: 0.05267906188964844

Current: ESP
Time elapsed: 0.04389595985412598

Current: USA
Time elapsed: 0.34644007682800293

Current: DEU
Time elapsed: 0.05076265335083008

Current: CHN
Time elapsed: 0.0400388240814209

Current: ITA
Time elapsed: 0.03120899200439453

Current: GBR
Time elapsed: 0.1801433563232422

Current: IND
Time elapsed: 0.10346651077270508

Current: SWE
Time elapsed: 0.031241893768310547

Current: FIN
Time elapsed: 0.018566608428955078

Current: DNK
Time elapsed: 0.02149486541748047

Current: PAK
Time elapsed: 0.05954432487487793

Current: AUS
Time elapsed: 0.07219719886779785

Current: ROU
Time elapsed: 0.014639854431152344

Current: HUN
Time elapsed: 0.016592025756835938

Current: EGY
Time elapsed: 0.022448301315307617

Current: UKR
Time elapsed: 0.015616178512573242

Current: IRN
Time elapsed: 0.02342510223388672

Current: AUT
Time elapsed: 0.020496129989624023

Current: POL
Time elapsed: 0.023448