# Exploring the location data of disaster tweets

## Loading the data

In [1]:
import pandas as pd
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
test = pd.read_csv('data/test.csv')

In [3]:
locations = train.loc[~train['location'].isnull(), ['location','target']]
locations.head()

Unnamed: 0,location,target
31,Birmingham,1
32,Est. September 2012 - Bristol,0
33,AFRICA,1
34,"Philadelphia, PA",0
35,"London, UK",0


In [27]:
locations_test = pd.DataFrame(test.loc[~test['location'].isnull(), 'location'])
locations_test.head()

Unnamed: 0,location
15,London
16,Niall's place | SAF 12 SQUAD |
17,NIGERIA
18,Live On Webcam
19,"Los Angeles, Califnordia"


# Missing location as a feature?

In [5]:
train['has_location'] = ~train['location'].isnull()
available_locations = sum(train['has_location'])/len(train)
print(f'Available locations {round(100*available_locations)}%')

Available locations 67%


In [6]:
location_target_mean = train.loc[train['has_location'], 'target'].mean()
print(f'Disaster tweets with location provided: {round(100*location_target_mean)}%')

Disaster tweets with location provided: 43%


In [7]:
nolocation_target_mean = train.loc[~train['has_location'], 'target'].mean()
print(f'Disaster tweets with no location provided: {round(100*nolocation_target_mean)}%')

Disaster tweets with no location provided: 42%


Missing location on its own doesn't seem to be a very strong feature. Can we find some more information in the locations themselves?

# Use the Nominatim geocoder to find the coordinates of the locations

In [8]:
from geopandas.tools import geocode

### Encode a single location

In [9]:
locations['location'].iloc[0]

'Birmingham'

In [10]:
from geopy import Nominatim
locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode(locations['location'].iloc[0])

In [11]:
location

Location(Birmingham, West Midlands Combined Authority, England, United Kingdom, (52.4796992, -1.9026911, 0.0))

In [12]:
print(location.longitude, location.latitude, location.altitude)

-1.9026911 52.4796992 0.0


In [13]:
location.point

Point(52.4796992, -1.9026911, 0.0)

### Encode all

In [28]:
import os
from geopy.extra.rate_limiter import RateLimiter
from geopy import Nominatim

def encode_locations(locations, pickle_file):
    coded_locations = pd.Series(dtype='object')
    offset = 0
    if os.path.exists(pickle_file):
        coded_locations = pd.read_pickle(pickle_file)
        offset = len(coded_locations)
    locator = Nominatim(user_agent="myGeocoder")
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    CHUNK_SIZE = 60
    for i in range(offset, len(locations), CHUNK_SIZE):
        print(i)
        end = i + CHUNK_SIZE
        if end>len(locations):
            end = len(locations)    
        coded_locations = coded_locations.append(locations.iloc[i:end]['location'].apply(geocode))
        coded_locations.to_pickle(pickle_file)
    return coded_locations

In [29]:
coded_locations = encode_locations(locations, 'coded_locations.pickle')

In [30]:
coded_locations_test = encode_locations(locations_test, 'coded_locations_test.pickle')

0
60
120
180
240
300
360
420
480
540
600
660
720
780
840
900
960
1020
1080
1140
1200
1260
1320
1380
1440
1500
1560
1620
1680
1740
1800


RateLimiter caught an error, retrying (0/2 tries). Called with (*('SP - Brasil #1',), **{}).
Traceback (most recent call last):
  File "/home/herwin/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/herwin/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/home/herwin/anaconda3/lib/python3.8/http/client.py", line 1332, in getresponse
    response.begin()
  File "/home/herwin/anaconda3/lib/python3.8/http/client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "/home/herwin/anaconda3/lib/python3.8/http/client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/herwin/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/home/herwin/anaconda3

RateLimiter swallowed an error after 2 retries. Called with (*('SP - Brasil #1',), **{}).
Traceback (most recent call last):
  File "/home/herwin/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/home/herwin/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/home/herwin/anaconda3/lib/python3.8/http/client.py", line 1332, in getresponse
    response.begin()
  File "/home/herwin/anaconda3/lib/python3.8/http/client.py", line 303, in begin
    version, status, reason = self._read_status()
  File "/home/herwin/anaconda3/lib/python3.8/http/client.py", line 264, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/home/herwin/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/home/herwin/anaconda3/li

1860
1920
1980
2040
2100


In [76]:
len(coded_locations)

5080

In [77]:
len(locations)

5080

## Looking at missing geocodes

In [78]:
print(f"Missing: {round(100*sum(coded_locations.isnull())/len(coded_locations))}%")

Missing: 19%


In [87]:
points = coded_locations.apply(lambda loc: tuple(loc.point) if loc else None)
points
train[['latitude', 'longitude', 'altitude']] = pd.DataFrame(points.tolist(), index=points.index)

In [88]:
coded_locations[coded_locations.isnull()]="unknown"

In [89]:
train['coded_locations'] = coded_locations

In [90]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               7613 non-null   int64  
 1   keyword          7552 non-null   object 
 2   location         5080 non-null   object 
 3   text             7613 non-null   object 
 4   target           7613 non-null   int64  
 5   coded_locations  5080 non-null   object 
 6   geocoded         7613 non-null   bool   
 7   latitude         4116 non-null   float64
 8   longitude        4116 non-null   float64
 9   altitude         4116 non-null   float64
dtypes: bool(1), float64(3), int64(2), object(4)
memory usage: 542.8+ KB


In [92]:
train[train['coded_locations']=='unknown'].head(10)

Unnamed: 0,id,keyword,location,text,target,coded_locations,geocoded,latitude,longitude,altitude
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0,unknown,False,,,
40,59,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,unknown,False,,,
48,68,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0,unknown,False,,,
59,83,ablaze,"Edmonton, Alberta - Treaty 6",How the West was burned: Thousands of wildfire...,1,unknown,False,,,
61,86,ablaze,Inang Pamantasan,Progressive greetings!\n\nIn about a month stu...,0,unknown,False,,,
62,89,ablaze,Twitter Lockout in progress,Rene Ablaze &amp; Jacinta - Secret 2k13 (Falle...,0,unknown,False,,,
67,96,accident,CLVLND,'I can't have kids cuz I got in a bicycle acci...,0,unknown,False,,,
81,118,accident,Your Sister's Bedroom,I was in a horrible car accident this past Sun...,1,unknown,False,,,
102,146,aftershock,Instagram - @heyimginog,@afterShock_DeLo scuf ps live and the game... cya,0,unknown,False,,,
108,158,aftershock,Instagram - @heyimginog,@afterShock_DeLo im speaking from someone that...,0,unknown,False,,,


Missing geocodes seem to be mostly non-existing locations in the world. Is that a good feature?

In [93]:
train['geocoded'] = (train['coded_locations']!="unknown") & (~train['coded_locations'].isnull())

In [94]:
locations = train[~train['location'].isnull()]

In [95]:
unknown_locations = locations[locations['coded_locations']=="unknown"]
print(f"Disasters at unknown locations: {round(100*unknown_locations['target'].mean())}%") 

Disasters at unknown locations: 38%


In [96]:
known_locations = locations[locations['coded_locations']!="unknown"]
print(f"Disasters at known locations: {round(100*known_locations['target'].mean())}%") 

Disasters at known locations: 44%


Not such a super distinction, but might help.

## Looking at the location of the geocodes found

In [97]:
number_coded = sum((train['coded_locations'].isnull())&(train['coded_locations']!="unknown"))
print(f"Percent geocoded in trainingset: {round(100*number_coded/len(train))}%")

Percent geocoded in trainingset: 33%


### Disaster tweets

In [150]:
from folium.plugins import HeatMap
locs = known_locations[known_locations['target']==1]
hm_wide = HeatMap( list(zip(locs.latitude.values, locs.longitude.values, [1]*len(locs))),
                   min_opacity=0.2,
                   max_val=2,
                   radius=7, blur=12, 
                   max_zoom=1, 
                 )
hmap = folium.Map(location=[0,0],zoom_start=2)
hmap.add_child(hm_wide)
hmap

### Non disaster tweets

In [149]:
from folium.plugins import HeatMap
locs = known_locations[known_locations['target']==0]
hm_wide = HeatMap( list(zip(locs.latitude.values, locs.longitude.values, [1]*len(locs))),
                   min_opacity=0.2,
                   max_val=2,
                   radius=7, blur=12, 
                   max_zoom=1, 
                 )
hmap = folium.Map(location=[0,0],zoom_start=2)
hmap.add_child(hm_wide)

Location seems to mostly correlate with densly populated areas. However, there may be some differences in amounts of tweets send, e.g in Bangladesh, India, Southamerica, eastern Europe.