In [1]:
import pandas as pd
# Read the Haiti CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('ch08/Haiti.csv')
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3593 entries, 0 to 3592
Data columns (total 10 columns):
Serial            3593 non-null int64
INCIDENT TITLE    3593 non-null object
INCIDENT DATE     3593 non-null object
LOCATION          3592 non-null object
DESCRIPTION       3593 non-null object
CATEGORY          3587 non-null object
LATITUDE          3593 non-null float64
LONGITUDE         3593 non-null float64
APPROVED          3593 non-null object
VERIFIED          3593 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 280.8+ KB


In [7]:
# Display the column labels of the loaded frame.
data.columns

Index([u'Serial', u'INCIDENT TITLE', u'INCIDENT DATE', u'LOCATION',
       u'DESCRIPTION', u'CATEGORY', u'LATITUDE', u'LONGITUDE', u'APPROVED',
       u'VERIFIED'],
      dtype='object')

In [2]:
# Preview the timestamp and coordinate columns for the first ten rows.
data.loc[:, ['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']].head(10)

Unnamed: 0,INCIDENT DATE,LATITUDE,LONGITUDE
0,05/07/2010 17:26,18.233333,-72.533333
1,28/06/2010 23:06,50.226029,5.729886
2,24/06/2010 16:21,22.278381,114.174287
3,20/06/2010 21:59,44.407062,8.933989
4,18/05/2010 16:26,18.571084,-72.334671
5,26/04/2010 13:14,18.593707,-72.310079
6,26/04/2010 14:19,18.4828,-73.6388
7,26/04/2010 14:27,18.415,-73.195
8,15/03/2010 10:58,18.517443,-72.236841
9,15/03/2010 11:00,18.54779,-72.41001


In [6]:
# Number of rows that carry a CATEGORY value.
pd.notnull(data['CATEGORY']).sum()

3587

In [10]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output reports count 3569, which matches the
# filtered frame rather than the 3593 rows loaded from the CSV, so this cell
# appears to have been executed after the filtering cell below. Confirm with
# Restart & Run All.
data.describe()

Unnamed: 0,Serial,LATITUDE,LONGITUDE
count,3569.0,3569.0,3569.0
mean,2081.498459,18.592503,-72.424994
std,1170.311824,0.273695,0.291018
min,4.0,18.041313,-74.452757
25%,1074.0,18.5242,-72.417498
50%,2166.0,18.539269,-72.335
75%,3089.0,18.5618,-72.293939
max,4052.0,19.94063,-71.099489


In [11]:
# Keep only categorised reports whose coordinates fall strictly inside the
# 18..20 latitude / -75..-70 longitude box.
data = data[
    data.CATEGORY.notnull()
    & data.LATITUDE.gt(18) & data.LATITUDE.lt(20)
    & data.LONGITUDE.gt(-75) & data.LONGITUDE.lt(-70)
]

In [12]:
# Number of rows remaining in the frame.
data.shape[0]

3569

In [13]:
def to_cat_list(catstr):
    """Split a comma-separated category string into its individual labels.

    Each piece is stripped of surrounding whitespace and empty pieces are
    dropped. Missing values (``None`` or a float NaN) yield an empty list
    rather than the bogus labels ``'None'`` / ``'nan'`` that ``str()`` would
    otherwise produce.
    """
    # NaN is the only float that is not equal to itself.
    if catstr is None or (isinstance(catstr, float) and catstr != catstr):
        return []
    stripped = (x.strip() for x in str(catstr).split(','))
    return [x for x in stripped if x]

def get_all_categories(cat_series):
    """Return the sorted list of distinct category labels found in cat_series.

    Each element of ``cat_series`` is split with ``to_cat_list``; the labels
    from all elements are merged into one set and returned in sorted order.
    An empty ``cat_series`` yields an empty list.
    """
    cat_sets = (set(to_cat_list(x)) for x in cat_series)
    # Start from an empty set instance: the unbound ``set.union(*())`` raises
    # TypeError when there are no sets to unpack.
    return sorted(set().union(*cat_sets))

def get_english(cat):
    """Split a category label into ``(code, name)``.

    The code is the text before the first ``'.'``. If the remainder holds a
    ``'|'``-separated pair of names, the part after the first ``'|'`` is used
    as the name; otherwise the whole remainder is used. The name is returned
    stripped of surrounding whitespace.

    Raises ValueError if ``cat`` contains no ``'.'``.
    """
    # Split on the first '.' only, so a name that itself contains a period
    # (e.g. "e.g.") does not break the two-way unpacking.
    code, names = cat.split('.', 1)
    if '|' in names:
        # Split on the bare '|' so missing spaces around it are tolerated;
        # the final strip() removes any padding.
        names = names.split('|')[1]
    return code, names.strip()

# Quick check on a two-name label: the part after '|' is returned as the name.
get_english('2. Urgences logistiques | Vital Lines')

('2', 'Vital Lines')

In [14]:
# Sorted list of every distinct category label in the filtered reports.
all_cats = get_all_categories(data['CATEGORY'])

In [17]:
# Map each category code to its name as returned by get_english.
english_mapping = {code: name for code, name in map(get_english, all_cats)}
english_mapping['6c']

'Earthquake and aftershocks'

In [21]:
import numpy as np
from pandas import DataFrame
def get_code(seq):
    """Return the code (text before the first '.') of every non-empty label in seq."""
    codes = []
    for label in seq:
        if not label:
            # Falsy entries (e.g. empty strings) carry no code.
            continue
        codes.append(label.partition('.')[0])
    return codes
# One zero-filled indicator column per distinct category code, with rows
# aligned to the filtered reports.
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(
    np.zeros(shape=(data.shape[0], code_index.size)),
    index=data.index,
    columns=code_index,
)

In [22]:
# Inspect the first six indicator columns. Uses positional .iloc because the
# .ix indexer is deprecated and was removed in pandas 1.0.
dummy_frame.iloc[:, :6].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3569 entries, 0 to 3592
Data columns (total 6 columns):
1     3569 non-null float64
1a    3569 non-null float64
1b    3569 non-null float64
1c    3569 non-null float64
1d    3569 non-null float64
2     3569 non-null float64
dtypes: float64(6)
memory usage: 195.2 KB


In [24]:
# Set the indicator to 1 for every category code attached to each report.
# `row` is an index label and `codes` are column labels, so label-based .loc
# is the correct indexer (.ix is deprecated and was removed in pandas 1.0).
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

In [25]:
# Attach the indicator columns to the reports, prefixed with 'category_'.
data = data.join(
    dummy_frame.add_prefix('category_'),
    how='left',
)

In [26]:
# Inspect columns 10..14 by position (the first joined indicator columns).
# Uses .iloc because the .ix indexer is deprecated and was removed in pandas 1.0.
data.iloc[:, 10:15].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3569 entries, 0 to 3592
Data columns (total 5 columns):
category_1     3569 non-null float64
category_1a    3569 non-null float64
category_1b    3569 non-null float64
category_1c    3569 non-null float64
category_1d    3569 non-null float64
dtypes: float64(5)
memory usage: 167.3 KB


In [29]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,
                   lllon=-75, urlon=-71):
    """Create a Basemap covering the given bounding box and draw its outlines.

    Parameters
    ----------
    ax : matplotlib Axes, optional
        Axes to draw on; passed straight through to Basemap.
    lllat, urlat : float
        Latitude of the lower-left and upper-right corners.
    lllon, urlon : float
        Longitude of the lower-left and upper-right corners.

    Returns
    -------
    The Basemap instance, with coastlines, states and countries drawn.
    """
    # Create a stereographic Basemap instance centred on the bounding box.
    # Divide by 2.0 so integer bounds are not floor-divided under Python 2.
    m = Basemap(ax=ax, projection='stere',
               lon_0=(urlon + lllon) / 2.0,
               lat_0=(urlat + lllat) / 2.0,
               llcrnrlat=lllat, urcrnrlat=urlat,
               llcrnrlon=lllon, urcrnrlon=urlon,
               resolution='f')
    # Draw coastlines, state and country boundaries.
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m

# Draw one map per selected category code on a 2x2 grid of axes.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat, urlat, lllon, urlon = 17.25, 20.25, -75, -71
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,
                        lllon=lllon, urlon=urlon)
    # Reports flagged with this category code.
    cat_data = data[data['category_%s' % code] == 1]
    # Compute map projection coordinates.
    x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)
    m.plot(x, y, 'k.', alpha=0.5)
    # Title each panel with the code and its name from english_mapping.
    ax.set_title('%s: %s' % (code, english_mapping[code]))
plt.show()

ImportError: No module named basemap