# Estruturas de dados Pandas

In [1]:
import numpy as np

In [2]:
# Load the semicolon-delimited CSV into a NumPy structured array:
# names=True takes the field names from the header row and dtype=None
# lets NumPy infer a type for each column.
data = np.genfromtxt('data/example_data.csv', delimiter=';',
                     names=True, dtype=None, encoding='UTF')
data

array([('2018-10-13 11:10:23.560', '262km NW of Ozernovskiy, Russia', 'mww', 6.7, 'green', 1),
       ('2018-10-13 04:34:15.580', '25km E of Bitung, Indonesia', 'mww', 5.2, 'green', 0),
       ('2018-10-13 00:13:46.220', '42km WNW of Sola, Vanuatu', 'mww', 5.7, 'green', 0),
       ('2018-10-12 21:09:49.240', '13km E of Nueva Concepcion, Guatemala', 'mww', 5.7, 'green', 0),
       ('2018-10-12 02:52:03.620', '128km SE of Kimbe, Papua New Guinea', 'mww', 5.6, 'green', 1)],
      dtype=[('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i8')])

In [3]:
# One-dimensional: each element is a whole record, so the fields do not form a second axis.
data.shape

(5,)

In [4]:
# Field names and per-field types of the structured array.
data.dtype

dtype([('time', '<U23'), ('place', '<U37'), ('magType', '<U3'), ('mag', '<f8'), ('alert', '<U5'), ('tsunami', '<i8')])

In [5]:
%%timeit
# Baseline timing: pull the field at position 3 out of every record with a Python-level loop.
max([row[3] for row in data])

21.7 µs ± 1.09 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [6]:
# Re-organise the records column-wise: one NumPy array per field,
# keyed by the field name.
array_dict = dict(
    (name, np.array([record[position] for record in data]))
    for position, name in enumerate(data.dtype.names)
)
array_dict

{'time': array(['2018-10-13 11:10:23.560', '2018-10-13 04:34:15.580',
        '2018-10-13 00:13:46.220', '2018-10-12 21:09:49.240',
        '2018-10-12 02:52:03.620'], dtype='<U23'),
 'place': array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
        '42km WNW of Sola, Vanuatu',
        '13km E of Nueva Concepcion, Guatemala',
        '128km SE of Kimbe, Papua New Guinea'], dtype='<U37'),
 'magType': array(['mww', 'mww', 'mww', 'mww', 'mww'], dtype='<U3'),
 'mag': array([6.7, 5.2, 5.7, 5.7, 5.6]),
 'alert': array(['green', 'green', 'green', 'green', 'green'], dtype='<U5'),
 'tsunami': array([1, 0, 0, 0, 1])}

In [7]:
%%timeit
# Same maximum, computed by NumPy on the dedicated 'mag' array, for comparison with the loop-based timing.
array_dict['mag'].max()

8.83 µs ± 352 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Classe Series

In [8]:
import pandas as pd

In [9]:
# Wrap the NumPy array of places in a named pandas Series.
place = pd.Series(array_dict['place'], name='place')
place

0          262km NW of Ozernovskiy, Russia
1              25km E of Bitung, Indonesia
2                42km WNW of Sola, Vanuatu
3    13km E of Nueva Concepcion, Guatemala
4      128km SE of Kimbe, Papua New Guinea
Name: place, dtype: object

In [10]:
# The data held by the Series, as a NumPy array.
place.values

array(['262km NW of Ozernovskiy, Russia', '25km E of Bitung, Indonesia',
       '42km WNW of Sola, Vanuatu',
       '13km E of Nueva Concepcion, Guatemala',
       '128km SE of Kimbe, Papua New Guinea'], dtype=object)

In [11]:
# NumPy adds arrays element by element, pairing values purely by position.
np.array([1,1,1]) + np.array([-1,0,1])

array([0, 1, 2])

In [12]:
# The same five evenly spaced values, labelled 0-4 in x and 1-5 in y.
numbers = np.linspace(start=0, stop=10, num=5)
x = pd.Series(data=numbers)
y = pd.Series(data=numbers, index=pd.Index(list(range(1, 6))))

# Addition pairs values by index label, so a label that exists in only
# one operand (0 and 5 here) yields NaN.
x + y

0     NaN
1     2.5
2     7.5
3    12.5
4    17.5
5     NaN
dtype: float64

In [13]:
# Build a DataFrame from the dict of arrays: each key becomes a column.
df = pd.DataFrame(array_dict)
df

Unnamed: 0,time,place,magType,mag,alert,tsunami
0,2018-10-13 11:10:23.560,"262km NW of Ozernovskiy, Russia",mww,6.7,green,1
1,2018-10-13 04:34:15.580,"25km E of Bitung, Indonesia",mww,5.2,green,0
2,2018-10-13 00:13:46.220,"42km WNW of Sola, Vanuatu",mww,5.7,green,0
3,2018-10-12 21:09:49.240,"13km E of Nueva Concepcion, Guatemala",mww,5.7,green,0
4,2018-10-12 02:52:03.620,"128km SE of Kimbe, Papua New Guinea",mww,5.6,green,1


In [14]:
# Data type of each column.
df.dtypes

time        object
place       object
magType     object
mag        float64
alert       object
tsunami      int64
dtype: object

In [15]:
# Row labels of the DataFrame (no index was passed when it was built).
df.index

RangeIndex(start=0, stop=5, step=1)

## Criando DataFrames

In [16]:
import datetime as dt
import numpy as np
import pandas as pd

In [17]:
# Seed NumPy's global random state so the values shown are reproducible.
np.random.seed(0)
pd.Series(np.random.rand(5), name='random')


0    0.548814
1    0.715189
2    0.602763
3    0.544883
4    0.423655
Name: random, dtype: float64

In [18]:
# Seed first so both the random column and the truth flags are reproducible.
np.random.seed(0)
df = pd.DataFrame(
    # Keyword arguments are evaluated left to right, so the random draws
    # happen in the order: 'random' column first, then the 'truth' flags.
    data=dict(
        random=np.random.rand(5),
        text=['hot', 'warm', 'cool', 'cold', None],
        truth=[np.random.choice([True, False]) for _ in range(5)],
    ),
    # Five consecutive days ending on 2021-11-03, used as the row labels.
    index=pd.date_range(
        end=dt.date(2021, 11, 3), periods=5, freq='1D', name='date'
    ),
)
df

Unnamed: 0_level_0,random,text,truth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-10-30,0.548814,hot,False
2021-10-31,0.715189,warm,True
2021-11-01,0.602763,cool,True
2021-11-02,0.544883,cold,False
2021-11-03,0.423655,,True


In [19]:
# Count the lines of the local CSV file with the shell's wc utility.
!wc -l data/earthquakes.csv

9332 data/earthquakes.csv


## Criando DataFrames a partir de um arquivo csv

In [20]:
# df = pd.read_csv('data/earthquakes.csv') ## Local file
# Read the CSV from GitHub instead of from disk; the three adjacent string
# literals are concatenated into a single URL.
df = pd.read_csv(
    'https://github.com/stefmolin/'
    'Hands-On-Data-Analysis-with-Pandas-2nd-edition'
    '/blob/master/ch_02/data/earthquakes.csv?raw=True'
)
df.head(3)

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.008693,,85.0,",ci37389218,",1.35,ml,...,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventp...
1,,,37389202,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02003,,79.0,",ci37389202,",1.29,ml,...,",ci,",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475253925,https://earthquake.usgs.gov/earthquakes/eventp...
2,,4.4,37389194,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02137,28.0,21.0,",ci37389194,",3.42,ml,...,",ci,",automatic,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,earthquake,",dyfi,focal-mechanism,geoserve,nearby-cities,o...",-480.0,1539536756176,https://earthquake.usgs.gov/earthquakes/eventp...


## Criando DataFrames a partir de um banco de dados SQL

In [21]:
import sqlite3

In [22]:
# Copy the tsunami CSV into a `tsunamis` table of the SQLite database,
# replacing the table if it already exists.
# A sqlite3 connection used as a context manager only commits (or rolls
# back) the transaction; it does not close the connection, so the handle
# is closed explicitly once the write is done.
connection = sqlite3.connect('data/quakes.db')
try:
    with connection:  # commit on success, roll back on error
        pd.read_csv('data/tsunamis.csv').to_sql(
            'tsunamis', connection, index=False, if_exists='replace'
        )
finally:
    connection.close()

In [23]:
# Load the whole `tsunamis` table into a DataFrame.
# The connection is closed explicitly: a sqlite3 connection used in a
# `with` statement manages the transaction only and would stay open.
connection = sqlite3.connect('data/quakes.db')
try:
    tsunamis = pd.read_sql('SELECT * FROM tsunamis', connection)
finally:
    connection.close()

tsunamis.head()

Unnamed: 0,alert,type,title,place,magType,mag,time
0,,earthquake,"M 5.0 - 165km NNW of Flying Fish Cove, Christm...","165km NNW of Flying Fish Cove, Christmas Island",mww,5.0,1539459504090
1,green,earthquake,"M 6.7 - 262km NW of Ozernovskiy, Russia","262km NW of Ozernovskiy, Russia",mww,6.7,1539429023560
2,green,earthquake,"M 5.6 - 128km SE of Kimbe, Papua New Guinea","128km SE of Kimbe, Papua New Guinea",mww,5.6,1539312723620
3,green,earthquake,"M 6.5 - 148km S of Severo-Kuril'sk, Russia","148km S of Severo-Kuril'sk, Russia",mww,6.5,1539213362130
4,green,earthquake,"M 6.2 - 94km SW of Kokopo, Papua New Guinea","94km SW of Kokopo, Papua New Guinea",mww,6.2,1539208835130


## Criando DataFrames a partir de uma API

In [24]:
# Bibliotecas
import datetime as dt
import pandas as pd
import requests

In [29]:
# Query window: the 30 days ending yesterday.
yesterday = dt.date.today() - dt.timedelta(days=1)
api = 'https://earthquake.usgs.gov/fdsnws/event/1/query'
payload = {
    'format': 'geojson',
    'starttime': yesterday - dt.timedelta(days=30),
    'endtime': yesterday
}

# requests applies no timeout by default, so a stalled connection would
# block the kernel indefinitely; give up after 60 seconds without data.
response = requests.get(api, params=payload, timeout=60)

In [30]:
# HTTP status code returned for the request.
response.status_code

200

In [31]:
# Decode the response body from JSON and list its top-level keys.
earthquake_json = response.json()
earthquake_json.keys()

dict_keys(['type', 'metadata', 'features', 'bbox'])

In [32]:
# Inspect the 'metadata' entry of the payload.
earthquake_json['metadata']

{'generated': 1636213745000,
 'url': 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2021-10-06&endtime=2021-11-05',
 'title': 'USGS Earthquakes',
 'status': 200,
 'api': '1.12.3',
 'count': 9298}

In [33]:
# Check which container type holds the features.
type(earthquake_json['features'])

list

In [34]:
# Look at the first feature to see how a single entry is structured.
earthquake_json['features'][0]

{'type': 'Feature',
 'properties': {'mag': 1.37,
  'place': '15 km SSE of Sunnyside, Utah',
  'time': 1636070253460,
  'updated': 1636124397340,
  'tz': None,
  'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/uu60466797',
  'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=uu60466797&format=geojson',
  'felt': None,
  'cdi': None,
  'mmi': None,
  'alert': None,
  'status': 'reviewed',
  'tsunami': 0,
  'sig': 29,
  'net': 'uu',
  'code': '60466797',
  'ids': ',uu60466797,',
  'sources': ',uu,',
  'types': ',origin,phase-data,',
  'nst': 5,
  'dmin': 0.00986,
  'rms': 0.06,
  'gap': 194,
  'magType': 'md',
  'type': 'earthquake',
  'title': 'M 1.4 - 15 km SSE of Sunnyside, Utah'},
 'geometry': {'type': 'Point', 'coordinates': [-110.3135, 39.4281667, -1.34]},
 'id': 'uu60466797'}

In [36]:
# Keep only the 'properties' dict of each feature, in the original order.
earthquake_properties_data = list(
    map(lambda feature: feature['properties'], earthquake_json['features'])
)

In [38]:
# Turn the list of property dicts into a DataFrame: one row per dict,
# one column per key.
df = pd.DataFrame(earthquake_properties_data)
df.head()

Unnamed: 0,mag,place,time,updated,tz,url,detail,felt,cdi,mmi,...,ids,sources,types,nst,dmin,rms,gap,magType,type,title
0,1.37,"15 km SSE of Sunnyside, Utah",1636070253460,1636124397340,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",uu60466797,",",uu,",",origin,phase-data,",5.0,0.00986,0.06,194.0,md,earthquake,"M 1.4 - 15 km SSE of Sunnyside, Utah"
1,2.01,"8 km E of P?hala, Hawaii",1636070017760,1636070200760,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",hv72783087,",",hv,",",origin,phase-data,",32.0,,0.11,161.0,md,earthquake,"M 2.0 - 8 km E of P?hala, Hawaii"
2,3.3,"212 km SE of Perryville, Alaska",1636069157970,1636173221040,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,1.0,...,",us7000frjw,ak021e5rg938,",",us,ak,",",origin,phase-data,shakemap,",,,0.77,,ml,earthquake,"M 3.3 - 212 km SE of Perryville, Alaska"
3,1.79,"12km NE of Bella Vista, CA",1636068268860,1636133952382,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,1.0,1.0,,...,",nc73648780,",",nc,",",dyfi,nearby-cities,origin,phase-data,scitech-...",15.0,0.08972,0.13,64.0,md,earthquake,"M 1.8 - 12km NE of Bella Vista, CA"
4,0.66,"62 km N of Karluk, Alaska",1636067778950,1636073065450,,https://earthquake.usgs.gov/earthquakes/eventp...,https://earthquake.usgs.gov/fdsnws/event/1/que...,,,,...,",av91043643,",",av,",",origin,phase-data,",9.0,,0.17,262.0,ml,earthquake,"M 0.7 - 62 km N of Karluk, Alaska"


## Inspecionando um objeto DataFrame

In [39]:
# Bibliotecas
import numpy as np
import pandas as pd

In [40]:
# Load the data
df = pd.read_csv('data/earthquakes.csv')
df.head()

Unnamed: 0,alert,cdi,code,detail,dmin,felt,gap,ids,mag,magType,...,sources,status,time,title,tsunami,type,types,tz,updated,url
0,,,37389218,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.008693,,85.0,",ci37389218,",1.35,ml,...,",ci,",automatic,1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475395144,https://earthquake.usgs.gov/earthquakes/eventp...
1,,,37389202,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02003,,79.0,",ci37389202,",1.29,ml,...,",ci,",automatic,1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475253925,https://earthquake.usgs.gov/earthquakes/eventp...
2,,4.4,37389194,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02137,28.0,21.0,",ci37389194,",3.42,ml,...,",ci,",automatic,1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,earthquake,",dyfi,focal-mechanism,geoserve,nearby-cities,o...",-480.0,1539536756176,https://earthquake.usgs.gov/earthquakes/eventp...
3,,,37389186,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.02618,,39.0,",ci37389186,",0.44,ml,...,",ci,",automatic,1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,",-480.0,1539475196167,https://earthquake.usgs.gov/earthquakes/eventp...
4,,,73096941,https://earthquake.usgs.gov/fdsnws/event/1/que...,0.07799,,192.0,",nc73096941,",2.16,md,...,",nc,",automatic,1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,earthquake,",geoserve,nearby-cities,origin,phase-data,scit...",-480.0,1539477547926,https://earthquake.usgs.gov/earthquakes/eventp...


In [41]:
# Column labels of the DataFrame.
df.columns

Index(['alert', 'cdi', 'code', 'detail', 'dmin', 'felt', 'gap', 'ids', 'mag',
       'magType', 'mmi', 'net', 'nst', 'place', 'rms', 'sig', 'sources',
       'status', 'time', 'title', 'tsunami', 'type', 'types', 'tz', 'updated',
       'url'],
      dtype='object')

### Descrevendo e resumindo os dados

In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,cdi,dmin,felt,gap,mag,mmi,nst,rms,sig,time,tsunami,tz,updated
count,329.0,6139.0,329.0,6164.0,9331.0,93.0,5364.0,9332.0,9332.0,9332.0,9332.0,9331.0,9332.0
mean,2.754711,0.544925,12.31003,121.506588,1.497345,3.651398,19.053878,0.362122,56.899914,1538284000000.0,0.006537,-451.99014,1538537000000.0
std,1.010637,2.214305,48.954944,72.962363,1.203347,1.790523,15.492315,0.317784,91.872163,608030600.0,0.080589,231.752571,656413500.0
min,0.0,0.000648,0.0,12.0,-1.26,0.0,0.0,0.0,0.0,1537229000000.0,0.0,-720.0,1537230000000.0
25%,2.0,0.020425,1.0,66.1425,0.72,2.68,8.0,0.119675,8.0,1537793000000.0,0.0,-540.0,1537996000000.0
50%,2.7,0.05905,2.0,105.0,1.3,3.72,15.0,0.21,26.0,1538245000000.0,0.0,-480.0,1538621000000.0
75%,3.3,0.17725,5.0,159.0,1.9,4.57,25.0,0.59,56.0,1538766000000.0,0.0,-480.0,1539110000000.0
max,8.4,53.737,580.0,355.91,7.5,9.12,172.0,1.91,2015.0,1539475000000.0,1.0,720.0,1539537000000.0


In [43]:
# Distinct values of the alert column.
df.alert.unique()

array([nan, 'green', 'red'], dtype=object)

In [44]:
# Number of rows for each alert value.
df.alert.value_counts()

green    58
red       1
Name: alert, dtype: int64

In [49]:
# Label-based selection: .loc slices include the end label, so rows 10, 11
# and 12 are returned; df.columns[3] looks up the name of the fourth column.
df.loc[10:12, df.columns[3]]

10    https://earthquake.usgs.gov/fdsnws/event/1/que...
11    https://earthquake.usgs.gov/fdsnws/event/1/que...
12    https://earthquake.usgs.gov/fdsnws/event/1/que...
Name: detail, dtype: object

In [53]:
# Position-based selection: .iloc slices exclude the end position, giving
# rows 10-14 and the columns at positions 6-9.
df.iloc[10:15, 6:10]

Unnamed: 0,gap,ids,mag,magType
10,57.0,",ci37389162,",0.5,ml
11,186.0,",pr2018286010,",2.77,md
12,76.0,",ci37389146,",0.5,ml
13,157.0,",us1000hbti,",4.5,mb
14,71.0,",nc73096921,",2.13,md


In [63]:
# Rows with magnitude above 2, restricted to the title and mag columns,
# selected in a single .loc call (row mask, column list).
df.loc[df['mag'] > 2, ['title', 'mag']]

Unnamed: 0,title,mag
2,"M 3.4 - 8km NE of Aguanga, CA",3.42
4,"M 2.2 - 10km NW of Avenal, CA",2.16
5,"M 2.6 - 55km ESE of Punta Cana, Dominican Repu...",2.61
9,"M 4.7 - 219km SSE of Saparua, Indonesia",4.70
11,"M 2.8 - 53km SE of Punta Cana, Dominican Republic",2.77
...,...,...
9317,"M 2.6 - 4km SW of Delta, B.C., MX",2.64
9319,"M 4.4 - 58km W of San Antonio de los Cobres, A...",4.40
9320,"M 2.8 - 8km NW of Delta, B.C., MX",2.81
9321,"M 4.3 - 19km NE of Cateel, Philippines",4.30


In [69]:
# Rows where the magnitude exceeds 5 OR the alert is red.
# Each comparison needs its own parentheses because | binds more tightly
# than > and ==.
df.loc[
    (df.mag > 5) | (df.alert == 'red'),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
118,green,6.7,mww,"M 6.7 - 262km NW of Ozernovskiy, Russia",1,earthquake
180,green,5.2,mww,"M 5.2 - 25km E of Bitung, Indonesia",0,earthquake
226,green,5.7,mww,"M 5.7 - 42km WNW of Sola, Vanuatu",0,earthquake
227,,5.2,mb,"M 5.2 - 15km WSW of Pisco, Peru",0,earthquake
258,,5.1,mb,"M 5.1 - 236km NNW of Kuril'sk, Russia",0,earthquake
...,...,...,...,...,...,...
9175,,5.2,mb,"M 5.2 - 126km N of Dili, East Timor",1,earthquake
9176,,5.2,mb,"M 5.2 - 90km S of Raoul Island, New Zealand",0,earthquake
9211,green,6.0,mww,M 6.0 - Southwest Indian Ridge,0,earthquake
9213,,5.1,mb,M 5.1 - South of Tonga,0,earthquake


In [71]:
# Earthquakes whose place ends in "CA" or "California" and whose magnitude
# exceeds 3.8.
# The alternation is grouped so the `$` anchor applies to both spellings;
# written as `CA|California$` the anchor binds only to "California" and
# "CA" would match anywhere in the place string. The group is
# non-capturing because str.contains only tests for a match.
df.loc[
    (df.place.str.contains(r'(?:CA|California)$'))
    & (df.mag > 3.8),
    ['alert', 'mag', 'magType', 'title', 'tsunami', 'type']
]

Unnamed: 0,alert,mag,magType,title,tsunami,type
1465,green,3.83,mw,"M 3.8 - 109km WNW of Trinidad, CA",0,earthquake
2414,green,3.83,mw,"M 3.8 - 5km SW of Tres Pinos, CA",1,earthquake


### Adicionando e removendo dados

In [72]:
# Make a copy of df that can be modified independently of the original.
df_to_modify = df.copy()

In [73]:
# Reload the CSV, keeping only the columns named in usecols. The resulting
# column order is not the order of this list (see the displayed frame).
df = pd.read_csv(
    'data/earthquakes.csv',
    usecols=[
        'time', 'title', 'place', 'magType', 'mag', 'alert', 'tsunami'
    ]
)
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0


In [75]:
# Assigning a scalar to a new column name creates the column and fills
# every row with that value.
df['source'] = 'USGS API'
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,source
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,USGS API
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,USGS API
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,USGS API
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,USGS API
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,USGS API


In [76]:
# Capture everything after the first ", " of each place, then list the
# distinct captures in sorted order. expand=False makes extract return a
# Series directly for the single capture group.
df['place'].str.extract(r', (.*$)', expand=False).sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Australia', 'Azerbaijan', 'B.C., MX', 'Barbuda', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba ', 'British Virgin Islands',
       'Burma', 'CA', 'California', 'Canada', 'Chile', 'China',
       'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'Ecuador region',
       'El Salvador', 'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala',
       'Haiti', 'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Italy', 'Jamaica', 'Japan', 'Kansas',
       'Kentucky', 'Kyrgyzstan', 'Martinique', 'Mauritius', 'Mayotte',
       'Mexico', 'Missouri', 'Montana', 'NV', 'Nevada', 'New Caledonia',
       'New Hampshire', 'New Mexico', 'New Zealand', 'Nicaragua',
       'North Carolina', 'Northern Mariana Islands', 'Oklahoma', 'Oregon',
       'Pakistan', 'Papua New Guinea', 'Peru', 'Philippines',
       'Puerto Rico', 'Roman

In [77]:
# Derive a cleaned-up location label from `place` through a chain of
# string replacements. The order matters: the abbreviation fixes run
# before the final "drop everything up to the last comma" step.
# regex= is passed explicitly on every call because its default differs
# between pandas versions.
df['parsed_place'] = (
    df.place
    .str.replace(r'.* of ', '', regex=True)           # remove "<x> of " prefix
    .str.replace('the ', '', regex=False)             # remove "the "
    .str.replace(r'CA$', 'California', regex=True)    # fix California
    .str.replace(r'NV$', 'Nevada', regex=True)        # fix Nevada
    .str.replace(r'MX$', 'Mexico', regex=True)        # fix Mexico
    .str.replace(r' region$', '', regex=True)         # fix " region" endings
    .str.replace('northern ', '', regex=False)        # remove "northern "
    .str.replace('Fiji Islands', 'Fiji', regex=False) # line up the Fiji places
    .str.replace(r'^.*, ', '', regex=True)            # remove anything else extraneous from start
    .str.strip()                                      # remove any extra spaces
)

In [80]:
# Inspect the distinct parsed values in sorted order.
df.parsed_place.sort_values().unique()

array(['Afghanistan', 'Alaska', 'Argentina', 'Arizona', 'Arkansas',
       'Ascension Island', 'Australia', 'Azerbaijan', 'Balleny Islands',
       'Barbuda', 'Bolivia', 'British Virgin Islands', 'Burma',
       'California', 'Canada', 'Carlsberg Ridge',
       'Central East Pacific Rise', 'Central Mid-Atlantic Ridge', 'Chile',
       'China', 'Christmas Island', 'Colombia', 'Colorado', 'Costa Rica',
       'Dominican Republic', 'East Timor', 'Ecuador', 'El Salvador',
       'Fiji', 'Greece', 'Greenland', 'Guam', 'Guatemala', 'Haiti',
       'Hawaii', 'Honduras', 'Idaho', 'Illinois', 'India',
       'Indian Ocean Triple Junction', 'Indonesia', 'Iran', 'Iraq',
       'Italy', 'Jamaica', 'Japan', 'Kansas', 'Kentucky',
       'Kermadec Islands', 'Kuril Islands', 'Kyrgyzstan', 'Martinique',
       'Mauritius', 'Mayotte', 'Mexico', 'Mid-Indian Ridge', 'Missouri',
       'Montana', 'Nevada', 'New Caledonia', 'New Hampshire',
       'New Mexico', 'New Zealand', 'Nicaragua', 'North Carolina',


In [81]:
# Remove the 'source' column from df in place.
# NOTE(review): presumably re-running this cell raises KeyError, since the
# column is already gone after the first run — confirm before relying on reruns.
del df['source']
df.head()

Unnamed: 0,alert,mag,magType,place,time,title,tsunami,parsed_place
0,,1.35,ml,"9km NE of Aguanga, CA",1539475168010,"M 1.4 - 9km NE of Aguanga, CA",0,California
1,,1.29,ml,"9km NE of Aguanga, CA",1539475129610,"M 1.3 - 9km NE of Aguanga, CA",0,California
2,,3.42,ml,"8km NE of Aguanga, CA",1539475062610,"M 3.4 - 8km NE of Aguanga, CA",0,California
3,,0.44,ml,"9km NE of Aguanga, CA",1539474978070,"M 0.4 - 9km NE of Aguanga, CA",0,California
4,,2.16,md,"10km NW of Avenal, CA",1539474716050,"M 2.2 - 10km NW of Avenal, CA",0,California
