# Taiwan Earthquake Data

## 1. Download data

In [1]:
import os, re
import pandas as PD
import geopandas as GPD
import numpy as NP
import wget as WGET
import zipfile as ZIP
from tqdm import tqdm as TQDM
from shapely.geometry import Point

In [2]:
url = 'http://opendata2.epa.gov.tw/SOIL00058/'
datalist = ['地震報告詳細資料_1071214_052621',
'地震報告詳細資料_1071216_053905',
'地震報告詳細資料_1071225_125048',
'地震報告詳細資料_1071229_220201',
'地震報告詳細資料_1080104_022157',
'地震報告詳細資料_1080104_071152',
'地震報告詳細資料_1080108_063958',
'地震報告詳細資料_1080108_202935',
'地震報告詳細資料_1080122_112955',
'地震報告詳細資料_1080130_135402',
'地震報告詳細資料_1080130_233017',
'地震報告詳細資料_1080131_063019',
'地震報告詳細資料_1080205_195037',
'地震報告詳細資料_1080208_012035',
'地震報告詳細資料_1080213_024045',
'地震報告詳細資料_1080215_222201',
'地震報告詳細資料_1080217_184132',
'地震報告詳細資料_1080308_110117',
'地震報告詳細資料_1080312_121131',
'地震報告詳細資料_1080313_043148',
'地震報告詳細資料_1080313_125140',
'地震報告詳細資料_1080314_053157',
'地震報告詳細資料_1080316_091627',
'地震報告詳細資料_1080317_211150',
'地震報告詳細資料_1080319_123225',
'地震報告詳細資料_1080403_121140',
'地震報告詳細資料_1080404_101053',
'地震報告詳細資料_1080404_211055',
'地震報告詳細資料_1080406_055056',
'地震報告詳細資料_1080408_151102',
'地震報告詳細資料_1080409_234113',
'地震報告詳細資料_1080410_044116',
'地震報告詳細資料_1080410_072351',
'地震報告詳細資料_1080415_233911',
'地震報告詳細資料_1080418_132514',
'地震報告詳細資料_1080418_134009',
'地震報告詳細資料_1080501_113512',
'地震報告詳細資料_1080502_191425',
'地震報告詳細資料_1080503_061419',
'地震報告詳細資料_1080513_171729',
'地震報告詳細資料_1080522_100030',
'地震報告詳細資料_1080523_144052',
'地震報告詳細資料_1080531_001349',
'地震報告詳細資料_1080604_181202',
'地震報告詳細資料_1080606_035426',
'地震報告詳細資料_1080630_005402',
'地震報告詳細資料_1080802_222042',
'地震報告詳細資料_1080803_064030',
'地震報告詳細資料_1080806_094028',
'地震報告詳細資料_1080806_094029',
'地震報告詳細資料_1080808_060032',
'地震報告詳細資料_1080808_072021',
'地震報告詳細資料_1080816_060050',
'地震報告詳細資料_1080817_232117',
'地震報告詳細資料_1080817_000100',
'地震報告詳細資料_1080818_122156',
'地震報告詳細資料_1080831_160446',
'地震報告詳細資料_1080903_091500',]

In [3]:
# Download data to local directory
todir = './data'
if not os.path.exists(todir):
    os.makedirs(todir)
bars = TQDM(datalist)
for name in bars:
    filename = os.path.join(todir, name)
    # Download
    if not os.path.exists(filename+'.zip'):
        WGET.download(url+name+'.zip', out=filename+'.zip')
    # Un-zip
    if not os.path.exists(filename):
        with ZIP.ZipFile(filename+'.zip', 'r') as zipfile:
            zipfile.extractall(filename)

100%|██████████| 58/58 [00:00<00:00, 21765.20it/s]


## 2. Reconstruct data to DataFrame from .txt

In [4]:
# define column names
data = {'event':[],
        'datetime':[],
        'lon':[],
        'lat':[],
        'depth':[],
        'intensity':[],
        'station_id':[],
        'station_name':[],
        'station_lon':[],
        'station_lat':[],
        'distance':[],
        'az':[],
        'pga_v':[],
        'pga_ns':[],
        'pga_ew':[]}

In [5]:
# Loop earthqauke events from each .txt
# Each event contain many measurments from stations
todir = './data'
bars = TQDM(datalist)
for event, name in enumerate(bars):
    dirname = os.path.join(todir, name)
    filename = None
    # find .txt data
    for path, dirs, files  in os.walk(dirname):
        if path == dirname:
            for f in files:
                if f.find('E.txt') >= 0:
                    filename = f
                    break
            break
    # read .txt data and reconstruct to .csv
    with open(os.path.join(dirname, filename), 'rb') as file:
        contain = [l.decode('utf8', 'ignore') for l in file.readlines()]
        datetime = None
        lon = None
        lat = None
        depth = None
        intensity = None
        for line in contain:
            line = line.strip('\n').strip('\r')
            ## Event information
            if line.find('Origin Time:') >= 0:
                datetime = line.split('Origin Time:')[1].replace('/', '-')
            if line.find('Lon:') >= 0:
                lon = float(re.findall("\d+\.\d+", line)[0])
            if line.find('Lat:') >= 0:
                lat = float(re.findall("\d+\.\d+", line)[0])
            if line.find('Depth:') >= 0:
                depth = float(re.findall("\d+\.\d+", line)[0])
            if line.find('Mag:') >= 0:
                intensity = float(re.findall("\d+\.\d+", line)[0])
            ## Station information
            if line.find('Stacode') >= 0:
                values = line.replace(' ', '').split(',')
                station_id = None
                station_name = None
                station_lon = None
                station_lat = None
                distance = None
                az = None
                pga_v = None 
                pga_ns = None 
                pga_ew = None
                for v in values:
                    col = v.split('=')[0]
                    val = v.split('=')[1]
                    if col == 'Stacode':
                        station_id = val
                    if col == 'Staname':
                        station_name = val
                    if col == 'Stalon':
                        station_lon = float(val)
                    if col == 'Stalat':
                        station_lat = float(val)
                    if col == 'Dist':
                        distance = float(val)
                    if col == 'AZ':
                        az = float(val)
                    if col == 'PGA(V)':
                        pga_v = float(val)
                    if col == 'PGA(NS)':
                        pga_ns = float(val)
                    if col == 'PGA(EW)':
                        pga_ew = float(val)
                data['event'].append(event)
                data['datetime'].append(datetime)
                data['lon'].append(lon)
                data['lat'].append(lat)
                data['depth'].append(depth)
                data['intensity'].append(intensity)
                data['station_id'].append(station_id)
                data['station_name'].append(station_name)
                data['station_lon'].append(station_lon)
                data['station_lat'].append(station_lat)
                data['distance'].append(distance)
                data['az'].append(az)
                data['pga_v'].append(pga_v)
                data['pga_ns'].append(pga_ns)
                data['pga_ew'].append(pga_ew)

100%|██████████| 58/58 [00:00<00:00, 501.44it/s]


In [6]:
# from dict to DataFrame
data = PD.DataFrame.from_dict(data)
print(data.dtypes)
data.head()

event             int64
datetime         object
lon             float64
lat             float64
depth           float64
intensity       float64
station_id       object
station_name     object
station_lon     float64
station_lat     float64
distance        float64
az              float64
pga_v           float64
pga_ns          float64
pga_ew          float64
dtype: object


Unnamed: 0,event,datetime,lon,lat,depth,intensity,station_id,station_name,station_lon,station_lat,distance,az,pga_v,pga_ns,pga_ew
0,0,2018-12-14 04:56:39,121.53,24.06,17.7,3.5,TWD,Xiulin,121.6,24.08,8.22,255.19,8.24,5.79,9.97
1,0,2018-12-14 04:56:39,121.53,24.06,17.7,3.5,ETM,Tongmen,121.49,23.97,11.26,17.81,27.06,53.78,37.68
2,0,2018-12-14 04:56:39,121.53,24.06,17.7,3.5,HWA,HualienCity,121.61,23.98,13.09,317.56,15.05,14.58,23.59
3,0,2018-12-14 04:56:39,121.53,24.06,17.7,3.5,ETL,Taroko,121.62,24.16,14.44,222.69,5.94,19.33,15.33
4,0,2018-12-14 04:56:39,121.53,24.06,17.7,3.5,ETLH,Xibao,121.48,24.21,16.64,164.09,6.89,8.04,8.54


## 3. Basic feature engineering for further analysis

In [7]:
## transform geometry projection function
def transform( data, x_col, y_col, from_crs, to_crs ):
    data_tmp = data.copy()
    data_tmp['geom'] = data_tmp.apply(lambda x: Point(x[x_col], x[y_col]), axis=1)
    geom = GPD.GeoDataFrame(data_tmp[['geom']], geometry='geom')
    geom.crs = {'init':from_crs}
    geom = geom.to_crs({'init':to_crs})
    data_tmp[x_col] = geom['geom'].x
    data_tmp[y_col] = geom['geom'].y
    return data_tmp.drop('geom', axis=1)

In [8]:
# trasform degree projection (epsg:4326) to meter projection of Taiwan region (epsg:3826)
data = transform(data, 'lon', 'lat', from_crs='epsg:4326', to_crs='epsg:3826')
data = transform(data, 'station_lon', 'station_lat', from_crs='epsg:4326', to_crs='epsg:3826')
# distance from station to event
data['d'] = NP.sqrt((data['lon'] - data['station_lon'])**2 + (data['lat'] - data['station_lat'])**2)
# maximum pga
data['pga_max'] = NP.maximum(data['pga_v'].values, data['pga_ns'].values, data['pga_ew'].values)
# total pga
data['pga'] = NP.sqrt(data['pga_v']**2 + data['pga_ns']**2 + data['pga_ew']**2)

In [9]:
data.to_csv('./data/earthquaketw.csv', index=False)

In [10]:
data.head()

Unnamed: 0,event,datetime,lon,lat,depth,intensity,station_id,station_name,station_lon,station_lat,distance,az,pga_v,pga_ns,pga_ew,d,pga_max,pga
0,0,2018-12-14 04:56:39,303898.534518,2661770.0,17.7,3.5,TWD,Xiulin,311007.912765,2664013.0,8.22,255.19,8.24,5.79,8.24,7455.018061,8.24,13.012275
1,0,2018-12-14 04:56:39,303898.534518,2661770.0,17.7,3.5,ETM,Tongmen,299865.343998,2651787.0,11.26,17.81,27.06,53.78,53.78,10766.259718,53.78,80.726826
2,0,2018-12-14 04:56:39,303898.534518,2661770.0,17.7,3.5,HWA,HualienCity,312072.754264,2652942.0,13.09,317.56,15.05,14.58,15.05,12030.63407,15.05,25.798864
3,0,2018-12-14 04:56:39,303898.534518,2661770.0,17.7,3.5,ETL,Taroko,313002.38063,2672882.0,14.44,222.69,5.94,19.33,19.33,14365.702485,19.33,27.974656
4,0,2018-12-14 04:56:39,303898.534518,2661770.0,17.7,3.5,ETLH,Xibao,298756.774779,2678364.0,16.64,164.09,6.89,8.04,8.04,17372.811019,8.04,13.294935


In [11]:
# Save station location to shapefile
stations = data.groupby('station_id')[['station_lon', 'station_lat']].max().reset_index()
stations['geom'] = stations[['station_lon', 'station_lat']].apply(lambda x: Point(x), axis=1)
stations = GPD.GeoDataFrame(stations, geometry='geom')
stations.crs={'init' :'epsg:3826'}
stations.to_file('./data/earthquaketw_stations.shp', driver='ESRI Shapefile', encoding = 'utf-8')

CPLE_NotSupported in Normalized/laundered field name: 'station_lon' to 'station_lo'
CPLE_NotSupported in Normalized/laundered field name: 'station_lat' to 'station_la'


In [13]:
# Save event location to shapefile
events = data.groupby('datetime')[['event','lon','lat','depth','intensity']].max()
events['geom'] = events[['lon', 'lat']].apply(lambda x: Point(x), axis=1)
events = GPD.GeoDataFrame(events, geometry='geom')
events.crs={'init' :'epsg:3826'}
events.to_file('./data/earthquaketw_events.shp', driver='ESRI Shapefile', encoding = 'utf-8')

In [None]:
gdf = GPD.read_file('meuse_example_data/meuse.shp')
gdf.crs = {'init':'epsg:28992'}
gdf.head()
gdf['x'] = gdf['geometry'].apply(lambda x: x.x)
gdf['y'] = gdf['geometry'].apply(lambda x: x.y)

In [None]:
gdf = gdf.to_crs({'init':'epsg:4326'})

In [None]:
gdf.to_file('./data/muese.shp', driver='ESRI Shapefile', encoding = 'utf-8')