In [1]:
import os
os.getcwd()
os.listdir()

['extracted_airports-00000-of-00001',
 'beam-temp-extracted_airports-a30c41003b1511eaae86e0accb680bbc',
 'beam-temp-airports_with_tz-6e4163683b1211eab37de0accb680bbc',
 'df05.py',
 'df01.py',
 'note.ipynb',
 'simulate.py',
 'airports.csv.gz',
 'df04.py',
 'beam-temp-airports_with_tz-94aedffa3b1511eaaae3e0accb680bbc',
 'df03.py',
 'setup.py',
 'df06.py',
 'df02.py',
 'airports_with_tz-00000-of-00001',
 '.ipynb_checkpoints',
 'dataflow.py',
 'install_packages.sh',
 'beam-temp-airports_with_tz-a5f62eca3b1111ea8987e0accb680bbc']

In [2]:
import apache_beam as beam
import csv

### df01

In [9]:
with beam.Pipeline('DirectRunner') as pipeline:

    airports = (pipeline
        | beam.io.ReadFromText('airports.csv.gz')
        | beam.Map(lambda line: next(csv.reader([line])))
        | beam.Map(lambda fields: (fields[0], fields[21], fields[26]))
    )

    airports | beam.Map(lambda airport_data: '{}'.format(','.join(airport_data))) \
        | beam.io.textio.WriteToText('results/extracted_airports')

    pipeline.run()




### df02

In [25]:

def addtimezone(lat, lon):
    try:
        import timezonefinder
        tf = timezonefinder.TimezoneFinder()
        tz = tf.timezone_at(lng=float(lon), lat=float(lat)) # throws ValueError
        if tz is None:
            tz = 'UTC'
        return (lat, lon, tz)
    except ValueError:
        return lat, lon, 'TIMEZONE'

In [26]:
print(addtimezone(58.10944444,-152.90666667))

(58.10944444, -152.90666667, 'America/Anchorage')


In [34]:
with beam.Pipeline('DirectRunner') as pipeline:

  airports = (pipeline
     | beam.io.ReadFromText('airports.csv.gz')
     | beam.Map(lambda line: next(csv.reader([line])))
     | beam.Map(lambda fields: (fields[0], addtimezone(fields[21], fields[26])))
  )

  airports | beam.Map(lambda airport : '{},{}'.format(airport[0],','.join(airport[1])) )| beam.io.textio.WriteToText('airports_with_tz')

  pipeline.run()


### df03

In [51]:

def addtimezone(lat, lon):
    try:
        import timezonefinder
        tf = timezonefinder.TimezoneFinder()
        tz = tf.timezone_at(lng=float(lon), lat=float(lat)) # throws ValueError
        if tz is None:
            tz = 'UTC'
        return (lat, lon, tz)
    except ValueError:
        return lat, lon, 'TIMEZONE' # これがヘッダーになる

def as_utc(date, hhmm, tzone):
    try:
        if len(hhmm) > 0 and tzone is not None:
            import datetime, pytz
            loc_tz = pytz.timezone(tzone)
            loc_dt = loc_tz.localize(datetime.datetime.strptime(date, '%Y-%m-%d'), is_dst=False)
            loc_dt += datetime.timedelta(hours=int(hhmm[:2]), minutes=int(hhmm[2:]))

            utc_dt = loc_dt.astimezone(pytz.utc)
            return utc_dt.strftime('%Y-%m-%d %H%M%S')
        else:
            return ""
    except ValueError as e:
        print ('{} {} {}'.format(date, hhmm, tzone))

        raise e

# 引数: 読み込んだフライトデータ1行分と全ての空港のタイムゾーン情報を含むディクショナリ
def tz_correct(line, airport_timezones):
    fields = line.split(',')
    if fields[0] != 'FL_DATE' and len(fields) == 27:
        # convert all times to UTC
        dep_airport_id = fields[6]
        arr_airport_id = fields[10]
        # (lat, lon, tz)
        dep_timezone = airport_timezones[dep_airport_id][2]
        arr_timezone = airport_timezones[arr_airport_id][2]

        for f in [13, 14, 17]: # crsdeptime, deptime, whellsoff
            fields[f] = as_utc(fields[0], [f], dep_timezone)
        for f in [18, 20, 21]: # wheelson, crsarrtime, arrtime
            fields[f] = as_utc(fields[0], [f], arr_timezone)

        yield ','.join(fields)


In [53]:
with beam.Pipeline('DirectRunner') as pipeline:

    airports = (pipeline
        | 'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
        | 'airports:fields' >> beam.Map(lambda line: next(csv.reader([line])))
        | 'airports:tz' >> beam.Map(lambda fields: (
                 fields[0], addtimezone(fields[21], fields[26])
            ))
    )

    flights = (pipeline
        | 'flights:read' >> beam.io.ReadFromText('201501_part_10.csv')
        | 'flights:tzcorr' >> beam.FlatMap(tz_correct, beam.pvalue.AsDict(airports))
    ) 

    flights | beam.io.textio.WriteToText('results/all_flights')

    pipeline.run()


TypeError: int() argument must be a string, a bytes-like object or a number, not 'list' [while running 'flights:tzcorr']

### メモ

In [82]:
airports = """AIRPORT_SEQ_ID,LATITUDE,LONGITUDE
1000101,6,54
1000301,32,4
1000401,5,10
1000501,34,11
1000601,44,52
1000701,33,6
1000801,9,49
1000901,21,25
1001001,17,42"""

airport_timezones = {}
for a in airports.split('\n'):
    a_list = a.split(',')
    airport_timezones[a_list[0]] = addtimezone(a_list[1], a_list[2])

In [83]:
flights="""FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE
2015-01-01,B6,20409,B6,1233,13204,1320402,31454,MCO,14843,1484304,34819,SJU,1725,1714,-11.00,19.00,1733,2055,5.00,2110,2100,-10.00,0.00,,0.00,1189.00
2015-01-01,B6,20409,B6,1234,14843,1484304,34819,SJU,13204,1320402,31454,MCO,2150,2139,-11.00,12.00,2151,2337,15.00,2351,2352,1.00,0.00,,0.00,1189.00
2015-01-01,B6,20409,B6,1236,14843,1484304,34819,SJU,15024,1502403,34945,STT,0748,0737,-11.00,10.00,0747,0804,2.00,0822,0806,-16.00,0.00,,0.00,68.00
2015-01-01,B6,20409,B6,1237,10721,1072102,30721,BOS,11433,1143302,31295,DTW,1555,1602,7.00,14.00,1616,1813,5.00,1818,1818,0.00,0.00,,0.00,632.00
2015-01-01,B6,20409,B6,1251,12197,1219702,31703,HPN,14635,1463502,31714,RSW,0730,0726,-4.00,9.00,0735,1026,7.00,1049,1033,-16.00,0.00,,0.00,1102.00
2015-01-01,B6,20409,B6,1252,14635,1463502,31714,RSW,12197,1219702,31703,HPN,1125,1119,-6.00,8.00,1127,1356,3.00,1410,1359,-11.00,0.00,,0.00,1102.00
2015-01-01,B6,20409,B6,1253,11697,1169703,32467,FLL,14843,1484304,34819,SJU,1850,1848,-2.00,15.00,1903,2205,7.00,2218,2212,-6.00,0.00,,0.00,1046.00
2015-01-01,B6,20409,B6,1254,14027,1402702,34027,PBI,12478,1247802,31703,JFK,1653,1655,2.00,15.00,1710,1935,11.00,1938,1946,8.00,0.00,,0.00,1028.00
2015-01-01,B6,20409,B6,1256,12264,1226402,30852,IAD,10721,1072102,30721,BOS,2059,2050,-9.00,10.00,2100,2200,4.00,2219,2204,-15.00,0.00,,0.00,413.00"""

In [84]:
line = flights.split('\n')[1]
fields = line.split(',')
dep_airport_id = fields[6]
arr_airport_id = fields[10]

In [85]:
airport_timezones

{'AIRPORT_SEQ_ID': ('LATITUDE', 'LONGITUDE', 'TIMEZONE'),
 '1000101': ('6', '54', 'UTC'),
 '1000301': ('32', '4', 'Africa/Algiers'),
 '1000401': ('5', '10', 'Africa/Douala'),
 '1000501': ('34', '11', 'Africa/Tunis'),
 '1000601': ('44', '52', 'Asia/Aqtau'),
 '1000701': ('33', '6', 'Africa/Algiers'),
 '1000801': ('9', '49', 'Africa/Mogadishu'),
 '1000901': ('21', '25', 'Africa/Tripoli'),
 '1001001': ('17', '42', 'Asia/Riyadh')}

In [87]:
dep_airport_id

'1320402'

In [88]:
arr_airport_id

'1484304'

In [92]:
# 一時的に書き換え
dep_airport_id = '1000101'
arr_airport_id = '1000901'

In [123]:
airport_timezones[dep_airport_id]

('6', '54', 'UTC')

In [93]:
dep_timezone = airport_timezones[dep_airport_id][2]
arr_timezone = airport_timezones[arr_airport_id][2]

In [95]:
dep_timezone
arr_timezone

'Africa/Tripoli'

In [96]:
fields[0]

'2015-01-01'

In [98]:
fields[13], fields[14], fields[17], fields[18], fields[20], fields[21]  

('1725', '1714', '1733', '2055', '2110', '2100')

In [111]:
import datetime, pytz
date = fields[0]
tzone = arr_timezone
hhmm = fields[13]

In [112]:
pytz.timezone(tzone)

<DstTzInfo 'Africa/Tripoli' LMT+0:53:00 STD>

In [125]:
loc_tz = pytz.timezone(tzone)
loc_dt = loc_tz.localize(datetime.datetime.strptime(date, '%Y-%m-%d'), is_dst=False)
loc_dt += datetime.timedelta(hours=int(hhmm[:2]), minutes=int(hhmm[2:]))
print(loc_dt.utcoffset().total_seconds())
print(str(loc_dt.utcoffset().total_seconds()))
print(loc_dt.strftime('%Y-%m-%d %H:%M:%S'))
utc_dt = loc_dt.astimezone(pytz.utc)
print(utc_dt.strftime('%Y-%m-%d %H:%M:%S'))


7200.0
7200.0
2015-01-01 17:25:00
2015-01-01 15:25:00


In [114]:
loc_tz.localize(datetime.datetime.strptime(date, '%Y-%m-%d'), is_dst=False)

datetime.datetime(2015, 1, 1, 0, 0, tzinfo=<DstTzInfo 'Africa/Tripoli' EET+2:00:00 STD>)

In [121]:
meta = flights.split('\n')[0]
meta_fields = meta.split(',')

In [132]:
meta_fields

['FL_DATE',
 'UNIQUE_CARRIER',
 'AIRLINE_ID',
 'CARRIER',
 'FL_NUM',
 'ORIGIN_AIRPORT_ID',
 'ORIGIN_AIRPORT_SEQ_ID',
 'ORIGIN_CITY_MARKET_ID',
 'ORIGIN',
 'DEST_AIRPORT_ID',
 'DEST_AIRPORT_SEQ_ID',
 'DEST_CITY_MARKET_ID',
 'DEST',
 'CRS_DEP_TIME',
 'DEP_TIME',
 'DEP_DELAY',
 'TAXI_OUT',
 'WHEELS_OFF',
 'WHEELS_ON',
 'TAXI_IN',
 'CRS_ARR_TIME',
 'ARR_TIME',
 'ARR_DELAY',
 'CANCELLED',
 'CANCELLATION_CODE',
 'DIVERTED',
 'DISTANCE']

In [127]:
for f in [16, 17, 18, 19, 21, 22, 25]:
  print (meta_fields[f])

TAXI_OUT
WHEELS_OFF
WHEELS_ON
TAXI_IN
ARR_TIME
ARR_DELAY
DIVERTED


In [152]:
def get_next_event(fields):
    if len(fields[14]) > 0: # DEP_TIME
        event = list(fields) # copy
        event.extend(['departed', fields[14]])
        # TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,ARR_TIME,ARR_DELAY,DIVERTED
        for f in [16, 17, 18, 19, 21, 22, 25]:
            event[f] = '' # Not knowable at departure time
        yield event
    if len(fields[21]) > 0: # ARR_TIME
        event = list(fields)
        event.extend(['arrived', fields[21]])
        print (event)
        yield event


In [153]:
hoge = get_next_event(fields)

In [158]:
for h in hoge:
    print(','.join(h))
    print(h)

In [159]:
','.join(hoge)

''

In [178]:
fields[0] = '2015-01-01'
fields1 = fields[:]

In [179]:
fields1[0] = 'Oops'

In [180]:
print (fields[0], fields1[0])

2015-01-01 Oops


In [182]:
print ('2015-01-01,B6,20409,B6,1233,13204,1320402,31454,MCO,14843,1484304,34819,SJU,2015-01-01 22:25:00,2015-01-01 22:14:00,-11.00,19.00,2015-01-01 22:33:00,2015-01-02 00:55:00,5.00,2015-01-02 01:10:00,2015-01-02 01:00:00,-10.00,0.00,,0.00,1189.00,28.42944444,-81.30888889,-18000.0,18.43944444,-66.00222222,-14400.0'.split(','))

['2015-01-01', 'B6', '20409', 'B6', '1233', '13204', '1320402', '31454', 'MCO', '14843', '1484304', '34819', 'SJU', '2015-01-01 22:25:00', '2015-01-01 22:14:00', '-11.00', '19.00', '2015-01-01 22:33:00', '2015-01-02 00:55:00', '5.00', '2015-01-02 01:10:00', '2015-01-02 01:00:00', '-10.00', '0.00', '', '0.00', '1189.00', '28.42944444', '-81.30888889', '-18000.0', '18.43944444', '-66.00222222', '-14400.0']


In [183]:
header = 'FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,DEP_AIRPORT_LAT,DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,ARR_AIRPORT_LAT,ARR_AIRPORT_LON,ARR_AIRPORT_TZOFFSET,EVENT,NOTIFY_TIME'.split(',')

featdict = {}
for name, value in zip(header, fields):
    featdict[name] = value
featdict['EVENT_DATA'] = ','.join(fields)


In [186]:
featdict

{'FL_DATE': '2015-01-01',
 'UNIQUE_CARRIER': 'B6',
 'AIRLINE_ID': '20409',
 'CARRIER': 'B6',
 'FL_NUM': '1233',
 'ORIGIN_AIRPORT_ID': '13204',
 'ORIGIN_AIRPORT_SEQ_ID': '1320402',
 'ORIGIN_CITY_MARKET_ID': '31454',
 'ORIGIN': 'MCO',
 'DEST_AIRPORT_ID': '14843',
 'DEST_AIRPORT_SEQ_ID': '1484304',
 'DEST_CITY_MARKET_ID': '34819',
 'DEST': 'SJU',
 'CRS_DEP_TIME': '1725',
 'DEP_TIME': '1714',
 'DEP_DELAY': '-11.00',
 'TAXI_OUT': '19.00',
 'WHEELS_OFF': '1733',
 'WHEELS_ON': '2055',
 'TAXI_IN': '5.00',
 'CRS_ARR_TIME': '2110',
 'ARR_TIME': '2100',
 'ARR_DELAY': '-10.00',
 'CANCELLED': '0.00',
 'CANCELLATION_CODE': '',
 'DIVERTED': '0.00',
 'DISTANCE': '1189.00',
 'EVENT_DATA': '2015-01-01,B6,20409,B6,1233,13204,1320402,31454,MCO,14843,1484304,34819,SJU,1725,1714,-11.00,19.00,1733,2055,5.00,2110,2100,-10.00,0.00,,0.00,1189.00'}

In [188]:
header = 'FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,DEP_AIRPORT_LAT,DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,ARR_AIRPORT_LAT,ARR_AIRPORT_LON,ARR_AIRPORT_TZOFFSET,EVENT,NOTIFY_TIME'.split(',')

featdict = {}
for name, value in zip(header, fields):
    print(name, value)

TypeError: 'list' object cannot be interpreted as an integer