In [1]:
import os

from pyflink.table import (
    EnvironmentSettings, TableEnvironment,
    CsvTableSource, DataTypes,
    Table, TableSource
)

from pyflink.table.expressions import col

### Setup Flink Environment

In [2]:
# Batch-mode settings, spelled out through the builder
# (equivalent to the EnvironmentSettings.in_batch_mode() shortcut).
env_settings = (
    EnvironmentSettings
    .new_instance()
    .in_batch_mode()
    .build()
)

In [3]:
# Single TableEnvironment shared by every cell below, built from the batch settings.
t_env = TableEnvironment.create(environment_settings=env_settings)

### FHV Dataset

In [4]:
# Location of the FHV trip-data CSV (gzip-compressed).
# Set the FHV_DATASET_PATH environment variable to point at your own copy;
# when it is unset, the original hard-coded local path is used as the default.
fhv_dataset_path = os.environ.get(
    "FHV_DATASET_PATH",
    "/Users/iobruno/Vault/datasets/fhv/fhv_tripdata_2019-01.csv.gz",
)

In [5]:
# Column name -> Flink data type for the FHV CSV.
# Insertion order matters: names and types are handed to CsvTableSource as
# two positionally-paired lists derived from this mapping.
fhv_columns = {
    "dispatching_base_num": DataTypes.STRING(),
    "pickup_datetime": DataTypes.TIMESTAMP(),
    "dropoff_datetime": DataTypes.TIMESTAMP(),
    "pickup_location_id": DataTypes.INT(),
    "dropoff_location_id": DataTypes.INT(),
    "sr_flag": DataTypes.INT(),
    "affiliated_base_number": DataTypes.STRING(),
}

# ignore_first_line=True skips the first row of the file (presumably the CSV header).
fhv_ts: TableSource = CsvTableSource(
    source_path=fhv_dataset_path,
    field_names=list(fhv_columns.keys()),
    field_types=list(fhv_columns.values()),
    ignore_first_line=True,
)

In [6]:
"""
    Registers as temporary_view, attempting to first drop it to prevent
    errors on table name collisions
"""
# Drop any previous 'fhv' temporary view so re-running this cell does not collide.
t_env.drop_temporary_view('fhv')
# create_temporary_view replaces register_table, which is deprecated since Flink 1.10.
t_env.create_temporary_view('fhv', t_env.from_table_source(fhv_ts))

### Taxi Lookup Zones Dataset

In [7]:
# Location of the taxi zone lookup CSV.
# Set the ZONES_DATASET_PATH environment variable to point at your own copy;
# when it is unset, the original hard-coded local path is used as the default.
zones_dataset_path = os.environ.get(
    "ZONES_DATASET_PATH",
    "/Users/iobruno/Vault/datasets/zones/taxi_zone_lookup.csv",
)

In [8]:
# Column name -> Flink data type for the zone lookup CSV.
# Insertion order matters: names and types are handed to CsvTableSource as
# two positionally-paired lists derived from this mapping.
zones_columns = {
    "location_id": DataTypes.INT(),
    "borough": DataTypes.STRING(),
    "zone": DataTypes.STRING(),
    "service_zone": DataTypes.STRING(),
}

# ignore_first_line=True skips the first row of the file (presumably the CSV header).
zones_ts: TableSource = CsvTableSource(
    source_path=zones_dataset_path,
    field_names=list(zones_columns.keys()),
    field_types=list(zones_columns.values()),
    ignore_first_line=True,
)

In [9]:
"""
    Registers as temporary_view, attempting to first drop it to prevent
    errors on table name collisions
"""
# Drop any previous 'zones' temporary view so re-running this cell does not collide.
t_env.drop_temporary_view('zones')
# create_temporary_view replaces register_table, which is deprecated since Flink 1.10.
t_env.create_temporary_view('zones', t_env.from_table_source(zones_ts))

### Flink SQL - Join Flink Tables

In [10]:
# Enrich each FHV trip with human-readable zone names.
# The 'zones' view is joined twice: once on the pickup location id (alias `pu`)
# and once on the dropoff location id (alias `do`). Both joins are INNER, so a
# trip whose pickup or dropoff id has no match in 'zones' is excluded from the result.
tbl: Table = t_env.sql_query("""    
    SELECT 
        f.dispatching_base_num,
        f.affiliated_base_number,

        -- Pickup Location
        f.pickup_datetime,
        pu.zone as pickup_zone,
        pu.service_zone as pickup_service_zone,
        
        -- Dropoff Location
        f.dropoff_datetime,
        do.zone as dropoff_zone,
        do.service_zone as dropoff_service_zone
        
    FROM fhv f
    INNER JOIN zones pu ON f.pickup_location_id  = pu.location_id
    INNER JOIN zones do ON f.dropoff_location_id = do.location_id 
""")

In [11]:
# Preview: restrict the joined table to 5 rows, run the job, and print the result.
tbl_preview = tbl.fetch(5)
tbl_preview_result = tbl_preview.execute()
tbl_preview_result.print()

+--------------------------------+--------------------------------+----------------------------+--------------------------------+--------------------------------+----------------------------+--------------------------------+--------------------------------+
|           dispatching_base_num |         affiliated_base_number |            pickup_datetime |                    pickup_zone |            pickup_service_zone |           dropoff_datetime |                   dropoff_zone |           dropoff_service_zone |
+--------------------------------+--------------------------------+----------------------------+--------------------------------+--------------------------------+----------------------------+--------------------------------+--------------------------------+
|                         B02877 |                                | 2019-01-31 23:58:13.000000 |                    JFK Airport |                       Airports | 2019-02-01 00:22:31.000000 |        Allerton/Pelham Gardens |  