## Dependencies and Query Engine

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text
from sqlalchemy.engine import Engine

pd.options.display.max_columns = 50

# The connection URL can be overridden through the NY_TAXI_DB_URL environment
# variable so credentials do not have to live in the notebook; the fallback is
# the localhost URL that was previously hardcoded here.
DB_URL = os.environ.get(
    "NY_TAXI_DB_URL", "postgresql://root:root@localhost:5432/ny_taxi"
)
db_con = create_engine(DB_URL)

# Fail fast if the database is unreachable, and release the probe connection
# immediately instead of leaving it checked out for the life of the kernel.
with db_con.connect():
    pass

<sqlalchemy.engine.base.Connection at 0x7f55626b8a00>

In [2]:
def sql_query(query, con=db_con):
    """Run a SQL query through pandas and hand back whatever it returns.

    Parameters
    ----------
    query : SQL to execute, forwarded unchanged to ``pd.read_sql``.
    con : connectable forwarded to ``pd.read_sql``; defaults to the
        notebook-level ``db_con`` engine.

    Returns
    -------
    The result of ``pd.read_sql`` for the given query and connection.
    """
    result_frame = pd.read_sql(sql=query, con=con)
    return result_frame

def sql_data_manipulation(query, con=db_con):
    """Execute a DDL/DML statement (ALTER, UPDATE, ...) and return its result.

    Parameters
    ----------
    query : SQL string (or an already-built SQLAlchemy statement) to execute.
        Plain strings are wrapped in ``text()``, so a ``:name`` token in the
        SQL is interpreted as a bind parameter; escape a literal colon as
        ``\\:`` if one is ever needed.
    con : an ``Engine`` (the default, ``db_con``) or an object exposing
        ``execute()`` such as an open connection.

    Returns
    -------
    The result object produced by ``execute()``.
    """
    # Raw strings are no longer executable in SQLAlchemy 2.0; text() works on
    # both the 1.x and 2.x series.
    statement = text(query) if isinstance(query, str) else query

    if isinstance(con, Engine):
        # Engine.execute() was removed in SQLAlchemy 2.0. Engine.begin() opens
        # a connection, commits on success, rolls back on error and then
        # returns the connection to the pool.
        with con.begin() as connection:
            return connection.execute(statement)

    # Not an Engine: run on the supplied object and leave transaction
    # handling to the caller.
    return con.execute(statement)

## Calculate trip duration

### Extract days, hours, minutes and seconds from datetime difference

In [3]:
# Break the pickup -> dropoff interval into its day / hour / minute / second
# fields. The interval is computed once in the inner query and reused, and
# ORDER BY makes the two sampled rows deterministic (LIMIT without ORDER BY
# may return any rows).
query = """
SELECT
    trip_id,
    tpep_pickup_datetime, tpep_dropoff_datetime,
    date_diff,
    EXTRACT(day FROM date_diff) AS days,
    EXTRACT(hour FROM date_diff) AS hours,
    EXTRACT(minute FROM date_diff) AS minutes,
    EXTRACT(second FROM date_diff) AS seconds
FROM (
    SELECT
        trip_id,
        tpep_pickup_datetime, tpep_dropoff_datetime,
        (tpep_dropoff_datetime - tpep_pickup_datetime) AS date_diff
    FROM
        yellow_taxi_data
) AS t
ORDER BY trip_id
LIMIT 2
"""
sql_query(query)

Unnamed: 0,trip_id,tpep_pickup_datetime,tpep_dropoff_datetime,date_diff,days,hours,minutes,seconds
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,0 days 00:06:02,0.0,0.0,6.0,2.0
1,2,2021-01-01 00:51:20,2021-01-01 00:52:19,0 days 00:00:59,0.0,0.0,0.0,59.0


### Add duration column to table

In [5]:
def add_duration_column():
    """Add the ``duration`` column to ``yellow_taxi_data`` if it is missing.

    ``IF NOT EXISTS`` turns the statement into a no-op when the column is
    already present, so the function can be called on every notebook run
    instead of being toggled by hand.
    """
    query = """
    ALTER TABLE yellow_taxi_data
        ADD COLUMN IF NOT EXISTS duration DECIMAL
    """
    sql_data_manipulation(query)


# Idempotent: safe to execute on every Restart & Run All, whether or not the
# duration column already exists.
add_duration_column()

In [6]:
def append_duration():
    """Fill ``duration`` with the trip length in minutes for every row.

    The value is (dropoff - pickup) expressed in minutes. EXTRACT(EPOCH ...)
    returns the whole interval in seconds, which equals the previous
    ``days * 24 * 60 + hours * 60 + minutes + seconds / 60`` expression once
    divided by 60. Updating each row from its own columns avoids the
    correlated self-join on ``trip_id``, which re-read the table for every
    row and would fail if ``trip_id`` were not unique.
    """
    query = """
    UPDATE yellow_taxi_data
    SET duration =
        EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 60
    """
    sql_data_manipulation(query)


# Recomputes every row from the timestamp columns, so re-running is harmless.
append_duration()

In [7]:
# Spot-check the stored duration against the raw interval. ORDER BY keeps the
# sample stable: after a full-table UPDATE the physical row order can change,
# so a bare LIMIT is not guaranteed to return the same rows twice.
query = """
SELECT
    trip_id,
    (tpep_dropoff_datetime - tpep_pickup_datetime) AS date_diff,
    duration
FROM
    yellow_taxi_data
ORDER BY trip_id
LIMIT 10;
"""
sql_query(query)

Unnamed: 0,trip_id,date_diff,duration
0,1,0 days 00:06:02,6.033333
1,2,0 days 00:00:59,0.983333
2,3,0 days 00:27:36,27.6
3,4,0 days 00:15:13,15.216667
4,5,0 days 00:16:32,16.533333
5,6,0 days 00:08:01,8.016667
6,7,0 days 00:17:00,17.0
7,8,0 days 00:18:05,18.083333
8,9,0 days 00:20:57,20.95
9,10,0 days 00:13:34,13.566667
