# Uber Driver Data Analysis

(New version)

Works with SAR and Portal data.

Note that this notebook is not final, and more documentation is being added.

### Installing the required libraries with pip

In [None]:
! pip install numpy pandas portion

In [79]:
import datetime
import os
import glob
from pathlib import Path
from typing import Callable, Optional, Tuple
from itertools import product

import numpy as np
import pandas as pd
import portion as P

In [70]:
class Table(pd.DataFrame):
    """Some Python magic to be able to type hint things like df: Table['begin': datetime, 'end': datetime]"""
    __class_getitem__ = classmethod(type(list[str]))

In [74]:
IntervalTable = Table['begin': datetime, 'end': datetime]

In [71]:
data_folder = Path(os.getcwd()) / 'data'
raw_folder = data_folder / 'raw'
processed_folder = data_folder / 'processed'

In [72]:
def find_file(pattern: str, folder: Path) -> Path:
    matches = glob.glob(pattern, root_dir=folder)
    if len(matches) == 0:
        raise ValueError(f'Could not find file {pattern} in {folder}')
    elif len(matches) > 1:
        print(f'Found many matches for {pattern} in {folder}. Using the first.')
    return folder / matches[0]

In [73]:
def find_dataframe(pattern: str, folder: Path, usecols: list[str]) -> pd.DataFrame:
    file = find_file(pattern, folder)
    return pd.read_csv(file, usecols=usecols)[usecols].assign(file=file)

In [69]:
def df_to_interval(df: IntervalTable) -> P.interval:
    """Converts a DataFrame with columns 'begin' and 'end' into a Portion interval, merging entries that overlap."""
    return P.Interval(*[P.closed(row['begin'], row['end']) for row in df.to_dict('records')])


def interval_to_df(interval: P.interval) -> IntervalTable:
    """Converts a Portion interval into a dataframe with columns 'begin' and 'end'."""
    return pd.DataFrame([{'begin': begin, 'end': end} for _, begin, end, _ in P.to_data(interval)])

In [55]:
def merge_overlapping_intervals(df: IntervalTable, agg_dict: Optional[dict] = None) -> IntervalTable:
    agg_dict = agg_dict or {}

    def handle_groups(df: IntervalTable) -> IntervalTable:
        intervals = df[['begin', 'end']].sort_values(['begin', 'end'])
        group = 0
        group_end = intervals.end.iloc[0]

        def find_group(row):
            nonlocal group, group_end
            if row.begin <= group_end:
                group_end = max(row.end, group_end)
            else:
                group += 1
                group_end = row.end
            return group

        groups = intervals.apply(find_group, axis=1)
        return df.groupby(groups).agg({'begin': 'min', 'end': 'max', **agg_dict}).reset_index(drop=True)

    return df.groupby('status').apply(handle_groups).reset_index(level=1, drop=True).reset_index()

In [None]:
def union(dfl: IntervalTable, dfr: IntervalTable, agg_dict: Optional[dict] = None) -> IntervalTable:
    dfl = dfl.sort_values(['begin', 'end'])
    dfr = dfr.sort_values(['begin', 'end'])
    return dfl

In [83]:
def subtract(dfl: IntervalTable, dfr: IntervalTable) -> IntervalTable:
    res = dfl.copy()
    for (il, rl), (ir, rr) in product(dfl.iterrows(), dfr.iterrows()):
        if rr.begin <= rl.begin and rl.end <= rr.end:  # The left interval was contained in the right so we discard it
            res.drop(il, inplace=True)
        elif rr.begin <= rl.begin or rl.end <= rr.end:  # The left-right intersection is non-empty
            res.loc[il, 'begin'] = max(rl.begin, rr.begin)
            res.loc[il, 'end'] = min(rl.end, rr.end)
    return res

In [75]:
df = process_portal(raw_folder / 'portal' / 'Driver')
df

Unnamed: 0,begin,end,status,distance_km,uber_pay
0,2022-05-29 10:12:49+00:00,2022-05-29 10:18:57+00:00,P2,,
1,2022-05-29 10:18:57+00:00,2022-05-29 10:27:47+00:00,P3,2.052856,25.34
2,2022-05-28 12:04:08+00:00,2022-05-28 12:11:05+00:00,P2,,
3,2022-05-28 12:11:05+00:00,2022-05-28 12:22:45+00:00,P3,3.368586,27.32
4,2022-05-22 11:19:27+00:00,2022-05-22 11:23:43+00:00,P2,,
...,...,...,...,...,...
9973,2017-11-02 15:18:23+00:00,2017-11-02 15:28:43+00:00,P3,4.524063,14.24
9974,2017-11-02 14:55:09+00:00,2017-11-02 15:00:16+00:00,P2,,
9975,2017-11-02 15:00:16+00:00,2017-11-02 15:04:32+00:00,P3,1.132339,6.32
9976,2017-11-02 14:27:40+00:00,2017-11-02 14:36:01+00:00,P2,,


In [86]:
4989 ** 2

24890121

In [84]:
subtract(df[df.status == 'P2'], df[df.status == 'P3'])


KeyboardInterrupt



In [76]:
merge_overlapping_intervals(df, {'distance_km': 'sum', 'uber_paid': 'sum'})

CPU times: user 2.53 s, sys: 39.5 ms, total: 2.57 s
Wall time: 2.56 s


Unnamed: 0,status,begin,end,distance_km,uber_pay
0,P2,2017-11-02 14:27:40+00:00,2017-11-02 14:36:01+00:00,0.000000,0.00
1,P2,2017-11-02 14:55:09+00:00,2017-11-02 15:00:16+00:00,0.000000,0.00
2,P2,2017-11-02 15:11:51+00:00,2017-11-02 15:18:23+00:00,0.000000,0.00
3,P2,2017-11-02 15:53:50+00:00,2017-11-02 15:58:23+00:00,0.000000,0.00
4,P2,2017-11-03 12:30:20+00:00,2017-11-03 12:37:22+00:00,0.000000,0.00
...,...,...,...,...,...
9940,P3,2022-05-22 10:30:12+00:00,2022-05-22 10:38:14+00:00,3.838262,36.00
9941,P3,2022-05-22 11:04:37+00:00,2022-05-22 11:16:14+00:00,8.338381,34.58
9942,P3,2022-05-22 11:23:43+00:00,2022-05-22 11:41:01+00:00,7.053083,35.85
9943,P3,2022-05-28 12:11:05+00:00,2022-05-28 12:22:45+00:00,3.368586,27.32


## Specific to SAR data

In [5]:
def make_status_intervals(df: IntervalTable['status': str]) -> dict[str, P.interval]:
    return {s: df_to_interval(df[df.status == s]) for s in df.status.unique()}

In [6]:
def interval_merge_logic(lt: dict[str, P.interval], oo: dict[str, P.interval]) -> dict[str, pd.DataFrame]:
    P3 = lt['P3'] | oo['P3']
    P2 = (lt['P2'] | oo['P2']) - P3
    P1 = oo['P1'] - (P2 | P3)
    return {'P1': P1, 'P2': P2, 'P3': P3}

In [None]:
def main_interval_logic(lt: dict[str, P.interval], oo: dict[str, P.interval], P0_has_priority=False) -> pd.DataFrame:
    """Consider the following ordering of priorities: P3 > P2 > P1. P0_has_priority determines if P0 is on the left or right of inequalities."""
    if P0_has_priority:
        for d in [lt, oo]:
            for k in d.keys():
                if k != 'P0':
                    d[k] = d[k] - oo['P0']
    lt['P2'] = lt['P2'] - lt['P3']
    oo['P2'] = oo['P2'] - oo['P3']
    oo['P1'] = oo['P1'] - (oo['P2'] | oo['P3'])
    intervals = interval_merge_logic(lt, oo)
    return pd.concat([interval_to_df(i).assign(status=f'{s} consistent') for s, i in intervals.items()])

### One processing function per file

In [None]:
def process_lifetime_trips(folder: Path, pattern: str = '*Driver Lifetime Trips.csv') -> pd.DataFrame:
    df = find_dataframe(pattern, folder,
                        ['request_timestamp_local', 'begintrip_timestamp_local', 'dropoff_timestamp_local', 'status'])
    df = df[df.status.isin(['completed', 'fare_split'])].drop(columns='status')
    df.columns = ['request', 'begintrip', 'dropoff']
    df = time_tuples_to_periods(df, columns=df.columns,
                                extra_info=[lambda r: {'status': 'P2'}, lambda r: {'status': 'P3'}])
    return df

In [None]:
def process_on_off(folder: Path, pattern: str = '*Driver Online Offline.csv') -> pd.DataFrame:
    df = find_dataframe(pattern, folder,
                        ['begin_timestamp_local', 'end_timestamp_local', 'earner_state'])
    df.columns = ['begin', 'end', 'status']
    df = df.replace({r'\N': np.nan, 'ontrip': 'P3', 'enroute': 'P2', 'open': 'P1', 'offline': 'P0'})
    for col in ['begin', 'end']:
        df[col] = pd.to_datetime(df[col])
    return df.dropna()

In [None]:
def process_dispatches(path: str | Path) -> pd.DataFrame:
    # TODO (not finished)
    df = pd.read_csv(path, usecols=['start_timestamp_local', 'end_timestamp_local', 'dispatches', 'completed_trips',
                                    'accepts', 'rejects', 'expireds', 'driver_cancellations', 'rider_cancellations',
                                    'minutes_online', 'minutes_on_trip', 'trip_fares'])
    return df

In [None]:
def process_trip_status(path: str | Path) -> pd.DataFrame:
    pass  # TODO

In [None]:
def process_distance_traveled(path: str | Path) -> pd.DataFrame:
    pass  # TODO

In [None]:
def process_sar(
        folder: Path,
        interval_logic: Callable[[dict[str, pd.DataFrame], dict[str, pd.DataFrame]], pd.DataFrame]
) -> pd.DataFrame:
    lifetime_trips = process_lifetime_trips(folder)
    on_off = process_on_off(folder)
    return interval_logic(*[make_status_intervals(df) for df in [lifetime_trips, on_off]])

## Specific to Portal data

In [9]:
def process_portal(folder: Path):
    df = find_dataframe('*driver_lifetime_trips*.csv', folder,
                        ['Status', 'Local Request Timestamp', 'Begin Trip Local Timestamp', 'Local Dropoff Timestamp',
                         'Trip Distance (miles)', 'Duration (Seconds)', 'Local Original Fare'])
    df = df[df['Status'].isin(['completed', 'fare_split'])]
    df = time_tuples_to_periods(df, columns=['Local Request Timestamp', 'Begin Trip Local Timestamp',
                                             'Local Dropoff Timestamp'],
                                extra_info=[lambda r: {'status': 'P2'},
                                            lambda r: {'status': 'P3',
                                                       'distance_km': r['Trip Distance (miles)'] * 1.60934,
                                                       'uber_paid': r['Local Original Fare']}])
    return df

## Common to SAR and Portal data

In [11]:
def time_tuples_to_periods(
        df: pd.DataFrame,
        columns: list[str],
        extra_info: list[Callable[[pd.Series], dict]]
) -> pd.DataFrame:
    """
    Takes a dataframe where each row has N timestamps corresponding to instants of status changes,
    and converts each row into N-1 rows of periods in the corresponding status.

    Args:
        - df: a table having a number N > 1 of time-columns and L of entries.
        - columns: a list of n time-column names.
        - extra_info: a list of functions taking a row of df and outputting a dictionary of additional information. Cannot have keys 'begin' and 'end'.
    Returns:
        - periods: a table having L * (N-1) entries, each with a 'begin' and 'end' timestamp and associated information as specified by additional_info.
    Ex:
    df = pd.DataFrame([{'request_ts': '3:47 PM', 'begintrip_ts': '4:00 PM', 'dropoff_ts': '4:13 PM'}])
    columns = ['request_ts', 'begintrip_ts', 'dropoff_ts']
    extra_info = [lambda r: {'status': 'P2'}, lambda r: {'status': 'P3'}]
    time_tuples_to_periods(df, columns, extra_info)
    > begin    end      status
    > 3:47 PM  4:00 PM  P2
    > 4:00 PM  4:13 PM  P3
    """
    assert len(columns) == len(
        extra_info) + 1, f'The length of additional information should correspond to the number of generated periods (N-1).'
    periods = pd.DataFrame(df.apply(
        lambda r: [{'begin': r[b], 'end': r[e], **d(r)} for b, e, d in zip(columns, columns[1:], extra_info)],
        axis=1
    ).explode().to_list())
    for col in ['begin', 'end']:
        periods[col] = pd.to_datetime(periods[col])
    return periods

In [None]:
def split_in_half_days(df: pd.DataFrame) -> pd.DataFrame:
    """Makes sure that intervals spanning many days or many morning/afternoon periods are split"""

    # TODO write doc and give a better name
    def rec(begin: datetime.datetime, end: datetime.datetime, **rest) -> list[dict]:
        rows = []
        if begin.day != end.day:
            rows.append({'begin': begin, 'end': begin.replace(hour=23, minute=59, second=59), **rest})
            for days in range(end.day - begin.day - 1):
                inbetween = (begin + datetime.timedelta(days=days))
                rows.append({'begin': inbetween.replace(hour=0, minute=0, second=0),
                             'end': inbetween.replace(hour=23, minute=59, second=59), **rest})
            rows.append({'begin': end.replace(hour=0, minute=0, second=0), 'end': end, **rest})
            return [e for r in rows for e in rec(**r)]
        elif begin.hour < 12 <= end.hour:
            rows.append({'begin': begin, 'end': end.replace(hour=11, minute=59, second=59), **rest})
            rows.append({'begin': end.replace(hour=12, minute=0, second=0), 'end': end, **rest})
            return rows
        else:
            return [{'begin': begin, 'end': end, **rest}]

    return pd.DataFrame([e for d in df.to_dict('records') for e in rec(**d)])

In [None]:
def pipeline(
        periods: pd.DataFrame,
        interval_logic: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
        filtering_logic: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
        name: str = 'time_per_status',
        facets: list[str] = ('duration',)
) -> Tuple[pd.DataFrame, ...]:
    # Make sure that each interval is contained within a single 12-hour time period (AM or PM) by splitting when necessary
    splitted = split_in_half_days(periods)

    # Now we can compute all of these time properties since they will be the same for begin and end
    splitted['date'] = splitted.end.dt.date
    splitted['day_of_week'] = splitted.end.dt.day_name()
    splitted['day_type'] = (splitted.end.dt.weekday < 5).replace({True: 'week day', False: 'weekend'})
    splitted['time_of_day'] = (splitted.end.dt.hour < 12).replace({True: 'AM', False: 'PM'})
    splitted['week'] = splitted.end.dt.isocalendar().week
    splitted['year'] = splitted.end.dt.year
    splitted['duration'] = splitted.end - splitted.begin

    # Group entries by day and time period
    daily = splitted.groupby(['date', 'year', 'week', 'day_of_week',
                              'day_type', 'time_of_day', 'status']).agg({f: 'sum' for f in facets}).reset_index()
    daily['date'] = pd.to_datetime(daily.date)

    # Filter the data if a filtering function is specified
    if filtering_logic is not None:
        filtered = filtering_logic(daily)
        filtered['status'] = filtered.status + '|filtered'
        daily = pd.concat([daily, filtered])

    processed_folder.mkdir(parents=True, exist_ok=True)
    daily.to_csv(processed_folder / f'{name}_daily.csv', index=False)
    weekly = daily.groupby(['year', 'week', 'status']).agg({f: 'sum' for f in facets}).reset_index()
    weekly.to_csv(processed_folder / f'{name}_weekly.csv', index=False)
    yearly = weekly.groupby(['year', 'status']).agg({f: 'sum' for f in facets}).reset_index()
    yearly.to_csv(processed_folder / f'{name}_yearly.csv', index=False)
    total = yearly.groupby(['status']).agg({f: 'sum' for f in facets}).reset_index()
    total.to_csv(processed_folder / f'{name}_total.csv', index=False)
    return daily, weekly, yearly, total

## Tying everything together

In [None]:
def guillaume_filtering_logic(
        daily: pd.DataFrame,
        percentage_df_path: Optional[str | Path] = data_folder / 'percentage.csv'
) -> pd.DataFrame:
    # First, weight P1 times based on the percentage that Guillaume was working for Uber on that month
    percentage = pd.read_csv(percentage_df_path)
    percentage['Uber'] /= 100
    P1 = daily.loc[daily.status.str.contains('P1')].copy()
    for i, row in percentage.iterrows():
        if row['Uber'] == 0 and P1[(P1.date.dt.year == row.year) & (
                P1.date.dt.month == row.month)].duration.sum() > datetime.timedelta(0):
            print(
                f'bad specification for {row.year}/{row.month}: activity found even though specified percentage was 0')
        P1['duration'] = np.where(
            (P1.date.dt.year == row.year) & (P1.date.dt.month == row.month),
            P1['duration'] * row['Uber'],
            P1['duration'])
    # Second, remove all morning weekday entries when Guillaume was working for IMAD, except for the specific dates below
    dates_to_keep = [datetime.date(2020, 11, 26),
                     *pd.date_range(datetime.date(2020, 12, 21), datetime.date(2020, 12, 25)).values,
                     *pd.date_range(datetime.date(2021, 2, 1), datetime.date(2021, 2, 12)).values,
                     *pd.date_range(datetime.date(2021, 8, 16), datetime.date(2021, 8, 28)).values,
                     *pd.date_range(datetime.date(2021, 9, 20), datetime.date(2021, 10, 3)).values,
                     *pd.date_range(datetime.date(2021, 11, 25), datetime.date(2021, 12, 12)).values,
                     *pd.date_range(datetime.date(2022, 4, 25), datetime.date(2022, 5, 13)).values]
    P1.drop(P1[(P1.time_of_day == 'AM') &
               (P1.day_type == 'week day') &
               ~P1.date.apply(lambda d: d.date()).isin(dates_to_keep)].index,
            inplace=True)
    return P1

You can find SAR samples in the KDrive at:
- old: `hestiaai /Common documents/HestiaLabs/PDIO- Data/Driver Data/Guillaume data/Lemoine Guillaume/Lemoine_SAR_06.08.2022.zip`.
- new: `PersonalData.IO /Lemoine_12102022-20221110T164542Z-001.zip`

In [None]:
_ = pipeline(process_sar(raw_folder / 'new', interval_logic=main_interval_logic),
             name='sar',
             filtering_logic=guillaume_filtering_logic,
             facets=['duration'])

A Portal sample can be found on our KDrive at `hestiaai /Common documents/HestiaLabs/PDIO- Data/Driver Data/Guillaume data/Lemoine Guillaume/202207/Uber Data F0699B53.zip`

In [None]:
_ = pipeline(process_portal(raw_folder / 'portal' / 'Driver'),
             name='portal',
             filtering_logic=guillaume_filtering_logic,
             facets=['duration', 'distance_km', 'uber_paid'])