# Uber Driver Data Analysis

(New version)

Works with SAR and Portal data.

Note that this notebook is not final, and more documentation is being added.

#### [Optional] Installing the required libraries with pip

In [1]:
! pip install numpy pandas portion



In [2]:
import datetime
import glob
import os
from pathlib import Path
from typing import Callable, Optional, Tuple, Any

import numpy as np
import pandas as pd

In [3]:
class Table(pd.DataFrame):
    """A DataFrame subclass that may be subscripted inside type hints.

    The subscript only documents the expected columns, for instance
    ``df: Table['begin': datetime, 'end': datetime]``. Its content is
    ignored and ``list[str]`` is returned for every subscript.
    """

    def __class_getitem__(cls, column_spec):
        # The column specification is purely informative and never inspected
        return list[str]

In [4]:
# Alias of datetime.datetime, used in the type hints of this notebook
Timestamp = datetime.datetime

In [5]:
# Type-hint alias for tables whose rows are periods delimited by 'begin' and 'end' timestamps
PeriodTable = Table['begin': Timestamp, 'end': Timestamp]

In [6]:
# Root folder of the data: the 'data' directory inside the current working directory
data_folder = Path.cwd() / 'data'

In [7]:
def find_file(pattern: str, folder: Path) -> Path:
    """
    Looks for a file matching the given pattern inside the given folder
    :param pattern: a glob file pattern
    :param folder: a path leading to a folder where the pattern should be applied
    :return: a path to the first file matching the pattern, or a value error if none.
    """
    matches = glob.glob(pattern, root_dir=folder)
    if len(matches) == 0:
        raise ValueError(f'Could not find file {pattern} in {folder}')
    elif len(matches) > 1:
        print(f'Found many matches for {pattern} in {folder}. Using the first.')
    return folder / matches[0]

In [8]:
def find_table(pattern: str, folder: Path, usecols: Optional[list[str]] = None) -> Table:
    """
    Locates a csv-like file through find_file and loads it with pandas.
    :param pattern: a glob file pattern
    :param folder: a path leading to a folder where the pattern should be applied
    :param usecols: optional list of column names to load; when given, the returned columns follow this order
    :return: the loaded table with an additional 'file' column holding the name of the file that was read
    """
    path = find_file(pattern, folder)
    loaded = pd.read_csv(path, usecols=usecols)
    # read_csv keeps the file's column order, so re-select to get the requested order
    ordered = loaded if usecols is None else loaded[usecols]
    return ordered.assign(file=path.name)

In [9]:
def find_date_range(pattern: str, folder: Path, date_cols: list[str]) -> Tuple[Timestamp, Timestamp]:
    """
    Finds the earliest and latest dates present in the given date columns of a data file.
    :param pattern: a glob file pattern
    :param folder: a path leading to a folder where the pattern should be applied
    :param date_cols: a list of column names that are dates
    :return: the minimum and maximum observed dates
    Usage:
    find_date_range('*Driver Trip Status.csv', data_folder / 'Brice' / 'raw' / 'SAR',
                    ['begin_timestamp_local', 'end_timestamp_local'])
    """
    # Parse every requested column as datetimes, leaving out the extra 'file' column added by find_table
    dates = find_table(pattern, folder, usecols=date_cols)[date_cols].apply(pd.to_datetime)
    # First min()/max() reduces each column, the second one reduces across columns
    return dates.min().min(), dates.max().max()

Currently unused code

def df_to_interval(df: PeriodTable) -> P.Interval:
    """
    Converts a DataFrame with columns 'begin' and 'end' into a Portion interval, merging entries that overlap.
    :param df: a table with one period per row
    :return: a single P.Interval built from one closed interval per row
    """
    # NOTE(review): `P` is presumably `import portion as P`; no such import is visible in this notebook
    closed_periods = [P.closed(row['begin'], row['end']) for row in df.to_dict('records')]
    return P.Interval(*closed_periods)


def interval_to_df(interval: P.Interval) -> PeriodTable:
    """
    Converts a Portion interval into a dataframe with columns 'begin' and 'end'.
    :param interval: the interval to convert
    :return: a table with one row per 4-tuple returned by P.to_data, keeping only the two bound values
    """
    # The first and last members of each P.to_data tuple are discarded
    return pd.DataFrame([{'begin': begin, 'end': end} for _, begin, end, _ in P.to_data(interval)])

def make_status_intervals(df: PeriodTable) -> dict[str, P.Interval]:
    """
    Builds one Portion interval per distinct value of the 'status' column.
    :param df: a table of periods with a 'status' column
    :return: a dictionary mapping each status to the interval made of that status' rows
    """
    return {s: df_to_interval(df[df.status == s]) for s in df.status.unique()}

def interval_merge_logic(lt: dict[str, P.Interval], oo: dict[str, P.Interval]) -> dict[str, P.Interval]:
    """
    Combines two status -> interval dictionaries with the priority P3 > P2 > P1.
    :param lt: intervals per status; only 'P2' and 'P3' are read
    :param oo: intervals per status; 'P1', 'P2' and 'P3' are read
    :return: a dictionary with keys 'P1', 'P2' and 'P3' whose intervals do not overlap each other
    """
    P3 = lt['P3'] | oo['P3']
    # P2 keeps only what is not already claimed by P3
    P2 = (lt['P2'] | oo['P2']) - P3
    # P1 only comes from `oo` and keeps what neither P2 nor P3 covers
    P1 = oo['P1'] - (P2 | P3)
    return {'P1': P1, 'P2': P2, 'P3': P3}

def main_interval_logic(lt: dict[str, P.Interval], oo: dict[str, P.Interval], P0_has_priority=False) -> Table:
    """
    Consider the following ordering of priorities: P3 > P2 > P1. P0_has_priority determines if P0 is on the
    left or right of inequalities.
    :param lt: intervals per status; 'P2' and 'P3' are read
    :param oo: intervals per status; 'P1', 'P2', 'P3' are read, and 'P0' when P0_has_priority is set
    :param P0_has_priority: when True, oo['P0'] is first subtracted from every other interval
    :return: a table of periods whose 'status' is '<status> consistent'
    """
    # Work on shallow copies so that the dictionaries passed by the caller are left untouched
    lt, oo = dict(lt), dict(oo)
    if P0_has_priority:
        offline = oo['P0']
        for d in (lt, oo):
            for k in d:
                if k != 'P0':
                    d[k] = d[k] - offline
    # Enforce the priorities inside each source before merging the two sources
    lt['P2'] = lt['P2'] - lt['P3']
    oo['P2'] = oo['P2'] - oo['P3']
    oo['P1'] = oo['P1'] - (oo['P2'] | oo['P3'])
    intervals = interval_merge_logic(lt, oo)
    return pd.concat([interval_to_df(i).assign(status=f'{s} consistent') for s, i in intervals.items()])

In [10]:
def time_tuples_to_periods(
        table: Table['t1': Timestamp, 't2': Timestamp, 't3': Timestamp],
        columns: list[str],
        extra_info: list[Callable[[pd.Series], dict]]
) -> pd.DataFrame:
    """
    Takes a dataframe where each row has N timestamps corresponding to instants of status changes,
    and converts each row into N-1 rows of periods in the corresponding status.

    :param: table: a table having a number N > 1 of time-columns and L of entries.
    :param: columns: a list of N time-column names present in {table}.
    :param: extra_info: a list of N-1 functions taking a row of the table and outputting a dictionary of
            additional information. Cannot have keys 'begin' and 'end'.
    :return: periods: a table having L * (N-1) entries, each with a 'begin' and 'end' timestamp and associated
             information as specified by extra_info. An empty table yields an empty result that still has
             the 'begin' and 'end' columns.
    :raises AssertionError: if len(columns) != len(extra_info) + 1
    Usage:
    df = pd.DataFrame([{'request_ts': '3:47 PM', 'begintrip_ts': '4:00 PM', 'dropoff_ts': '4:13 PM'}])
    columns = ['request_ts', 'begintrip_ts', 'dropoff_ts']
    extra_info = [lambda r: {'status': 'P2'}, lambda r: {'status': 'P3'}]
    time_tuples_to_periods(df, columns, extra_info)
    > begin    end      status
    > 3:47 PM  4:00 PM  P2
    > 4:00 PM  4:13 PM  P3
    """
    assert len(columns) == len(extra_info) + 1, \
        'The length of additional information should correspond to the number of generated periods (N-1).'
    # Consecutive column pairs delimit the periods; each pair comes with its own extra-information function
    spans = list(zip(columns, columns[1:], extra_info))
    records = [{'begin': row[b], 'end': row[e], **info(row)}
               for _, row in table.iterrows()
               for b, e, info in spans]
    # Without records the frame would have no columns at all, so create 'begin'/'end' explicitly
    periods = pd.DataFrame(records) if records else pd.DataFrame(columns=['begin', 'end'])
    for col in ['begin', 'end']:
        periods[col] = pd.to_datetime(periods[col])
    return periods

In [11]:
def merge_overlapping_intervals(table: PeriodTable, agg_dict: Optional[dict] = None) -> PeriodTable:
    """
    Groups the given table by status, then sorts all intervals by begin datetime and merges overlapping entries.
    :param table: the table whose intervals should be merged; needs 'begin', 'end' and 'status' columns
    :param agg_dict: maps each column to the operation that should be applied to combine interval attributes
                     when merging them. Defaults to summing every column other than begin/end/status.
    :return: a table with merged intervals per status, with columns status, begin, end and the aggregated ones
    """
    agg_dict = agg_dict or {c: 'sum' for c in table.columns if c not in ['begin', 'end', 'status']}

    def handle_groups(group: PeriodTable) -> PeriodTable:
        """Merges the overlapping intervals of a single status group."""
        # Positional index so that cluster ids can be written back at each row's position in `group`,
        # whatever the index labels of the original table are (they may even be duplicated)
        intervals = group[['begin', 'end']].reset_index(drop=True).sort_values(['begin', 'end'])
        cluster_ids = np.empty(len(intervals), dtype=int)
        cluster = 0
        cluster_end = intervals['end'].iloc[0]
        for position, begin, end in zip(intervals.index, intervals['begin'], intervals['end']):
            if begin <= cluster_end:
                # Overlaps (or touches) the running cluster: extend it if needed
                cluster_end = max(end, cluster_end)
            else:
                # A gap was found: start a new cluster
                cluster += 1
                cluster_end = end
            cluster_ids[position] = cluster
        # Group only the rows of this status, instead of re-scanning the whole table
        return group.groupby(cluster_ids).agg({'begin': 'min', 'end': 'max', **agg_dict}).reset_index(drop=True)

    return table.groupby('status').apply(handle_groups).reset_index(level=1, drop=True).reset_index()

In [12]:
def mile2km(n_miles: float) -> float:
    """
    Converts a distance in miles to kilometres.
    :param n_miles: a distance in miles
    :return: the same distance in kilometres
    """
    km_per_mile = 1.609344
    return n_miles * km_per_mile

#### SAR preprocessing logic

In [13]:
def load_lifetime_trips(folder: Path, pattern: str = '*Driver Lifetime Trips.csv') -> PeriodTable:
    """
    Loads the lifetime-trips file and converts every completed trip into a P2 and a P3 period.
    :param folder: the folder containing the file
    :param pattern: a glob pattern identifying the lifetime-trips file
    :return: a table of periods with columns begin, end, status, distance_km, file and, for P3 rows, uber_paid
    """
    timestamp_cols = ['request_timestamp_local', 'begintrip_timestamp_local', 'dropoff_timestamp_local']
    numeric_cols = ['request_to_begin_distance_miles', 'trip_distance_miles', 'original_fare_local']
    table = find_table(pattern, folder, [*timestamp_cols, 'status', *numeric_cols])
    table = table[table.status == 'completed'].drop(columns='status')
    # '\N' entries are turned into NaN so that the numeric columns can be cast to float
    table = table.replace({r'\N': np.nan})
    # All three numeric columns are converted: trip_distance_miles is multiplied by a float in mile2km below
    table = table.astype({col: float for col in numeric_cols})
    return time_tuples_to_periods(
        table,
        columns=timestamp_cols,
        # request -> begin of trip is P2, begin of trip -> dropoff is P3
        extra_info=[lambda r: {'status': 'P2',
                               'distance_km': mile2km(r['request_to_begin_distance_miles']),
                               'file': r['file']},
                    lambda r: {'status': 'P3',
                               'distance_km': mile2km(r['trip_distance_miles']),
                               'uber_paid': r['original_fare_local'],
                               'file': r['file']}])

In [14]:
def load_on_off(folder: Path, pattern: str = '*Driver Online Offline.csv') -> PeriodTable:
    """
    Loads the online/offline file as a table of periods.
    :param folder: the folder containing the file
    :param pattern: a glob pattern identifying the online/offline file
    :return: a table with 'begin', 'end' (parsed as datetimes), 'status' (earner states mapped to P0..P3),
             the begin/end coordinates and the file name; rows containing any missing value are dropped
    """
    wanted_columns = ['begin_timestamp_local', 'end_timestamp_local', 'earner_state',
                      'begin_lat', 'begin_lng', 'end_lat', 'end_lng']
    new_names = {'begin_timestamp_local': 'begin', 'end_timestamp_local': 'end', 'earner_state': 'status'}
    # '\N' entries become NaN; earner states are translated to the P0..P3 status codes
    substitutions = {r'\N': np.nan, 'ontrip': 'P3', 'enroute': 'P2', 'open': 'P1', 'offline': 'P0'}
    periods = find_table(pattern, folder, wanted_columns).rename(columns=new_names).replace(substitutions)
    periods['begin'] = pd.to_datetime(periods['begin'])
    periods['end'] = pd.to_datetime(periods['end'])
    return periods.dropna()

In [15]:
def load_dispatches(folder: Path, pattern: str = 'TODO') -> Table:
    """
    Loads the dispatch-related columns of a data file, without any further processing.
    :param folder: the folder containing the file
    :param pattern: a glob pattern identifying the file (the default is still a placeholder)
    :return: the loaded table, as returned by find_table
    """
    # TODO (not finished)
    dispatch_columns = ['start_timestamp_local', 'end_timestamp_local', 'dispatches', 'completed_trips',
                        'accepts', 'rejects', 'expireds', 'driver_cancellations', 'rider_cancellations',
                        'minutes_online', 'minutes_on_trip', 'trip_fares']
    return find_table(pattern, folder, dispatch_columns)

In [16]:
def load_trip_status(folder: Path, pattern: str = '*Driver Trip Status.csv') -> PeriodTable:
    """
    Loads the trip-status file as a table of periods.
    :param folder: the folder containing the file
    :param pattern: a glob pattern identifying the trip-status file
    :return: a table with 'begin' and 'end' parsed as datetimes, followed by 'status', 'end_reason' and 'file'
    """
    raw = find_table(pattern, folder, ['begin_timestamp_local', 'end_timestamp_local', 'status', 'end_reason'])
    # find_table returns the requested columns in the requested order, so these are the first two columns
    periods = raw.rename(columns={'begin_timestamp_local': 'begin', 'end_timestamp_local': 'end'})
    periods['begin'] = pd.to_datetime(periods['begin'])
    periods['end'] = pd.to_datetime(periods['end'])
    return periods

def load_distance_traveled(folder: Path, pattern: str = 'TODO') -> pd.DataFrame:
    """Placeholder for a loader that is not implemented yet; currently returns None."""
    # TODO
    return None

In [17]:
def load_sar(folder: Path) -> Table:
    """
    Loads the SAR files of a driver into a single table of periods.
    :param folder: the folder containing the SAR csv files
    :return: the lifetime-trips periods followed by the online/offline periods, with a fresh 0..n-1 index
    """
    sources = [load_lifetime_trips(folder), load_on_off(folder)]
    return pd.concat(sources, ignore_index=True)

#### Portal preprocessing

In [18]:
def load_portal(folder: Path):
    """
    Loads the Portal lifetime-trips file and converts every completed trip into a P2 and a P3 period.
    :param folder: the folder containing the Portal csv files
    :return: a table of periods with columns begin, end, status and, for P3 rows, distance_km and uber_paid
    """
    timestamp_columns = ['Local Request Timestamp', 'Begin Trip Local Timestamp', 'Local Dropoff Timestamp']

    def pickup_info(r):
        """Attributes of the period between the request and the beginning of the trip."""
        return {'status': 'P2'}

    def trip_info(r):
        """Attributes of the period between the beginning of the trip and the dropoff."""
        return {'status': 'P3',
                'distance_km': mile2km(r['Trip Distance (miles)']),
                'uber_paid': r['Local Original Fare']}

    trips = find_table('*driver_lifetime_trips*.csv', folder,
                       ['Status', *timestamp_columns,
                        'Trip Distance (miles)', 'Duration (Seconds)', 'Local Original Fare'])
    completed = trips[trips['Status'] == 'completed']
    return time_tuples_to_periods(completed, columns=timestamp_columns, extra_info=[pickup_info, trip_info])

#### Guillaume-specific logic

In [19]:
def guillaume_filtering_logic(
        daily: Table,
        percentage_df_path: Optional[str | Path] = data_folder / 'Guillaume' / 'percentage.csv'
) -> Table:
    """
    Computes filtered versions of the daily P1 duration columns.
    :param daily: the daily table; needs a 'date' column and columns whose name contains 'duration_h' and 'P1'
    :param percentage_df_path: csv file with 'year', 'month' and 'Uber' columns, the latter being a percentage
    :return: a table indexed like `daily` with one '<column>(filtered)' column per P1 duration column
    """
    # First, weight P1 times based on the percentage that Guillaume was working for Uber on that month
    percentage = pd.read_csv(percentage_df_path)
    percentage['Uber'] /= 100
    filtered = pd.DataFrame(index=daily.index)
    filtered['datetime'] = pd.to_datetime(daily['date'])
    duration_P1_cols = [col for col in daily.columns if all(f in col for f in ['duration_h', 'P1'])]
    years, months = filtered.datetime.dt.year, filtered.datetime.dt.month
    for c in duration_P1_cols:
        # Start from the unweighted values and accumulate the weighting of every listed month.
        # Rebuilding the column from `daily` at each iteration would only keep the last month's weighting.
        weighted = daily[c].to_numpy()
        for _, row in percentage.iterrows():
            in_month = (years == row.year) & (months == row.month)
            weighted = np.where(in_month, daily[c] * row['Uber'], weighted)
        filtered[c] = weighted

    def date_range(from_date: Tuple[int, ...], to_date: Tuple[int, ...]) -> list[datetime.date]:
        """Lists every day between two (year, month, day) tuples, both included, as datetime.date objects."""
        return [day.date() for day in pd.date_range(datetime.date(*from_date), datetime.date(*to_date))]

    # Second, remove all morning weekday entries when Guillaume was working for IMAD, except for the specific dates below
    dates_to_keep = [datetime.date(2020, 11, 26), *date_range((2020, 12, 21), (2020, 12, 25)),
                     *date_range((2021, 2, 1), (2021, 2, 12)), *date_range((2021, 8, 16), (2021, 8, 28)),
                     *date_range((2021, 9, 20), (2021, 10, 3)), *date_range((2021, 11, 25), (2021, 12, 12)),
                     *date_range((2022, 4, 25), (2022, 5, 13))]
    duration_P1_weekday_AM_cols = [col for col in daily.columns
                                   if all(f in col for f in ['duration_h', 'P1', 'AM', 'weekday'])]
    # Both sides of the membership test hold datetime.date objects, so the comparison is like-for-like
    kept = filtered.datetime.dt.date.isin(dates_to_keep)
    filtered.loc[~kept, duration_P1_weekday_AM_cols] = 0
    return filtered[duration_P1_cols].rename(columns={col: f'{col}(filtered)' for col in duration_P1_cols})

In [20]:
def split_AM_PM(table: PeriodTable) -> PeriodTable:
    """
    Splits intervals spanning many days or many morning/afternoon periods into one interval per half-day.
    If the interval is associated to numerical values (like distance or money), these values are
    transferred to the new intervals but are weighted according to the new intervals' duration.
    Pieces end at 11:59:59 or 23:59:59 and the following piece begins at 12:00:00 or 00:00:00, so that
    the begin and end of every piece fall in the same half-day.
    :param table: the table whose intervals should be split; needs 'begin' and 'end' columns
    :return: a table with no intervals spanning over AM and PM
    """
    half_day = datetime.timedelta(hours=12)

    def scaled_interval(begin: Timestamp, end: Timestamp, attributes: dict, og_duration) -> dict:
        """Creates an interval (a dict with begin, end and other attributes) given the original attributes and
        the original duration: float attributes are scaled by the share of the original duration covered."""
        return {'begin': begin, 'end': end,
                **{k: v * (end - begin) / og_duration if isinstance(v, float) else v for k, v in attributes.items()}}

    def half_day_start(moment: Timestamp) -> Timestamp:
        """Returns midnight or noon, whichever begins the half-day containing the given moment."""
        return moment.replace(hour=0 if moment.hour < 12 else 12, minute=0, second=0, microsecond=0)

    def rec(begin: Timestamp, end: Timestamp, **rest) -> list[dict]:
        og_duration = end - begin
        last_start = half_day_start(end)
        # Nothing to split when the interval lies within a single half-day
        if not half_day_start(begin) < last_start:
            return [{'begin': begin, 'end': end, **rest}]
        rows = []
        piece_begin = begin
        # Walk over full calendar half-days, which stays correct across month and year boundaries
        while half_day_start(piece_begin) < last_start:
            start = half_day_start(piece_begin)
            piece_end = piece_begin.replace(hour=start.hour + 11, minute=59, second=59)
            rows.append(scaled_interval(piece_begin, piece_end, rest, og_duration))
            piece_begin = start + half_day
        rows.append(scaled_interval(piece_begin, end, rest, og_duration))
        return rows

    return pd.DataFrame([e for d in table.to_dict('records') for e in rec(**d)])

In [21]:
def split_hours(table: PeriodTable) -> PeriodTable:
    """
    Splits intervals spanning many clock hours in as many intervals as hours covered by the interval.
    If the interval is associated to numerical values (like distance or money), these values are
    transferred to the new intervals but are weighted according to the new intervals' duration.
    Pieces end at hh:59:59 and the following piece begins at the next hh:00:00, so that the begin and end
    of every piece fall in the same clock hour.
    :param table: the table whose intervals should be split; needs 'begin' and 'end' columns
    :return: a table with no intervals spanning over more than one clock hour
    """
    one_hour = datetime.timedelta(hours=1)

    def scaled_interval(begin: Timestamp, end: Timestamp, attributes: dict, og_duration) -> dict:
        """Creates an interval (a dict with begin, end and other attributes) given the original attributes and
        the original duration: float attributes are scaled by the share of the original duration covered."""
        return {'begin': begin, 'end': end,
                **{k: v * (end - begin) / og_duration if isinstance(v, float) else v for k, v in attributes.items()}}

    def hour_start(moment: Timestamp) -> Timestamp:
        """Returns the beginning of the clock hour containing the given moment."""
        return moment.replace(minute=0, second=0, microsecond=0)

    def rec(begin: Timestamp, end: Timestamp, **rest) -> list[dict]:
        og_duration = end - begin
        last_start = hour_start(end)
        # Nothing to split when the interval lies within a single clock hour
        if not hour_start(begin) < last_start:
            return [{'begin': begin, 'end': end, **rest}]
        rows = []
        piece_begin = begin
        # Walk over full calendar hours (not hour-of-day numbers), which stays correct across midnight
        # and yields every intermediate hour exactly once
        while hour_start(piece_begin) < last_start:
            rows.append(scaled_interval(piece_begin, piece_begin.replace(minute=59, second=59), rest, og_duration))
            piece_begin = hour_start(piece_begin) + one_hour
        rows.append(scaled_interval(piece_begin, end, rest, og_duration))
        return rows

    return pd.DataFrame([e for d in table.to_dict('records') for e in rec(**d)])

In [22]:
def find_week_limits(date: Timestamp) -> str:
    """
    Describes the Monday-to-Sunday week containing the given date.
    :param date: any moment of the week of interest
    :return: a string formatted as 'YYYY/MM/DD - YYYY/MM/DD' (Monday first, Sunday last)
    """
    monday = date - datetime.timedelta(days=date.weekday())
    sunday = monday + datetime.timedelta(days=6)
    first_day, last_day = (str(limit.date()).replace('-', '/') for limit in (monday, sunday))
    return f'{first_day} - {last_day}'

In [23]:
def select(d: dict[str], keep: Optional[list[str]] = None, drop: Optional[list[str]] = None) -> dict[str]:
    """
    Filters the entries of a dictionary by key; exactly one of keep and drop has to be given.
    :param d: the dictionary to filter
    :param keep: keys that should be retained (all others are removed)
    :param drop: keys that should be removed (all others are retained)
    :return: a new dictionary with the selected entries, in their original order
    """
    assert (keep is None) != (drop is None), 'Only one of keep or drop can be specified'
    if keep is not None:
        def is_wanted(key):
            return key in keep
    elif drop is not None:
        def is_wanted(key):
            return key not in drop
    else:
        return None
    return {key: value for key, value in d.items() if is_wanted(key)}

### Running the pipeline

In [24]:
# Quantities that the pipeline aggregates per status
all_facets = ['duration_h', 'distance_km', 'uber_paid']
# Named functions deriving a categorical property from a timestamp
all_time_properties = {'day_of_week': lambda d: d.day_name(),
                       'day_type': lambda d: 'weekday' if d.weekday() < 5 else 'weekend',
                       'time_of_day': lambda d: 'AM' if d.hour < 12 else 'PM',
                       # Night covers hours 23 and 0 to 6. The previous test `23 < d.hour` could never be
                       # true, so 23:00-23:59 was labelled 'day'.
                       'night': lambda d: 'night' if d.hour <= 6 or d.hour >= 23 else 'day'}

In [25]:
def pipeline(
        periods: PeriodTable,
        interval_logic: Optional[Callable[[PeriodTable], PeriodTable]] = None,
        filtering_logic: Optional[Callable[[PeriodTable], PeriodTable]] = None,
        time_properties: Optional[dict[str, Callable[[Timestamp], Any]]] = None,
        name: str = 'analysis',
        facets: list[str] = all_facets,
        save_at: Path = data_folder / 'results',
        compute_most_lucrative_months: bool = True,
) -> Tuple[pd.DataFrame, ...]:
    """
    Aggregates a table of periods per hour, day, week, month and year, and writes every table to disk.
    :param periods: a table with 'begin', 'end', 'status' and the facet columns other than 'duration_h'
                    (which is computed here). The table passed by the caller is not modified.
    :param interval_logic: optional function applied to the periods before anything else
    :param filtering_logic: optional function receiving the daily table and returning extra columns, which
                            are appended to the daily table and summed in the coarser tables
    :param time_properties: optional functions deriving a property from the end of a period; facet columns
                            are broken down by these properties
    :param name: prefix of the written csv files
    :param facets: the quantities to aggregate
    :param save_at: the folder where csv files are written (created if missing)
    :param compute_most_lucrative_months: whether to also write the monthly table sorted by total 'uber_paid'
    :return: the hourly, daily, weekly, yearly and total tables. The monthly table is written to disk but is
             not part of the returned tuple.
    """
    # Apply interval logic if specified
    if interval_logic is not None:
        periods = interval_logic(periods)

    # Compute the duration of a period once, in the beginning.
    # assign() returns a new frame, so the table passed by the caller is not modified.
    periods = periods.assign(duration_h=(periods.end - periods.begin) / datetime.timedelta(hours=1))
    # Split intervals spanning many hours
    periods = split_hours(periods)

    # Pivot table so that there is a single line per interval and per granularity of interest
    periods = periods.pivot(index=['begin', 'end'], columns=['status'], values=facets).reset_index()
    periods[periods == 0] = np.nan
    periods.drop(columns=periods.columns[periods.isna().all()],
                 inplace=True)  # this and previous remove columns that have only 0/nans
    # Merges column multiindex into a single index by joining the levels
    periods.columns = periods.columns.map(lambda t: '.'.join(t) if t[1] else t[0])

    agg_dict = {c: 'sum' for c in periods.columns if any(f in c for f in facets)}

    date_info = list(time_properties.keys()) if time_properties is not None else []

    # Compute these datetime properties since they will be the same for begin and end (thanks to split_hours)
    if time_properties is not None:
        for k, f in time_properties.items():
            periods[k] = periods.end.apply(f)
        periods = periods.pivot(index=['begin', 'end'], columns=date_info, values=list(agg_dict.keys())).reset_index()
        periods.columns = periods.columns.map(lambda t: '.'.join(t) if t[1] else t[0])

    # Column names changed if the second pivot ran, so the aggregation dictionary is rebuilt
    agg_dict = {c: 'sum' for c in periods.columns if any(f in c for f in facets)}

    periods['hour'] = periods.end.apply(lambda d: f'{d.hour}-{(d + datetime.timedelta(hours=1)).hour}')
    periods['date'] = periods.end.dt.date
    periods['week'] = periods.end.apply(find_week_limits)
    periods['month'] = periods.end.apply(lambda d: f'{d.month:02d}. {d.month_name()}')
    periods['year'] = periods.end.dt.year

    hourly = periods.groupby(['date', 'hour']).agg(agg_dict).reset_index()
    daily = periods.groupby(['date', 'year', 'month', 'week']).agg(agg_dict).reset_index()

    # Filter the data if a filtering function is specified
    if filtering_logic is not None:
        filtered = filtering_logic(daily)
        daily = pd.concat([daily, filtered], axis=1)
        agg_dict = {**agg_dict, **{c: 'sum' for c in filtered.columns}}

    weekly = daily.groupby(['week']).agg(agg_dict).reset_index()
    monthly = daily.groupby(['year', 'month']).agg(agg_dict).reset_index()
    yearly = daily.groupby(['year']).agg(agg_dict).reset_index()
    total = yearly.agg(agg_dict).to_frame().T

    # The output folder has to exist before the first file is written
    save_at.mkdir(parents=True, exist_ok=True)

    if compute_most_lucrative_months:
        most_lucrative_months: Table = monthly.copy()
        most_lucrative_months['uber_paid_total'] = monthly[[c for c in monthly.columns if 'uber_paid' in c]].sum(axis=1)
        most_lucrative_months.sort_values('uber_paid_total', ascending=False, inplace=True)
        most_lucrative_months.to_csv(save_at / f'{name}_4_bis_most_lucrative_months.csv', index=False,
                                     float_format='%.2f')

    # Writing tables to disk
    periods.to_csv(save_at / f'{name}_0_periods.csv', index=False, float_format='%.2f')
    hourly.to_csv(save_at / f'{name}_1_hourly.csv', index=False, float_format='%.2f')
    daily.to_csv(save_at / f'{name}_2_daily.csv', index=False, float_format='%.2f')
    weekly.to_csv(save_at / f'{name}_3_weekly.csv', index=False, float_format='%.2f')
    monthly.to_csv(save_at / f'{name}_4_monthly.csv', index=False, float_format='%.2f')
    yearly.to_csv(save_at / f'{name}_5_yearly.csv', index=False, float_format='%.2f')
    total.to_csv(save_at / f'{name}_6_total.csv', index=False, float_format='%.2f')

    return hourly, daily, weekly, yearly, total

You can find SAR samples in the KDrive at:
- old: `hestiaai /Common documents/HestiaLabs/PDIO- Data/Driver Data/Guillaume data/Lemoine Guillaume/Lemoine_SAR_06.08.2022.zip`.
- new: `PersonalData.IO /Lemoine_12102022-20221110T164542Z-001.zip`

In [26]:
# Guillaume, SAR (new export): overlapping intervals are merged per status (summing uber_paid and distance_km),
# the Guillaume-specific filter is applied to the daily table, and facets are broken down by time_of_day and
# day_type. CSV files prefixed with 'sar' are written to data_folder / 'Guillaume' / 'results'.
_ = pipeline(load_sar(data_folder / 'Guillaume' / 'raw' / 'SAR (new)'),
             interval_logic=lambda t: merge_overlapping_intervals(t, {c: 'sum' for c in ['uber_paid', 'distance_km']}),
             filtering_logic=guillaume_filtering_logic,
             name='sar', time_properties=select(all_time_properties, keep=['time_of_day', 'day_type']),
             save_at=data_folder / 'Guillaume' / 'results')

In [27]:
# Kidane, SAR: overlapping intervals are merged per status (summing uber_paid and distance_km) and facets are
# broken down by every time property. CSV files prefixed with 'sar' go to data_folder / 'Kidane' / 'results'.
_ = pipeline(load_sar(data_folder / 'Kidane' / 'raw' / 'SAR'),
             interval_logic=lambda t: merge_overlapping_intervals(t, {c: 'sum' for c in ['uber_paid', 'distance_km']}),
             name='sar', time_properties=all_time_properties,
             save_at=data_folder / 'Kidane' / 'results')

In [28]:
# Brice, SAR: same configuration as the Kidane run; CSV files prefixed with 'sar' go to
# data_folder / 'Brice' / 'results'.
# NOTE(review): the output recorded after this cell is a ValueError ('Index contains duplicate entries,
# cannot reshape'), presumably raised by one of the pivots in pipeline - to be confirmed.
_ = pipeline(load_sar(data_folder / 'Brice' / 'raw' / 'SAR'),
             interval_logic=lambda t: merge_overlapping_intervals(t, {c: 'sum' for c in ['uber_paid', 'distance_km']}),
             name='sar', time_properties=all_time_properties,
             save_at=data_folder / 'Brice' / 'results')

ValueError: Index contains duplicate entries, cannot reshape

A Portal sample can be found on our KDrive at `hestiaai /Common documents/HestiaLabs/PDIO- Data/Driver Data/Guillaume data/Lemoine Guillaume/202207/Uber Data F0699B53.zip`

In [29]:
# Guillaume, Portal: no interval merging; the Guillaume-specific filter is applied to the daily table and
# facets are broken down by every time property. CSV files prefixed with 'portal' go to
# data_folder / 'Guillaume' / 'results'.
_ = pipeline(load_portal(data_folder / 'Guillaume' / 'raw' / 'Portal' / 'Driver'),
             name='portal', time_properties=all_time_properties,
             filtering_logic=guillaume_filtering_logic,
             save_at=data_folder / 'Guillaume' / 'results')

In [30]:
# Aria, Portal: no interval merging and no filtering; facets are broken down by the 'night' and 'day_type'
# properties. CSV files prefixed with 'portal' go to data_folder / 'Aria' / 'results'.
_ = pipeline(load_portal(data_folder / 'Aria' / 'raw' / 'Portal' / 'Driver'),
             name='portal', time_properties=select(all_time_properties, keep=['night', 'day_type']),
             save_at=data_folder / 'Aria' / 'results')