In [3]:
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

### Load data

In [4]:
df = pd.read_csv(
    "../data/data_scientist_case.csv",
    low_memory=False,
    thousands=",",
    parse_dates=[
        "segment_datetime",
        "published_date",
        "signup_date",
    ],
    dtype={"trip_id": str, "segment_id": str}
)

df = df.sort_values(by=["trip_id", "segment_id"])
df = df.set_index(["trip_id", "segment_id"])

df["from"] = list(zip(df["from_lat"], df["from_lon"]))
df["to"] = list(zip(df["to_lat"], df["to_lon"]))

df.drop(["from_lat", "from_lon", "to_lat", "to_lon"], axis=1, inplace=True)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,driver_id,segment_datetime,published_date,signup_date,fixed_signup_country,is_main_segment,unit_seat_price_eur,seat_offered_count,seat_left_count,confirmed_seat_count,segment_distance_km,is_comfort,is_auto_accept_mode,publication_site_id,from,to
trip_id,segment_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
101237714,578145698,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,6.60,1,1,0,93,False,False,5000,"(42.5462, 3.02291)","(43.1843, 3.00308)"
101237714,578145704,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,10.56,1,1,0,153,False,False,5000,"(42.5462, 3.02291)","(43.2122, 2.35366)"
101237714,578145710,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,18.48,1,1,0,268,False,False,5000,"(42.5462, 3.02291)","(42.6887, 2.89483)"
101237714,578145716,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,31.68,1,0,0,474,False,False,5000,"(42.5462, 3.02291)","(43.6047, 1.44421)"
101237714,578145722,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,44.88,1,0,0,719,False,False,5000,"(42.5462, 3.02291)","(44.8378, -0.57918)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81503790,483346561,3899249,2015-05-09 13:20:00,2015-01-13,2012-06-17,FR,False,27.72,2,0,0,295,False,False,1000,"(44.9334, 4.89236)","(47.322, 5.04148)"
81503790,483346567,3899249,2015-05-09 13:20:00,2015-01-13,2012-06-17,FR,False,44.88,2,0,0,489,False,False,1000,"(44.9334, 4.89236)","(48.1724, 6.44959)"
81503790,483346573,3899249,2015-05-09 14:25:00,2015-01-13,2012-06-17,FR,False,17.16,2,0,0,195,False,False,1000,"(45.7366, 4.8163)","(47.322, 5.04148)"
81503790,483346579,3899249,2015-05-09 14:25:00,2015-01-13,2012-06-17,FR,False,34.32,2,0,0,389,False,False,1000,"(45.7366, 4.8163)","(48.1724, 6.44959)"


### Segments

If a driver is travelling from Paris to Toulouse with a stopover in Tours and a stopover in Limoge, there will be 6 segments:

* Paris>Tours (direct)
* Tours>Limoge (direct)
* Limoge>Toulouse (direct)
* Paris>Limoge (indirect)
* Paris>Toulouse (indirect)
* Tours>Toulouse (indirect)

A segment is either a direct trip between two locations or an indirect trip composed of constinuent segments. I suspect that whether a segment is direct or indirect would be a good predictor of success. The number of constituent segments will likely also play a role, as well as the duration of stops.

In [102]:
df.loc["101,237,714"]

Unnamed: 0_level_0,driver_id,segment_datetime,published_date,signup_date,fixed_signup_country,is_main_segment,unit_seat_price_eur,seat_offered_count,seat_left_count,confirmed_seat_count,segment_distance_km,is_comfort,is_auto_accept_mode,publication_site_id,from,to
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
578145698,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,6.6,1,1,0,93,False,False,5000,"(42.5462, 3.02291)","(43.1843, 3.00308)"
578145704,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,10.56,1,1,0,153,False,False,5000,"(42.5462, 3.02291)","(43.2122, 2.35366)"
578145710,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,18.48,1,1,0,268,False,False,5000,"(42.5462, 3.02291)","(42.6887, 2.89483)"
578145716,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,31.68,1,0,0,474,False,False,5000,"(42.5462, 3.02291)","(43.6047, 1.44421)"
578145722,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,44.88,1,0,0,719,False,False,5000,"(42.5462, 3.02291)","(44.8378, -0.57918)"
578145728,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,False,52.8,1,0,0,845,False,False,5000,"(42.5462, 3.02291)","(45.6484, 0.156237)"
578145734,8370779,2015-05-03 10:00:08,2015-02-19,2013-08-17,FR,True,59.4,1,0,0,963,False,False,5000,"(42.5462, 3.02291)","(46.5802, 0.340375)"
578145740,8370779,2015-05-03 11:00:08,2015-02-19,2013-08-17,FR,False,3.96,1,1,0,60,False,False,5000,"(43.1843, 3.00308)","(43.2122, 2.35366)"
578145746,8370779,2015-05-03 11:00:08,2015-02-19,2013-08-17,FR,False,11.88,1,1,0,174,False,False,5000,"(43.1843, 3.00308)","(42.6887, 2.89483)"
578145752,8370779,2015-05-03 11:00:08,2015-02-19,2013-08-17,FR,False,25.08,1,0,0,381,False,False,5000,"(43.1843, 3.00308)","(43.6047, 1.44421)"


In [107]:
from collections import defaultdict
from itertools import tee

from time import perf_counter


def compute_segment_features(trip_id):
    t1 = perf_counter()
    trip_data = df.loc[trip_id]

    dic = defaultdict(list)
    point_pair_to_segment = {}

    points = set(trip_data["from"])
    points.add(trip_data.iloc[-1]["to"])

    for segment_id, row in trip_data.iterrows():
        point_pair_to_segment[row["from"] + row["to"]] = segment_id
        dic[row["from"]].append(row["to"])

    time_first_for = perf_counter() - t1

    t1 = perf_counter()
    dic = {k: len(dic[k]) for k in points}
    sorted_locations = sorted(list(points), key=lambda x: dic[x], reverse=True)
    time_sorting = perf_counter() - t1

    t1 = perf_counter()
    seg_to_seg = defaultdict(list)
    for i in range(len(sorted_locations)):
        cur_segments = []
        for j in range(i + 1, len(sorted_locations)):
            cur_segments.append(point_pair_to_segment[(sorted_locations[j-1] + sorted_locations[j])])
            seg_to_seg[point_pair_to_segment[(sorted_locations[i] + sorted_locations[j])]] = [x for x in cur_segments]

    time_second_for = perf_counter() - t1
    
    t1 = perf_counter()
    sorted_core_segments = []
    for i in range(len(sorted_locations) - 1):
        sorted_core_segments.append(point_pair_to_segment[sorted_locations[i] + sorted_locations[i + 1]])
    time_third_for = perf_counter() - t1

    t1 = perf_counter()
    segment_durations = {}
    for i in range(len(sorted_core_segments) - 1):
        s1 = sorted_core_segments[i]
        s2 = sorted_core_segments[i + 1]
        duration = trip_data.loc[s2, "segment_datetime"] - trip_data.loc[s1, "segment_datetime"]
        duration = duration.seconds / 3600
        if duration != 0:
            segment_durations[s1] = duration
        else:
            segment_durations[s1] = None

    time_fourth_for = perf_counter() - t1

    t1 = perf_counter()
    for segment in trip_data.index:
        if segment not in segment_durations:
            subsegs = seg_to_seg[segment]
            s = 0
            all_known = True
            for seg in subsegs:
                if seg in segment_durations and segment_durations[seg] != None:
                    s += segment_durations[seg]
                else:
                    all_known = False
                    break
            if all_known:
                segment_durations[segment] = s
    time_fifth_for = perf_counter() - t1

    t1 = perf_counter()
    segment_speeds = {}
    for segment in segment_durations:
        duration = segment_durations[segment]
        if duration != None:
            segment_speeds[segment] = trip_data.loc[segment, "segment_distance_km"] / segment_durations[segment]
        else:
            segment_speeds[segment] = None
    time_sixth_for = perf_counter() - t1
    
    t1 = perf_counter()
    segment_lengths = {}
    for segment in seg_to_seg:
        segment_lengths[segment] = len(seg_to_seg[segment])
    time_seventh_for = perf_counter() - t1

    t1 = perf_counter()
    res = {}
    for segment in segment_lengths:
        res[segment] = {}
        res[segment]["length"] = segment_lengths[segment]
        res[segment]["duration"] = segment_durations.get(segment)
        res[segment]["speed"] = segment_speeds.get(segment)
    time_eighth_for = perf_counter() - t1

    return res, time_first_for, time_sorting, time_second_for, time_third_for, time_fourth_for, time_fifth_for, time_sixth_for, time_seventh_for, time_eighth_for

In [112]:
t1s = 0
t2s = 0
t3s = 0
t4s = 0
t5s = 0
t6s = 0
t7s = 0
t8s = 0
t9s = 0

for i in range(100):
    features, t1, t2, t3, t4, t5, t6, t7, t8, t9 = compute_segment_features("101,237,714")
    t1s += t1
    t2s += t2
    t3s += t3
    t4s += t4
    t5s += t5
    t6s += t6
    t7s += t7
    t8s += t8
    t9s += t9

print(t1s / 100)
print(t2s / 100)
print(t3s / 100)
print(t4s / 100)
print(t5s / 100)
print(t6s / 100)
print(t7s / 100)
print(t8s / 100)
print(t9s / 100)

0.008199907109810738
9.34060983126983e-06
3.3017789683071896e-05
2.869750460376963e-06
0.000409587049507536
2.2214271157281474e-05
0.0003632804600056261
4.976620402885601e-06
1.2944671470904723e-05


In [117]:
from tqdm import tqdm
from multiprocessing import Pool

pool = Pool(10)

tt = 0
for i in tqdm(range(100)):
    t1 = perf_counter()
    segment_features = pool.map(compute_segment_features, df.index.levels[0][:100])
    t = perf_counter() - t1
    tt += t

print(tt / 100)

100%|██████████████████████████████████████████████████████████████| 100/100 [00:56<00:00,  1.75it/s]

0.5688167269705445





In [115]:
segment_features

[({'578,145,698': {'length': 1, 'duration': 1.0, 'speed': 93.0},
   '578,145,704': {'length': 2,
    'duration': 1.6666666666666665,
    'speed': 91.80000000000001},
   '578,145,710': {'length': 3, 'duration': 2.75, 'speed': 97.45454545454545},
   '578,145,716': {'length': 4,
    'duration': 4.666666666666667,
    'speed': 101.57142857142857},
   '578,145,722': {'length': 5,
    'duration': 6.916666666666667,
    'speed': 103.95180722891565},
   '578,145,728': {'length': 6,
    'duration': 8.333333333333334,
    'speed': 101.39999999999999},
   '578,145,734': {'length': 7, 'duration': None, 'speed': None},
   '578,145,740': {'length': 1, 'duration': 0.6666666666666666, 'speed': 90.0},
   '578,145,746': {'length': 2, 'duration': 1.75, 'speed': 99.42857142857143},
   '578,145,752': {'length': 3,
    'duration': 3.666666666666667,
    'speed': 103.9090909090909},
   '578,145,758': {'length': 4,
    'duration': 5.916666666666667,
    'speed': 105.80281690140845},
   '578,145,764': {'length

In [8]:
ms["segment_datetime"] = pd.to_datetime(ms["segment_datetime"])
ms["published_date"] = pd.to_datetime(ms["published_date"])
ms["signup_date"] = pd.to_datetime(ms["signup_date"])

In [9]:
ms["hours_before_start"] = (ms["segment_datetime"] - ms["published_date"]).apply(lambda x: x.seconds/3600)

In [10]:
ms["days_since_signup"] = (ms["published_date"] - ms["signup_date"]).apply(lambda x: x.days)

In [11]:
one_hot = pd.get_dummies(ms["fixed_signup_country"])
ms.drop("fixed_signup_country", axis=1, inplace=True)
ms = ms.join(one_hot)

one_hot = pd.get_dummies(ms["publication_site_id"])
ms.drop("publication_site_id", axis=1, inplace=True)
ms = ms.join(one_hot)

In [12]:
import numpy as np

from sklearn.cluster import MiniBatchKMeans

coords = np.vstack((ms[['from_lat', 'from_lon']].values,
                    ms[['to_lat', 'to_lon']].values))

sample_ind = np.random.permutation(len(coords))
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])

ms.loc[:, 'from_cluster'] = kmeans.predict(ms[['from_lat', 'from_lon']].values)
ms.loc[:, 'to_cluster'] = kmeans.predict(ms[['to_lat', 'to_lon']].values)

In [13]:
min(ms["segment_datetime"]), max(ms["segment_datetime"])

(Timestamp('2015-04-15 00:00:00'), Timestamp('2015-10-12 22:50:00'))

In [14]:
cluster_in_counts = dict(ms["to_cluster"].value_counts())
cluster_out_counts = dict(ms["from_cluster"].value_counts())

In [15]:
ms["from_cluster_in_count"] = ms["from_cluster"].apply(lambda x: cluster_in_counts[x]) / len(ms)
ms["from_cluster_out_count"] = ms["from_cluster"].apply(lambda x: cluster_out_counts[x]) / len(ms)
ms["to_cluster_in_count"] = ms["to_cluster"].apply(lambda x: cluster_in_counts[x]) / len(ms)
ms["to_cluster_out_count"] = ms["to_cluster"].apply(lambda x: cluster_out_counts[x]) / len(ms)

In [16]:
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

In [17]:
ms["unit_seat_price_eur"] = ms["unit_seat_price_eur"].apply(lambda x: float(x.replace(",", "")))
ms["segment_distance_km"] = ms["segment_distance_km"].apply(lambda x: float(x.replace(",", "")))

In [18]:
ms["haversine_distance"] = haversine_array(ms["from_lat"], ms["from_lon"], ms["to_lat"], ms["to_lon"])
ms["over_lenghtification"] = ms["segment_distance_km"] / ms["haversine_distance"]

### Datetime Features

In [19]:
ms.loc[:, 'pickup_weekday'] = ms['segment_datetime'].dt.weekday
ms.loc[:, 'pickup_hour_weekofyear'] = ms['segment_datetime'].dt.weekofyear
ms.loc[:, 'pickup_hour'] = ms['segment_datetime'].dt.hour
ms.loc[:, 'pickup_minute'] = ms['segment_datetime'].dt.minute
ms.loc[:, 'pickup_dt'] = (ms['segment_datetime'] - ms['segment_datetime'].min()).dt.total_seconds()
ms.loc[:, 'pickup_week_hour'] = ms['pickup_weekday'] * 24 + ms['pickup_hour']

  


In [20]:
ms.loc[:, "target"] = (ms["seat_offered_count"] > ms["seat_left_count"]).astype(int)

In [21]:
ms.drop(["driver_id", "trip_id", "segment_id", "confirmed_seat_count", "seat_left_count"], inplace=True, axis=1)

In [22]:
ms.drop(["is_main_segment", "segment_datetime", "published_date", "signup_date"], inplace=True, axis=1)

In [23]:
ms["target"].value_counts()

1    1693951
0    1685827
Name: target, dtype: int64

In [37]:
ms = ms[~ms.isin([np.nan, np.inf, -np.inf]).any(1)]

In [38]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(ms, test_size=0.2)

In [39]:
X_train, Y_train = train.values[:, :-1], train.values[:, -1]
X_test, Y_test = test.values[:, :-1], test.values[:, -1]

In [44]:
from xgboost import XGBClassifier

model = XGBClassifier(
    eval_metric="logloss",
    use_label_encoder=False,
    n_estimators=10
)
model.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [45]:
Y_pred = model.predict(X_test)

In [46]:
from sklearn.metrics import classification_report

print(classification_report(Y_test.astype(float), Y_pred.astype(float)))

              precision    recall  f1-score   support

         0.0       0.66      0.63      0.64    336894
         1.0       0.65      0.67      0.66    339049

    accuracy                           0.65    675943
   macro avg       0.65      0.65      0.65    675943
weighted avg       0.65      0.65      0.65    675943



In [47]:
pd.DataFrame(zip(train.columns[:-1], model.feature_importances_)).set_index(0).sort_values(by=1, ascending=False).plot.bar(figsize=(12, 8))

KeyboardInterrupt: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

points = set(froms + tos)
x = []
y = []
for p in points:
    x.append(p[0])
    y.append(p[1])

plt.figure(figsize=(12, 8), dpi=80)
plt.scatter(x, y, s=0.1)
plt.show()

In [None]:
ms