# KOTLITE Matching Algorithm

**Kotlite** (Angkot Elite) is an application that lets drivers pick up passengers who are travelling along the same route. The application is expected to ease existing traffic congestion through ridesharing: passengers get the experience of riding in a private car or taxi at a fairly cheap price, similar to the price of public transportation. A machine learning algorithm is used to match drivers and passengers who share the same route.

In this case the dataset used is the NYC Taxi Trip Duration dataset obtained from [Kaggle](https://www.kaggle.com/debanjanpaul/new-york-city-taxi-trip-distance-matrix). The dataset contains pickup and dropoff locations, which are used to match drivers and passengers. The existing data is manipulated and divided into driver data and passenger data.



In [None]:
import pandas as pd
import numpy as np
from geopy import distance
import datetime

import warnings
warnings.filterwarnings('ignore')

## Read the dataset

In [None]:
# Load the trip distance-matrix CSV from a hard-coded path and show summary
# statistics for every column (numeric and non-numeric alike).
df = pd.read_csv('/content/drive/Shareddrives/Brillante Workspace/ML Corner/Dataset/NYC_dataset/train_distance_matrix.csv')
df.describe(include='all')

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,gc_distance,trip_duration,google_distance,google_duration
count,39396,39396.0,39396,39396,39396.0,39396.0,39396.0,39396.0,39396.0,39396,39396.0,39396.0,38837.0,38837.0
unique,39396,,9934,9949,,,,,,2,,,,
top,id0216494,,1/6/2016 8:26,1/1/2016 1:25,,,,,,N,,,,
freq,1,,16,16,,,,,,39166,,,,
mean,,1.540512,,,1.691669,-73.971934,40.750858,-73.973048,40.752142,,2.166888,974.5813,4937.839097,821.085614
std,,0.498362,,,1.33244,0.041576,0.040643,0.045388,0.032743,,3.769393,15249.26,6184.438363,509.541421
min,,1.0,,,0.0,-74.345078,35.081532,-79.352837,40.436329,,0.0,2.0,0.0,0.0
25%,,1.0,,,1.0,-73.991318,40.737885,-73.990822,40.736804,,0.753846,359.0,1639.0,453.0
50%,,2.0,,,1.0,-73.981133,40.754929,-73.979263,40.755201,,1.281959,589.0,2733.0,702.0
75%,,2.0,,,2.0,-73.966316,40.768787,-73.962667,40.770051,,2.377999,949.0,5174.0,1073.0


In [None]:
# Preview the first rows transposed, so each original column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
id,id0190469,id1665586,id1078247,id3609443,id3888279
vendor_id,2,1,2,1,1
pickup_datetime,1/1/2016 0:00,1/1/2016 0:00,1/1/2016 0:01,1/1/2016 0:01,1/1/2016 0:01
dropoff_datetime,1/1/2016 0:14,1/1/2016 0:22,1/1/2016 0:03,1/1/2016 0:21,1/1/2016 0:05
passenger_count,5,1,1,2,1
pickup_longitude,-73.9817,-73.9851,-73.9733,-73.9931,-73.9823
pickup_latitude,40.7192,40.7472,40.7641,40.7526,40.7513
dropoff_longitude,-73.9388,-73.958,-73.9749,-73.9539,-73.9913
dropoff_latitude,40.8292,40.7175,40.7617,40.8165,40.7503
store_and_fwd_flag,N,N,N,N,N


## Select the features

In [None]:
def selected_features(dataframe):
  """Return only the columns the matching algorithm needs.

  Keeps the trip id, the pickup timestamp and the pickup/dropoff
  coordinates, in that order.

  Args:
    dataframe: DataFrame containing at least the six columns listed in
      ``select_col``.

  Returns:
    A new, independent DataFrame holding the six selected columns.

  Raises:
    KeyError: if any of the selected columns is missing from ``dataframe``.
  """
  select_col = ['id', 'pickup_datetime', 'pickup_latitude', 'pickup_longitude',
                'dropoff_latitude', 'dropoff_longitude']
  # Explicit copy: the result is later assigned to column-by-column, which on
  # a plain selection triggers pandas' SettingWithCopyWarning.
  return dataframe[select_col].copy()

In [None]:
# Reduce the dataset to the id, pickup time and coordinate columns, then
# summarise the remaining columns.
df = selected_features(df)
df.describe(include='all')

Unnamed: 0,id,pickup_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
count,39396,39396,39396.0,39396.0,39396.0,39396.0
unique,39396,9934,,,,
top,id0216494,1/6/2016 8:26,,,,
freq,1,16,,,,
mean,,,40.750858,-73.971934,40.752142,-73.973048
std,,,0.040643,0.041576,0.032743,0.045388
min,,,35.081532,-74.345078,40.436329,-79.352837
25%,,,40.737885,-73.991318,40.736804,-73.990822
50%,,,40.754929,-73.981133,40.755201,-73.979263
75%,,,40.768787,-73.966316,40.770051,-73.962667


In [None]:
# Preview the first rows after the column selection.
df.head()

Unnamed: 0,id,pickup_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,id0190469,1/1/2016 0:00,40.719158,-73.981743,40.829182,-73.938828
1,id1665586,1/1/2016 0:00,40.747166,-73.985085,40.717491,-73.958038
2,id1078247,1/1/2016 0:01,40.764072,-73.973335,40.761734,-73.974854
3,id3609443,1/1/2016 0:01,40.752632,-73.993103,40.81654,-73.953903
4,id3888279,1/1/2016 0:01,40.751331,-73.982292,40.75034,-73.991341


In [None]:
# Convert the pickup time from text to pandas timestamps so it can be
# compared and shifted with timedeltas later on.
# NOTE(review): the format is inferred by pandas; the raw values look like
# month/day/year ("1/6/2016 8:26") - confirm and consider an explicit format.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df.head()

Unnamed: 0,id,pickup_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,id0190469,2016-01-01 00:00:00,40.719158,-73.981743,40.829182,-73.938828
1,id1665586,2016-01-01 00:00:00,40.747166,-73.985085,40.717491,-73.958038
2,id1078247,2016-01-01 00:01:00,40.764072,-73.973335,40.761734,-73.974854
3,id3609443,2016-01-01 00:01:00,40.752632,-73.993103,40.81654,-73.953903
4,id3888279,2016-01-01 00:01:00,40.751331,-73.982292,40.75034,-73.991341


In [None]:
# Time-window experiment: take the pickup time of row 0 as the reference and
# keep the trips that start strictly after it and strictly less than one hour
# later.
dt = df.loc[0, 'pickup_datetime']
diff = datetime.timedelta(hours=1)
# `hasil` (Indonesian for "result") is the exclusive upper bound of the window.
hasil = dt + diff
new_filter = df.loc[(df['pickup_datetime'] < hasil) & (df['pickup_datetime'] > dt)]
new_filter

Unnamed: 0,id,pickup_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
2,id1078247,2016-01-01 00:01:00,40.764072,-73.973335,40.761734,-73.974854
3,id3609443,2016-01-01 00:01:00,40.752632,-73.993103,40.816540,-73.953903
4,id3888279,2016-01-01 00:01:00,40.751331,-73.982292,40.750340,-73.991341
5,id1210365,2016-01-01 00:01:00,40.801041,-73.965279,40.815170,-73.947479
6,id0924227,2016-01-01 00:01:00,40.759800,-73.970108,40.742989,-73.989357
...,...,...,...,...,...,...
378,id0679236,2016-01-01 00:59:00,40.767090,-73.921753,40.773075,-73.933899
379,id0055855,2016-01-01 00:59:00,40.746033,-73.984169,40.752743,-73.978569
380,id3296007,2016-01-01 00:59:00,40.757404,-73.982422,40.638847,-73.914459
381,id3141408,2016-01-01 00:59:00,40.733440,-73.993240,40.777748,-73.988762


## Split the dataset

Two random samples are drawn from the dataset: 20,000 rows serve as the dummy driver data and 5,000 rows serve as the dummy passenger data.

In [None]:
# Draw the driver sample first, then draw the passengers only from the rows
# that were not picked as drivers, so no trip appears in both sets (two
# independent samples of the same frame can overlap and produce trivial
# zero-distance matches). The fixed seed keeps the split reproducible.
rd_dummy = df.sample(20000, random_state=42)
ps_dummy = df.drop(rd_dummy.index).sample(5000, random_state=42)
# Renumber both samples from 0 so positional lookups are straightforward.
rd_dummy = rd_dummy.reset_index(drop=True)
ps_dummy = ps_dummy.reset_index(drop=True)

In [None]:
# Preview the first ten dummy driver rows.
rd_dummy.head(10)

Unnamed: 0,id,pickup_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,id3467693,2016-01-03 13:58:00,40.78014,-73.95726,40.768616,-73.958107
1,id0966527,2016-01-01 09:16:00,40.75872,-73.974327,40.75639,-73.988968
2,id0245628,2016-01-04 09:25:00,40.772732,-73.94635,40.779511,-73.955322
3,id1185140,2016-01-02 06:46:00,40.785439,-73.951286,40.767361,-73.964279
4,id0410600,2016-01-07 16:41:00,40.774429,-73.963348,40.776928,-73.963654
5,id2149626,2016-01-05 09:39:00,40.776924,-73.955292,40.775955,-73.944038
6,id1078639,2016-01-07 10:10:00,40.764584,-73.964378,40.782726,-73.977203
7,id1171607,2016-01-04 07:04:00,40.758545,-73.966026,40.764793,-73.970016
8,id0490103,2016-01-08 10:01:00,40.773155,-73.962166,40.784328,-73.954353
9,id0045351,2016-01-06 15:05:00,40.71476,-74.005562,40.746094,-74.00238


In [None]:
# Preview the first ten dummy passenger rows.
ps_dummy.head(10)

Unnamed: 0,id,pickup_datetime,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,id1354242,2016-01-05 20:11:00,40.745205,-73.991592,40.707607,-73.951836
1,id3913091,2016-01-06 22:07:00,40.778561,-73.977692,40.731049,-73.978355
2,id2136107,2016-01-03 14:04:00,40.64444,-73.78244,40.611366,-73.750885
3,id2933181,2016-01-04 05:53:00,40.78368,-73.980118,40.778381,-73.982178
4,id2627922,2016-01-05 22:06:00,40.707619,-74.004379,40.777092,-73.975853
5,id0269078,2016-01-04 05:21:00,40.761784,-73.97802,40.648945,-73.783066
6,id1956947,2016-01-04 18:16:00,40.73344,-74.004509,40.73653,-73.998871
7,id3690703,2016-01-08 17:13:00,40.749538,-73.987831,40.737766,-73.992317
8,id2729472,2016-01-04 21:31:00,40.769501,-73.863319,40.75861,-73.990005
9,id3090411,2016-01-08 18:56:00,40.789829,-73.975166,40.765209,-73.981468


## Create Algorithm

The matching algorithm in Kotlite is based on the Nearest Neighbor algorithm. We chose this algorithm because the application's data is very dynamic, so we need a model that can keep matching passengers with drivers who have the same route, and recommending those drivers, as the data changes.

In [None]:
def l2_distances(point1, point2):
  """Euclidean (L2) distance between two points.

  Each position of ``point1`` is paired with the same position of
  ``point2``; the squared differences are accumulated and the square root
  of the total is returned.
  """
  squared_total = 0
  for idx, coord in enumerate(point1):
    gap = coord - point2[idx]
    squared_total += gap ** 2
  return squared_total ** 0.5

def l1_distances(point1, point2):
  """Manhattan (L1) distance between two points.

  Each position of ``point1`` is paired with the same position of
  ``point2`` and the absolute differences are added up.
  """
  absolute_total = 0
  for idx, coord in enumerate(point1):
    gap = coord - point2[idx]
    absolute_total += abs(gap)
  return absolute_total

In [None]:
def nearest(data, query, k, dist='l2'):
  """Return the k candidate trips closest to a passenger query.

  Only rows of ``data`` whose ``pickup_datetime`` lies strictly within
  30 minutes before or after the query time are considered. For each of
  them the chosen metric is applied between the row values from column 2
  onwards and the query coordinates.

  Args:
    data: DataFrame with a 'pickup_datetime' column. Column 0 is read as
      the trip id and columns 2 onwards as the coordinates compared with
      the query.
    query: sequence laid out as
      [datetime, latpick, longpick, latdrop, longdrop].
    k: maximum number of neighbours to return.
    dist: distance metric, either 'l1' or 'l2'.

  Returns:
    A list of at most ``k`` ``(distance, id)`` tuples in ascending order.

  Raises:
    ValueError: if ``dist`` is neither 'l1' nor 'l2'.
  """
  # Reject unknown metrics up front; previously a typo in `dist` silently
  # produced an empty recommendation list.
  if dist == 'l1':
    metric = l1_distances
  elif dist == 'l2':
    metric = l2_distances
  else:
    raise ValueError("dist must be 'l1' or 'l2', got %r" % (dist,))

  # Keep only trips that start within +/- 30 minutes of the query time
  # (both bounds exclusive).
  query_time = query[0]
  window = datetime.timedelta(minutes=30)
  up = query_time + window
  down = query_time - window
  in_window = (data['pickup_datetime'] < up) & (data['pickup_datetime'] > down)
  candidates = data.loc[in_window].to_numpy()

  query_coords = query[1:]
  distances_and_id = [(metric(row[2:], query_coords), row[0])
                      for row in candidates]

  # Tuples sort by distance first; equal distances fall back to the id.
  return sorted(distances_and_id)[:k]

In [None]:
# Build a query from one random passenger row: flatten the row to 1-D and drop
# the leading id, leaving [datetime, latpick, longpick, latdrop, longdrop].
test = ps_dummy.sample(1).to_numpy().reshape((-1))[1:]
test

array([Timestamp('2016-01-05 17:00:00'), 40.78491211, -73.9465332,
       40.7806015, -73.94616699], dtype=object)

In [None]:
# Ask for up to 50 driver trips closest to the passenger query, ranked with
# the L1 metric.
recomendation = nearest(rd_dummy, test, 50, dist='l1')
recomendation

[(0.02167131000001632, 'id2187598'),
 (0.021987920000000827, 'id0213482'),
 (0.022869119999981535, 'id0902235'),
 (0.03194428999999843, 'id2896174'),
 (0.03268050999999872, 'id2583651'),
 (0.034656529999999464, 'id1258373'),
 (0.03748703000001541, 'id0781466'),
 (0.041904449999996984, 'id2021927'),
 (0.04351044999999232, 'id0010154'),
 (0.043567660000014996, 'id3870257'),
 (0.04606247000000252, 'id0412255'),
 (0.047676100000003885, 'id2402880'),
 (0.051120770000004256, 'id2989754'),
 (0.05180358999999157, 'id0241185'),
 (0.05340958000000029, 'id0429418'),
 (0.05457306999998934, 'id1998055'),
 (0.05669785999999988, 'id2179796'),
 (0.06034469000000797, 'id0162147'),
 (0.06070327999999847, 'id3332917'),
 (0.062465670000001694, 'id2985404'),
 (0.06262208000000413, 'id1232342'),
 (0.06377412000000504, 'id3520043'),
 (0.06399155000001144, 'id1401035'),
 (0.06483079000000203, 'id2153880'),
 (0.0649109000000152, 'id3162420'),
 (0.06550217000000202, 'id0661127'),
 (0.06759644000000975, 'id25621

In [None]:
# Look up the full trip record of the second-ranked recommendation (index 1;
# each entry is a (distance, id) tuple) and flatten it to 1-D.
df[df['id'] == recomendation[1][1]].to_numpy().reshape((-1))

array(['id0213482', Timestamp('2016-01-05 17:12:00'), 40.77893829,
       -73.95375824, 40.78258514, -73.95297241], dtype=object)

## Add another approach

In [None]:
def approaching2(start_point_driver, end_point_driver, 
                 pick_point_passenger, drop_point_passenger,
                 max_radius_km=2):
  """Decide whether a driver's trip is close enough to a passenger's trip.

  Measures the distance between the two start points and between the two
  end points, and recommends the driver only when both are below
  ``max_radius_km``.

  Args:
    start_point_driver: driver's start point, in a form accepted by
      ``distance.distance`` (callers in this file pass
      ``(latitude, longitude)`` tuples).
    end_point_driver: driver's end point.
    pick_point_passenger: passenger's pickup point.
    drop_point_passenger: passenger's dropoff point.
    max_radius_km: exclusive upper bound, in kilometres, applied to both
      the start-to-pickup and end-to-dropoff distances. Defaults to 2.

  Returns:
    A tuple ``(rad_start, rad_end, recommend)``: the two distances in
    kilometres and a boolean telling whether both are within the radius.
  """
  # The trip lengths of the driver and the passenger used to be computed here
  # but were never used in the decision, so those two extra distance
  # calculations per candidate have been dropped.

  # Distance between the driver's and the passenger's start and end points.
  rad_start = distance.distance(start_point_driver, pick_point_passenger).km
  rad_end = distance.distance(end_point_driver, drop_point_passenger).km

  recommend = rad_start < max_radius_km and rad_end < max_radius_km
  return rad_start, rad_end, recommend

In [None]:
def recomendations(data, recommend, query):
  """Filter nearest-neighbour candidates by real-world distance.

  For every candidate the driver's trip is looked up in ``data`` by id and
  passed to ``approaching2`` together with the passenger's pickup and
  dropoff points; only candidates that ``approaching2`` recommends are kept.

  Args:
    data: DataFrame with an 'id' column; each matching row is read
      positionally as [id, datetime, start lat, start long, end lat,
      end long].
    recommend: iterable of ``(distance, id)`` tuples; only the id at
      index 1 is used.
    query: sequence laid out as
      [datetime, latpick, longpick, latdrop, longdrop].

  Returns:
    A list of ``(range_start, range_end, id)`` tuples sorted in ascending
    order, i.e. by distance between the start points first.
  """
  # The passenger's points are the same for every candidate: build them once.
  pass_pick = (query[1], query[2])
  pass_drop = (query[3], query[4])

  recommendation_by_range = []
  for rc in recommend:
    driver_row = data[data['id'] == rc[1]].to_numpy().reshape((-1))

    driv_start = (driver_row[2], driver_row[3])
    driv_end = (driver_row[4], driver_row[5])

    range_start, range_end, recommendation = approaching2(
        driv_start, driv_end, pass_pick, pass_drop)

    if recommendation:
      recommendation_by_range.append((range_start, range_end, driver_row[0]))

  return sorted(recommendation_by_range)

In [None]:
# Re-rank the nearest-neighbour candidates by distance and keep only the
# drivers that approaching2 recommends.
recomendations(rd_dummy, recomendation, test)

[(0.5065252508311766, 0.9976090012087289, 'id2187598'),
 (0.9011326795268596, 0.615247943457385, 'id0213482'),
 (0.9050445720065832, 0.7246892447442548, 'id0902235'),
 (1.14806087070714, 1.0596251710782245, 'id2896174'),
 (1.4018615565084465, 1.5324284020038106, 'id0010154'),
 (1.4740663592808647, 1.1078745134909163, 'id1258373'),
 (1.7287372537661505, 1.1192485581117888, 'id2021927'),
 (1.7289308852797989, 0.8631603730648518, 'id0781466'),
 (1.9813499347859274, 0.7053365002765035, 'id2583651')]

In [None]:
# Inspect the full record of one recommended driver trip, by its id.
df[df['id'] == 'id0213482'].to_numpy()

array([['id0213482', Timestamp('2016-01-05 17:12:00'), 40.77893829,
        -73.95375824, 40.78258514, -73.95297241]], dtype=object)

In [None]:
# Show the passenger query again for comparison with the driver record.
test

array([Timestamp('2016-01-05 17:00:00'), 40.78491211, -73.9465332,
       40.7806015, -73.94616699], dtype=object)

# Result

The test process is carried out using Google Maps. This is done to determine how effective and accurate the algorithm is at recommending drivers to the passenger. The purpose of this algorithm is to provide recommendations that are not burdensome for the driver.