# Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import datetime
import geopy.distance
from geopy.distance import geodesic

# Знакомство с данными

In [2]:
# Load the raw orders table (order id, delivery method, publication time, coordinates).
orders_csv_path = '/datasets/orders_coordinates.csv'
df = pd.read_csv(orders_csv_path)

In [3]:
# Preview the loaded frame; pandas abbreviates long frames to the first and last rows.
df

Unnamed: 0,id,delivery_method,publication_time,provider_lat,provider_lon,recipient_lat,recipient_lon
0,456731,0,2019-09-01 10:41:11,43.102578,131.949542,43.127199,131.903531
1,456732,0,2019-09-01 16:03:31,43.102578,131.949542,43.154148,131.908536
2,456733,0,2019-09-01 11:13:00,43.101090,131.945080,43.166792,131.908541
3,456734,0,2019-09-01 10:46:09,43.109655,131.878144,43.115547,131.883194
4,456735,0,2019-09-01 16:06:01,43.113975,131.882475,43.117655,131.936357
...,...,...,...,...,...,...,...
125161,581892,0,2019-09-15 10:39:05,43.101618,131.940039,43.126637,131.939889
125162,581893,0,2019-09-15 10:39:43,43.116206,131.881526,43.153467,131.925947
125163,581894,1,2019-09-15 11:09:34,43.116206,131.881526,43.118328,131.886140
125164,581895,0,2019-09-15 11:18:44,43.120917,131.923030,43.119716,131.945241


In [4]:
# Summary of column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125166 entries, 0 to 125165
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                125166 non-null  int64  
 1   delivery_method   125166 non-null  int64  
 2   publication_time  125166 non-null  object 
 3   provider_lat      125166 non-null  float64
 4   provider_lon      125166 non-null  float64
 5   recipient_lat     125166 non-null  float64
 6   recipient_lon     125166 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 6.7+ MB


In [5]:
# Number of rows that fully duplicate an earlier row.
duplicate_rows = df.duplicated().sum()
print("Дублирующихся строк в таблице DF:", duplicate_rows)

Дублирующихся строк в таблице DF: 0


In [6]:
# Total number of missing cells across all columns.
missing_cells = df.isna().sum().sum()
print("Пропусков в таблице DF:", missing_cells)

Пропусков в таблице DF: 0


# Предобработка данных

In [7]:
# Convert the timestamp strings to datetime64 so the .dt accessor can be used below.
publication_format = "%Y-%m-%d %H:%M:%S"
df['publication_time'] = pd.to_datetime(df['publication_time'], format=publication_format)

# Курьеры с 16:00 до 20:00: заказы, опубликованные в этом интервале (включительно).

In [8]:
# Keep orders published between 16:00:00 and 20:00:00 inclusive, on any date.
# The time of day is compared numerically (whole seconds since midnight) instead of
# formatting every timestamp to a string twice and comparing the strings.
WINDOW_START_SEC = 16 * 3600
WINDOW_END_SEC = 20 * 3600

published = df['publication_time'].dt
seconds_since_midnight = published.hour * 3600 + published.minute * 60 + published.second

# .copy() makes df_filtered an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
df_filtered = df[seconds_since_midnight.between(WINDOW_START_SEC, WINDOW_END_SEC)].copy()

In [9]:
# Inspect the orders that remain after the time-of-day filter.
df_filtered

Unnamed: 0,id,delivery_method,publication_time,provider_lat,provider_lon,recipient_lat,recipient_lon
1,456732,0,2019-09-01 16:03:31,43.102578,131.949542,43.154148,131.908536
4,456735,0,2019-09-01 16:06:01,43.113975,131.882475,43.117655,131.936357
31,456762,0,2019-09-01 17:24:50,43.124748,131.906214,43.119799,131.958144
34,456765,0,2019-09-01 17:24:52,43.136746,131.934127,43.121547,131.927556
35,456766,0,2019-09-01 16:04:21,43.115839,131.886799,43.088782,131.907125
...,...,...,...,...,...,...,...
125101,581832,0,2019-09-15 19:53:40,43.100421,131.903334,43.105664,131.957950
125102,581833,0,2019-09-15 19:53:35,43.101618,131.940039,43.150130,131.916641
125105,581836,0,2019-09-15 19:59:55,43.115840,131.886737,43.125029,131.935339
125106,581837,0,2019-09-15 19:59:24,43.117605,131.881390,43.123801,131.920889


# Мобильные курьеры с конечной точкой доставки заказа в радиусе 1500 метров (1,5 км) от точки отправления.

In [10]:
def distancer(row):
    """Return the geodesic distance, in kilometres, from provider to recipient.

    ``row`` must support key lookup for ``provider_lat``, ``provider_lon``,
    ``recipient_lat`` and ``recipient_lon`` (for example, a DataFrame row).
    Each point is handed to geopy as a ``(latitude, longitude)`` pair.
    """
    pickup_point = row['provider_lat'], row['provider_lon']
    dropoff_point = row['recipient_lat'], row['recipient_lon']
    trip = geopy.distance.geodesic(pickup_point, dropoff_point)
    return trip.km

# Pickup-to-drop-off distance for every order, in kilometres (distancer returns .km).
# distancer is passed to apply directly (the wrapping lambda added nothing), and assign()
# builds a new frame instead of writing a column into what may be a slice of df.
df_filtered = df_filtered.assign(
    pickup_dropoff_distance=df_filtered.apply(distancer, axis=1)
)
df_filtered

Unnamed: 0,id,delivery_method,publication_time,provider_lat,provider_lon,recipient_lat,recipient_lon,pickup_dropoff_distance
1,456732,0,2019-09-01 16:03:31,43.102578,131.949542,43.154148,131.908536,6.630014
4,456735,0,2019-09-01 16:06:01,43.113975,131.882475,43.117655,131.936357,4.404345
31,456762,0,2019-09-01 17:24:50,43.124748,131.906214,43.119799,131.958144,4.261632
34,456765,0,2019-09-01 17:24:52,43.136746,131.934127,43.121547,131.927556,1.771169
35,456766,0,2019-09-01 16:04:21,43.115839,131.886799,43.088782,131.907125,3.431215
...,...,...,...,...,...,...,...,...
125101,581832,0,2019-09-15 19:53:40,43.100421,131.903334,43.105664,131.957950,4.483985
125102,581833,0,2019-09-15 19:53:35,43.101618,131.940039,43.150130,131.916641,5.715888
125105,581836,0,2019-09-15 19:59:55,43.115840,131.886737,43.125029,131.935339,4.084921
125106,581837,0,2019-09-15 19:59:24,43.117605,131.881390,43.123801,131.920889,3.287350


In [11]:
# Selection thresholds. The section heading gives the radius in metres (1500 m),
# while pickup_dropoff_distance is computed in kilometres, hence 1.5.
MAX_DELIVERY_RADIUS_KM = 1.5
# NOTE(review): presumably delivery_method 0 denotes mobile couriers — confirm against the data description.
TARGET_DELIVERY_METHOD = 0

# "@name" inside query() refers to the Python variables defined above.
df_filtered = df_filtered.query(
    'pickup_dropoff_distance <= @MAX_DELIVERY_RADIUS_KM '
    'and delivery_method == @TARGET_DELIVERY_METHOD'
)

In [12]:
# Inspect the orders that remain after the distance and delivery-method filter.
df_filtered

Unnamed: 0,id,delivery_method,publication_time,provider_lat,provider_lon,recipient_lat,recipient_lon,pickup_dropoff_distance
38,456769,0,2019-09-01 16:22:21,43.124748,131.906214,43.129405,131.912026,0.700958
39,456770,0,2019-09-01 16:24:15,43.117587,131.883531,43.116623,131.897133,1.112180
40,456771,0,2019-09-01 16:26:30,43.117470,131.891722,43.120455,131.907257,1.307060
41,456772,0,2019-09-01 16:54:53,43.100621,131.894731,43.097125,131.897428,0.446153
57,456788,0,2019-09-01 18:29:35,43.111804,131.878957,43.119927,131.892575,1.429258
...,...,...,...,...,...,...,...,...
125061,581792,0,2019-09-15 18:55:08,43.127710,131.938934,43.130153,131.949599,0.909265
125073,581804,0,2019-09-15 19:25:00,43.117605,131.881390,43.118331,131.896355,1.220591
125074,581805,0,2019-09-15 19:12:43,43.102632,131.917289,43.094574,131.911369,1.016691
125080,581811,0,2019-09-15 19:23:56,43.092384,131.857471,43.096058,131.862344,0.569210


In [13]:
# Show the ids of the orders that passed all of the filters above.
display(df_filtered['id'])

38        456769
39        456770
40        456771
41        456772
57        456788
           ...  
125061    581792
125073    581804
125074    581805
125080    581811
125107    581838
Name: id, Length: 7164, dtype: int64