# Data Preparation

## Test and train orders info processing

In [67]:
import numpy as np
import pandas as pd

Read csv files

In [68]:
orders_test_dfrm = pd.read_csv('final_test.csv')
orders_test_dfrm

Unnamed: 0,Id,running_time,route_distance_km
0,6198,2022-01-24 03:38:30,4.744
1,6417,2022-01-24 03:45:51,6.279
2,7054,2022-01-24 03:52:14,3.934
3,9628,2022-01-24 04:03:21,5.959
4,10283,2022-01-24 04:01:12,7.028
...,...,...,...
995,525706,2022-01-24 18:46:17,2.897
996,526604,2022-01-24 18:46:44,3.482
997,527213,2022-01-24 18:47:25,3.486
998,527520,2022-01-24 18:52:01,0.703


In [120]:
nodes_test_dfrm = pd.read_csv('nodes_test.csv')
nodes_test_dfrm

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,6198,8952394129,8952394128,138.795710,37.0
1,6198,2059503754,4548172320,95.273001,33.0
2,6198,2059504508,2059503754,137.647881,35.0
3,6198,1570776534,1977018578,4.383708,30.0
4,6198,1977018580,1977018576,24.195593,31.0
...,...,...,...,...,...
102491,527850,290891780,5957304897,40.933260,45.0
102492,527850,5957304897,3902949792,63.421598,34.0
102493,527850,3902949792,290404192,4.303810,16.0
102494,527850,290404192,5957304888,5.250640,37.0


Convert to datetime running_time feature

In [121]:
orders_test_dfrm['running_time'] = pd.to_datetime(orders_test_dfrm['running_time'])

Check and replace negative ids

In [122]:
orders_test_dfrm["Id"] = np.abs(orders_test_dfrm["Id"])
nodes_test_dfrm["Id"] = np.abs(nodes_test_dfrm["Id"])

Create new feature - time_of_day (diving hours of order to 6 categories)

In [123]:
hours = orders_dfrm["running_time"].apply(lambda x: x.hour)
orders_dfrm["hours"] = hours.values
bins = [0, 5, 10, 13, 16, 20, 23]
names = ['night', 'morning', 'afternoon_1', 'afternoon_2', 'evening_1', 'evening_2']

orders_dfrm['time_of_day'] = pd.cut(orders_dfrm['hours'], bins, labels=names)       

Create new features - wheather forecasting. Using the meteostat library, we select the most frequent events on the day in question

In [124]:
from sklearn.impute import KNNImputer

imputer = KNNImputer()
imputed = imputer.fit_transform(nodes_dfrm)
df_nodes_new = pd.DataFrame(imputed, columns=nodes_dfrm.columns)
df_nodes_new.head()

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,6198.0,8952394000.0,8952394000.0,138.79571,37.0
1,6198.0,2059504000.0,4548172000.0,95.273001,33.0
2,6198.0,2059505000.0,2059504000.0,137.647881,35.0
3,6198.0,1570777000.0,1977019000.0,4.383708,30.0
4,6198.0,1977019000.0,1977019000.0,24.195593,31.0


In [125]:
from meteostat import Point,Hourly, Stations
import osmapi as osm
from datetime import timedelta

In [145]:
api = osm.OsmApi()
node = api.NodeGet(nodes_dfrm['node_start'][6])

stations = Stations()
stations = stations.nearby(node['lat'], node['lon'])
station = stations.fetch(1)

In [146]:
w = Hourly(station, orders_dfrm["running_time"].min() - timedelta(hours=0,minutes=35), orders_dfrm["running_time"].max())

In [147]:
w = w.fetch()

In [148]:
w["coco"].unique()
weather_types = {
    3:"Cloudy",
    4:"Overcast",
    5:"Fog",
    21:"Snow Shower",
    22: "Heavy Snow Shower"
}

In [149]:
w.index = pd.to_datetime(w.index)
w["hour"] = w.index.hour
df_orders_new = orders_dfrm.merge(w[["hour", "coco"]], left_on = "hours", right_on="hour", how="outer")

df_orders_new["hour"] = df_orders_new["hour"].fillna(3)
df_orders_new["coco"] = df_orders_new["coco"].fillna(21)

df_orders_new.drop(columns=["hour"], inplace=True)
df_orders_new["coco"] = df_orders_new["coco"].replace(weather_types)
df_orders_w = pd.get_dummies(df_orders_new, prefix='', prefix_sep='')
df_orders_w

Unnamed: 0,Id,running_time,route_distance_km,hours,night,morning,afternoon_1,afternoon_2,evening_1,evening_2,Fog,Heavy Snow Shower,Overcast,Snow Shower
0,6198,2022-01-24 03:38:30,4.744,3,1,0,0,0,0,0,0,0,0,1
1,6417,2022-01-24 03:45:51,6.279,3,1,0,0,0,0,0,0,0,0,1
2,7054,2022-01-24 03:52:14,3.934,3,1,0,0,0,0,0,0,0,0,1
3,9628,2022-01-24 04:03:21,5.959,4,1,0,0,0,0,0,0,0,0,1
4,10283,2022-01-24 04:01:12,7.028,4,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,525706,2022-01-24 18:46:17,2.897,18,0,0,0,0,1,0,0,0,0,1
996,526604,2022-01-24 18:46:44,3.482,18,0,0,0,0,1,0,0,0,0,1
997,527213,2022-01-24 18:47:25,3.486,18,0,0,0,0,1,0,0,0,0,1
998,527520,2022-01-24 18:52:01,0.703,18,0,0,0,0,1,0,0,0,0,1


# Nodes.csv processing

Counting general distance and approximate time for every order

In [150]:
ids = list(nodes_dfrm["Id"].unique())
dict_with_approx_time = {}
dict_with_lists_of_finish = {}
dict_with_start = {}
dict_with_distances = {}

for idx, row in nodes_dfrm.iterrows():
    if row["Id"] not in dict_with_approx_time.keys():
        dict_with_approx_time[row["Id"]] = row["distance"] / row["speed"]
        dict_with_distances[row["Id"]] = row["distance"]
        dict_with_start[row["Id"]] = row["node_start"]
        dict_with_lists_of_finish[row["Id"]] = [row["node_finish"]]
        
    else:
        dict_with_approx_time[row["Id"]] += (row["distance"] / row["speed"])
        dict_with_distances[row["Id"]] += row["distance"]
        dict_with_lists_of_finish[row["Id"]].append(row["node_finish"])
        
dict_with_avg_speed = {}
dict_with_finish = {}

for k,v in dict_with_lists_of_finish.items():
    dict_with_finish[k] = v[-1]

for k,v in dict_with_distances.items():
    dict_with_avg_speed[k] = (v /  dict_with_approx_time[k])

In [151]:
start_dfrm = pd.DataFrame.from_dict(dict_with_start, orient='index').reset_index()
start_dfrm.columns = ["Id", "node_start"]
finish_dfrm = pd.DataFrame.from_dict(dict_with_finish, orient='index').reset_index()
finish_dfrm.columns = ["Id", "node_finish"]
distances_dfrm = pd.DataFrame.from_dict(dict_with_distances, orient='index').reset_index()
distances_dfrm.columns = ["Id", "avg_distance"]
time_dfrm = pd.DataFrame.from_dict(dict_with_approx_time, orient='index').reset_index()
time_dfrm.columns = ["Id", "avg_time"]

new_nodes_dfrm = pd.merge(start_dfrm, finish_dfrm, on=["Id"], how="left")
new_nodes_dfrm = pd.merge(new_nodes_dfrm, distances_dfrm, on=["Id"], how="left")
new_nodes_dfrm = pd.merge(new_nodes_dfrm, time_dfrm, on=["Id"], how="left")

new_nodes_dfrm['node_finish'] = new_nodes_dfrm['node_finish'].astype(np.int64)
new_nodes_dfrm['node_start'] = new_nodes_dfrm['node_start'].astype(np.int64)
new_nodes_dfrm['Id'] = new_nodes_dfrm['Id'].astype(np.int64)

In [152]:
new_nodes_dfrm["avg_time"] = new_nodes_dfrm["avg_time"].fillna(0)
new_nodes_dfrm

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time
0,6198,8952394129,6878011682,4706.362035,108.636935
1,6417,290008230,1262365786,6238.545582,145.783253
2,7054,1984088902,317189357,3905.904002,107.683743
3,9628,290941343,290897544,5949.408002,155.985143
4,10283,290941344,7878405269,7014.933334,181.875917
...,...,...,...,...,...
995,525706,290800924,3127870582,2895.784059,79.544906
996,526604,4775833861,4909437297,3483.059160,99.002724
997,527213,4807102920,4732308198,3440.951601,100.224202
998,527520,5966955830,4491475140,702.519259,21.358506


Using osmapi parsing latitude and longitude of node location

In [153]:
import osmapi as osm

api = osm.OsmApi()

lat_start_feature = []
lon_start_feature = []

lat_finish_feature = []
lon_finish_feature = []

new_id = []
new_node_start = []
new_node_finish = []
new_avg_distances = []
new_avg_time = []


elements_not_found_or_deleted = 0

for idx, row in new_nodes_dfrm.iterrows():
    try:
        start_node = api.NodeGet(int(row['node_start']))
        lat_start_feature.append(start_node['lat'])
        lon_start_feature.append(start_node['lon'])
        
    except osm.ElementDeletedApiError:
        elements_not_found_or_deleted += 1
        lat_start_feature.append(0)
        lon_start_feature.append(0)
    except osm.ElementNotFoundApiError:
        elements_not_found_or_deleted += 1
        lat_start_feature.append(0)
        lon_start_feature.append(0)
            
    try:    
        finish_node = api.NodeGet(int(row['node_finish']))
        lat_finish_feature.append(finish_node['lat'])
        lon_finish_feature.append(finish_node['lon'])
    except osm.ElementDeletedApiError:
        elements_not_found_or_deleted += 1
        lat_finish_feature.append(0)
        lon_finish_feature.append(0)
    except osm.ElementNotFoundApiError:
        elements_not_found_or_deleted += 1
        lat_finish_feature.append(0)
        lon_finish_feature.append(0)
        
    new_id.append(row['Id'])    
    new_avg_time.append(row['avg_time'])
    new_avg_distances.append(row['avg_distance'])
    new_node_start.append(int(row['node_start']))
    new_node_finish.append(int(row['node_finish']))

In [44]:
new_nodes_dict = {}

new_nodes_dict["Id"] = new_id
new_nodes_dict["node_start"] = new_node_start
new_nodes_dict["node_finish"] = new_node_finish
new_nodes_dict["avg_distance"] = new_avg_distances
new_nodes_dict["avg_time"] = new_avg_time

new_nodes_dict["lat_start"] = lat_start_feature
new_nodes_dict["lon_start"] = lon_start_feature

new_nodes_dict["lat_finish"] = lat_start_feature
new_nodes_dict["lon_finish"] = lon_finish_feature

updated_nodes_dfrm = pd.DataFrame.from_dict(new_nodes_dict)
updated_nodes_dfrm

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time,lat_start,lon_start,lat_finish,lon_finish
0,1.403211e+18,290773715,4768528694,5147.066238,169.263939,46.438158,30.724242,46.438158,30.728830
1,5.763551e+18,10980432,3719876029,5621.990105,186.073913,46.472665,30.739248,46.472665,0.000000
2,5.443825e+18,274917390,5218361665,3120.920323,0.000000,46.479656,30.707868,46.479656,30.696352
3,4.695904e+18,290800926,482648118,4697.003242,143.882348,46.438810,30.712162,46.438810,30.718724
4,8.978881e+17,27126445,1987168307,1764.180784,60.399808,46.417304,30.726467,46.417304,30.737505
...,...,...,...,...,...,...,...,...,...
5995,1.281328e+18,27126477,27126488,5552.756776,170.135061,46.427259,30.752251,46.427259,30.760893
5996,4.331594e+18,10980427,4773378423,5506.384590,142.414984,46.468440,30.739611,46.468440,30.747758
5997,7.030316e+18,290800917,8952394133,1720.922522,48.889626,46.432186,30.716974,46.432186,10.365401
5998,2.069922e+18,10980464,4768348532,947.986207,42.532845,46.471040,30.727314,46.471040,30.717741


Convert avg_time and avg_distance to km/hour and km respectively

In [45]:
updated_nodes_dfrm["avg_time"] = updated_nodes_dfrm["avg_time"].apply(lambda x: (x * 1000) / 3600)
updated_nodes_dfrm["avg_distance"] = updated_nodes_dfrm["avg_distance"].apply(lambda x: (x / 1000))

Save preproccesed data

In [52]:
updated_nodes_dfrm.to_csv('new_nodes.csv')

### Counting distances between center of Odessa and locations

In [59]:
import geopy.distance
import urllib
import json

oddesa_center = (46.482952, 30.712481)

regions_start = []
regions_finish = []

centr_distance_st = []
centr_distance_fin = []

for idx, row in updated_nodes_dfrm.iterrows():
    if int(row['lat_start']) != 0 and int(row['lon_start']) != 0:
        centr_distance_st.append(geopy.distance.geodesic((row['lat_start'], row['lon_start']), oddesa_center).km)       
    else:
        centr_distance_st.append(20)
        
    if int(row['lat_finish']) != 0 and int(row['lon_finish']) != 0:
        centr_distance_fin.append(geopy.distance.geodesic((row['lat_start'], row['lon_finish']), oddesa_center).km)       
    else:
        centr_distance_fin.append(20)

In [60]:
updated_nodes_dfrm["centr_distance_st"] = centr_distance_st
updated_nodes_dfrm["centr_distance_fin"] = centr_distance_fin

In [61]:
updated_nodes_dfrm

Unnamed: 0,Id,node_start,node_finish,avg_distance,avg_time,lat_start,lon_start,lat_finish,lon_finish,centr_distance_st,centr_distance_fin
0,1.403211e+18,290773715,4768528694,5.147066,47.017761,46.438158,30.724242,46.438158,30.728830,5.060632,5.135280
1,5.763551e+18,10980432,3719876029,5.621990,51.687198,46.472665,30.739248,46.472665,0.000000,2.352199,20.000000
2,5.443825e+18,274917390,5218361665,3.120920,0.000000,46.479656,30.707868,46.479656,30.696352,0.509588,1.291575
3,4.695904e+18,290800926,482648118,4.697003,39.967319,46.438810,30.712162,46.438810,30.718724,4.906856,4.930175
4,8.978881e+17,27126445,1987168307,1.764181,16.777725,46.417304,30.726467,46.417304,30.737505,7.376104,7.546448
...,...,...,...,...,...,...,...,...,...,...,...
5995,1.281328e+18,27126477,27126488,5.552757,47.259739,46.427259,30.752251,46.427259,30.760893,6.903768,7.222168
5996,4.331594e+18,10980427,4773378423,5.506385,39.559718,46.468440,30.739611,46.468440,30.747758,2.635005,3.153057
5997,7.030316e+18,290800917,8952394133,1.720923,13.580452,46.432186,30.716974,46.432186,10.365401,5.653728,1558.781118
5998,2.069922e+18,10980464,4768348532,0.947986,11.814679,46.471040,30.727314,46.471040,30.717741,1.746690,1.384377


Save preproccesed data

In [None]:
updated_nodes_dfrm.to_csv("updated_nodes_dfrm.csv")

### Parsing raions of locations using Google Maps API

In [None]:
import geopy.distance
import urllib
import json


regions_start = []
regions_finish = []


for idx, row in updated_nodes_dfrm.iterrows():
    if int(row['lat_start']) != 0 and int(row['lon_start']) != 0:
        link = "https://maps.googleapis.com/maps/api/geocode/json?latlng=" + str(row['lat_start']) + ","+ str(row['lon_start']) + "&key=AIzaSyDNAlAn-Kd_OsoR7gisnOwrxhingfX7wRA"
        with urllib.request.urlopen(link) as url:
                data = json.load(url)
        if data["results"] and len(data["results"]) > 1 and len(data["results"][0]["address_components"]) > 1:
            regions_start.append(data["results"][0]["address_components"][1]["long_name"])
        else:
            regions_start.append("unknown_region")
    else:
        regions_start.append("unknown_region")
        
    if int(row['lat_finish']) != 0 and int(row['lon_finish']) != 0:      
        link = "https://maps.googleapis.com/maps/api/geocode/json?latlng=" +  str(row['lat_start']) + ","+ str(row['lon_finish']) + "&key=AIzaSyDNAlAn-Kd_OsoR7gisnOwrxhingfX7wRA"
        with urllib.request.urlopen(link) as url:
                data = json.load(url)
        if data["results"] and len(data["results"]) > 1 and len(data["results"][0]["address_components"]) > 1:
            regions_finish.append(data["results"][0]["address_components"][1]["long_name"])
        else:
            regions_finish.append("unknown_region")
    else:
        regions_finish.append("unknown_region")

Next we can delete coordinates of locations

In [None]:
updated_nodes_dfrm = updated_nodes_dfrm.drop(["lat_start", "lon_start", "lat_finish", "lon_finish"], axis=1)
updated_nodes_dfrm

Use One-Hot Encoding approach to get unique raions in Odessa

In [None]:
temp = pd.get_dummies(updated_nodes_dfrm["regions_start"], prefix='', prefix_sep='')
temp

In [None]:
final_nodes = pd.concat([updated_nodes_dfrm,temp], axis=1)
final_nodes = final_nodes.drop(["regions_start", "regions_finish"], axis=1)

Save data for further work

In [None]:
final_nodes.to_csv("final_train_nodes.csv")

 ## The same code was applied for processing nodes_test.csv  