In [128]:
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
import skfuzzy as fuzz
from skfuzzy import control as ctrl

In [129]:
nodes_info = pd.read_csv('valid_nodes_info.csv', index_col='node_id')

In [130]:
nodes_info = nodes_info.drop(['visible', 'relations_count', 'ways_count'], axis=1)

In [131]:
nodes_info

Unnamed: 0_level_0,lat,lon
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10980417,46.468524,30.738275
10980418,46.469605,30.738516
10980419,46.469758,30.736733
10980421,46.471836,30.730899
10980422,46.473256,30.731101
...,...,...
8952432738,46.468524,30.738275
8952432739,46.468524,30.738275
8952432740,46.468524,30.738275
8952432741,46.468524,30.738275


In [132]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

nodes_info_scaled = scaler.fit_transform(nodes_info)

In [133]:
nodes_info_scaled.shape

(19371, 2)

In [134]:
data = nodes_info_scaled
 
# Define the number of clusters
n_clusters = 30
 
# Apply fuzzy c-means clustering
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    data.T, n_clusters, 2, error=0.005, maxiter=1000, init=None
)
 
# Predict cluster membership for each data point
cluster_membership = np.argmax(u, axis=0)
 
# Print the cluster centers
print('Cluster Centers:', cntr)
 
# Print the cluster membership for each data point
print('Cluster Membership:', cluster_membership)

Cluster Centers: [[0.82959566 0.70687293]
 [0.48573366 0.4891292 ]
 [0.17408041 0.50274471]
 [0.47037174 0.59890585]
 [0.39437872 0.21851579]
 [0.50902477 0.55557865]
 [0.34217805 0.6120189 ]
 [0.26599417 0.45677948]
 [0.43178259 0.4248486 ]
 [0.33634389 0.39901697]
 [0.49150082 0.4204244 ]
 [0.45967761 0.54065924]
 [0.39149263 0.56319557]
 [0.06496583 0.38367942]
 [0.47455786 0.26596143]
 [0.53414974 0.36827481]
 [0.369929   0.433799  ]
 [0.51897782 0.51320908]
 [0.69672547 0.51544666]
 [0.90391089 0.74943456]
 [0.23417929 0.50926052]
 [0.14931942 0.4380518 ]
 [0.80555196 0.89256749]
 [0.85456641 0.77318448]
 [0.56212129 0.46902702]
 [0.40872619 0.33430383]
 [0.21540881 0.43976724]
 [0.32089805 0.50104546]
 [0.37558515 0.12801767]
 [0.27325959 0.59458715]]
Cluster Membership: [11 11 11 ... 11 11 11]


In [135]:
centroids = pd.DataFrame(scaler.inverse_transform(cntr), columns=['lat', 'lon'])

In [136]:
centroids['size'] = 16

In [137]:
centroids.head(30)

Unnamed: 0,lat,lon,size
0,46.570095,30.783481,16
1,46.475562,30.724772,16
2,46.389883,30.728443,16
3,46.471338,30.754371,16
4,46.450447,30.651809,16
5,46.481965,30.742689,16
6,46.436096,30.757906,16
7,46.415152,30.71605,16
8,46.46073,30.707441,16
9,46.434492,30.700476,16


In [138]:
import plotly.express as px

# px.set_mapbox_access_token(open(".mapbox_token").read())
fig = px.scatter_mapbox(centroids,
                    lat="lat",
                    lon="lon", zoom=3, height=300)

fig.update_layout(mapbox_style="stamen-terrain", mapbox_zoom=4, mapbox_center_lat = 41,
    margin={"r":0,"t":0,"l":0,"b":0})

fig.show()

In [139]:
pd.DataFrame(
    u.T, 
    columns=["Cluster " + str(i) for i in range(u.shape[0])]
)

Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,Cluster 8,Cluster 9,...,Cluster 20,Cluster 21,Cluster 22,Cluster 23,Cluster 24,Cluster 25,Cluster 26,Cluster 27,Cluster 28,Cluster 29
0,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,0.000166,0.000066,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
1,0.000116,0.006125,0.000220,0.005372,0.000174,0.008311,0.000938,0.000407,0.001312,0.000519,...,0.000349,0.000172,0.000078,0.000091,0.001281,0.000414,0.000261,0.000853,0.000106,0.000477
2,0.000400,0.027041,0.000765,0.015135,0.000627,0.026537,0.003084,0.001440,0.005067,0.001890,...,0.001216,0.000601,0.000266,0.000312,0.004777,0.001525,0.000921,0.003006,0.000379,0.001617
3,0.001441,0.341356,0.002686,0.031512,0.002594,0.073078,0.008870,0.005246,0.025965,0.007662,...,0.004217,0.002178,0.000933,0.001114,0.024078,0.006721,0.003359,0.010386,0.001525,0.005147
4,0.001401,0.364905,0.002463,0.030243,0.002428,0.079544,0.008053,0.004745,0.023190,0.006916,...,0.003834,0.002004,0.000899,0.001079,0.024955,0.006212,0.003067,0.009213,0.001433,0.004687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19366,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,0.000166,0.000066,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
19367,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,0.000166,0.000066,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
19368,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,0.000166,0.000066,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
19369,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,0.000166,0.000066,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061


In [140]:
nodes_info = pd.concat(
    [
        nodes_info,
        pd.DataFrame(
            u.T,
            index = nodes_info.index,
            columns=["Cluster " + str(i) for i in range(u.shape[0])]
        )
    ], axis=1
)

In [141]:
nodes_info

Unnamed: 0_level_0,lat,lon,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,...,Cluster 20,Cluster 21,Cluster 22,Cluster 23,Cluster 24,Cluster 25,Cluster 26,Cluster 27,Cluster 28,Cluster 29
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10980417,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
10980418,46.469605,30.738516,0.000116,0.006125,0.000220,0.005372,0.000174,0.008311,0.000938,0.000407,...,0.000349,0.000172,0.000078,0.000091,0.001281,0.000414,0.000261,0.000853,0.000106,0.000477
10980419,46.469758,30.736733,0.000400,0.027041,0.000765,0.015135,0.000627,0.026537,0.003084,0.001440,...,0.001216,0.000601,0.000266,0.000312,0.004777,0.001525,0.000921,0.003006,0.000379,0.001617
10980421,46.471836,30.730899,0.001441,0.341356,0.002686,0.031512,0.002594,0.073078,0.008870,0.005246,...,0.004217,0.002178,0.000933,0.001114,0.024078,0.006721,0.003359,0.010386,0.001525,0.005147
10980422,46.473256,30.731101,0.001401,0.364905,0.002463,0.030243,0.002428,0.079544,0.008053,0.004745,...,0.003834,0.002004,0.000899,0.001079,0.024955,0.006212,0.003067,0.009213,0.001433,0.004687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8952432738,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
8952432739,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
8952432740,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061
8952432741,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000044,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061


In [75]:
nodes_test_info = pd.read_csv('valid_nodes_test_info.csv', index_col='node_id')
nodes_test_info = nodes_test_info.drop(['visible', 'relations_count', 'ways_count'], axis=1)

nodes_test_info_scaled = scaler.transform(nodes_test_info)

u, u0, d, jm, p, fpc = fuzz.cluster.cmeans_predict(
    nodes_test_info_scaled.T, cntr, 2, error=0.005, maxiter=1000)

In [80]:
nodes_test_info = pd.concat(
    [
        nodes_test_info,
        pd.DataFrame(
            u.T,
            index = nodes_test_info.index,
            columns=["Cluster " + str(i) for i in range(u.shape[0])]
        )
    ], axis=1
)

In [81]:
nodes_test_info

Unnamed: 0_level_0,lat,lon,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,...,Cluster 20,Cluster 21,Cluster 22,Cluster 23,Cluster 24,Cluster 25,Cluster 26,Cluster 27,Cluster 28,Cluster 29
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10980417,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000023,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043
10980418,46.469605,30.738516,0.007064,0.004538,0.000306,0.005469,0.000209,0.000069,0.000749,0.000096,...,0.000192,0.000092,0.000362,0.000229,0.000368,0.003436,0.001027,0.964441,0.000150,0.000355
10980419,46.469758,30.736733,0.024639,0.019119,0.001162,0.026448,0.000833,0.000255,0.002888,0.000359,...,0.000729,0.000359,0.001397,0.000881,0.001368,0.012075,0.003732,0.862288,0.000572,0.001426
10980421,46.471836,30.730899,0.069186,0.108914,0.004096,0.347663,0.003649,0.000907,0.010214,0.001309,...,0.002599,0.001468,0.005186,0.003264,0.004475,0.029538,0.010997,0.236537,0.002102,0.006339
10980422,46.473256,30.731101,0.075225,0.131249,0.003721,0.370207,0.003440,0.000874,0.009050,0.001271,...,0.002382,0.001378,0.004686,0.002979,0.004078,0.025642,0.009845,0.201598,0.001933,0.005851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8952504686,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000023,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043
8952504687,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000023,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043
8952504688,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000023,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043
8952504689,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000023,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043


In [82]:
nodes_info.to_csv("nodes_info_clustered.csv")

In [83]:
nodes_test_info.to_csv("nodes_final_test_info_clustered.csv")

In [116]:
nodes_info_for_cluster = nodes_info.copy()
nodes_info_for_cluster['center'] = 0
center_dataframe = pd.DataFrame([[centroids.iloc[5, 0], centroids.iloc[5, 1], *[0 for _ in range (30)], 1]], 
                   columns=nodes_info_for_cluster.columns)
center_dataframe.iloc[0, 7] = 1
nodes_info_for_cluster = pd.concat([nodes_info_for_cluster, center_dataframe], sort=False)

In [117]:
center_dataframe

Unnamed: 0,lat,lon,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,...,Cluster 21,Cluster 22,Cluster 23,Cluster 24,Cluster 25,Cluster 26,Cluster 27,Cluster 28,Cluster 29,center
0,46.563919,30.832518,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [118]:
nodes_info_for_cluster

Unnamed: 0,lat,lon,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,...,Cluster 21,Cluster 22,Cluster 23,Cluster 24,Cluster 25,Cluster 26,Cluster 27,Cluster 28,Cluster 29,center
10980417,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043,0
10980418,46.469605,30.738516,0.007064,0.004538,0.000306,0.005469,0.000209,0.000069,0.000749,0.000096,...,0.000092,0.000362,0.000229,0.000368,0.003436,0.001027,0.964441,0.000150,0.000355,0
10980419,46.469758,30.736733,0.024639,0.019119,0.001162,0.026448,0.000833,0.000255,0.002888,0.000359,...,0.000359,0.001397,0.000881,0.001368,0.012075,0.003732,0.862288,0.000572,0.001426,0
10980421,46.471836,30.730899,0.069186,0.108914,0.004096,0.347663,0.003649,0.000907,0.010214,0.001309,...,0.001468,0.005186,0.003264,0.004475,0.029538,0.010997,0.236537,0.002102,0.006339,0
10980422,46.473256,30.731101,0.075225,0.131249,0.003721,0.370207,0.003440,0.000874,0.009050,0.001271,...,0.001378,0.004686,0.002979,0.004078,0.025642,0.009845,0.201598,0.001933,0.005851,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8952432739,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043,0
8952432740,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043,0
8952432741,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043,0
8952432742,46.468524,30.738275,0.000709,0.000482,0.000037,0.000625,0.000025,0.000008,0.000093,0.000011,...,0.000011,0.000044,0.000028,0.000045,0.000449,0.000128,0.995967,0.000018,0.000043,0


In [119]:
fig = px.scatter_mapbox(nodes_info_for_cluster,
                    lat="lat",
                    lon="lon", opacity=[i for i in nodes_info_for_cluster['Cluster 5'].values],
                    color_continuous_scale="rdgy", color="center", zoom=3, height=300)

fig.update_layout(mapbox_style="stamen-terrain", mapbox_zoom=4, mapbox_center_lat = 41,
    margin={"r":0,"t":0,"l":0,"b":0})

fig.show()

In [144]:
nodes_info['Cluster num'] = np.argmax(u, axis=0)

In [145]:
nodes_info

Unnamed: 0_level_0,lat,lon,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,...,Cluster 21,Cluster 22,Cluster 23,Cluster 24,Cluster 25,Cluster 26,Cluster 27,Cluster 28,Cluster 29,Cluster num
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10980417,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061,11
10980418,46.469605,30.738516,0.000116,0.006125,0.000220,0.005372,0.000174,0.008311,0.000938,0.000407,...,0.000172,0.000078,0.000091,0.001281,0.000414,0.000261,0.000853,0.000106,0.000477,11
10980419,46.469758,30.736733,0.000400,0.027041,0.000765,0.015135,0.000627,0.026537,0.003084,0.001440,...,0.000601,0.000266,0.000312,0.004777,0.001525,0.000921,0.003006,0.000379,0.001617,11
10980421,46.471836,30.730899,0.001441,0.341356,0.002686,0.031512,0.002594,0.073078,0.008870,0.005246,...,0.002178,0.000933,0.001114,0.024078,0.006721,0.003359,0.010386,0.001525,0.005147,1
10980422,46.473256,30.731101,0.001401,0.364905,0.002463,0.030243,0.002428,0.079544,0.008053,0.004745,...,0.002004,0.000899,0.001079,0.024955,0.006212,0.003067,0.009213,0.001433,0.004687,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8952432738,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061,11
8952432739,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061,11
8952432740,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061,11
8952432741,46.468524,30.738275,0.000014,0.000729,0.000028,0.000629,0.000022,0.000867,0.000120,0.000052,...,0.000022,0.000009,0.000011,0.000150,0.000052,0.000033,0.000111,0.000013,0.000061,11


In [165]:
nodes = pd.read_csv('nodes.csv')
nodes = nodes[(nodes['node_start'].isin(nodes_info.index)) & (nodes['node_finish'].isin(nodes_info.index))]

In [166]:
nodes

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,-2627062893189810184,10980432,2133368107,17.414917,32.0
1,-2627062893189810184,10980433,5212387954,17.186539,26.0
2,-2627062893189810184,10980445,5221700954,28.513481,26.0
3,-2627062893189810184,10980498,10980445,154.266122,25.0
4,-2627062893189810184,10980647,1986137911,8.542824,29.0
...,...,...,...,...,...
480281,-8229597404562288405,4439629611,317187499,157.791878,19.0
480282,-8229597404562288405,4768348190,7113286188,50.972612,21.0
480283,-8229597404562288405,4768348195,4768348190,89.120989,22.0
480284,-8229597404562288405,4768348532,4768348195,116.657109,25.0


In [173]:
orders = pd.read_csv('orders.csv', index_col='Id')

In [158]:
test_orders = pd.read_csv('test.csv', index_col='Id')

In [163]:
start_time = pd.concat([orders, test_orders])['running_time']

In [167]:
nodes = nodes.join(start_time, on='Id')

In [170]:
nodes

Unnamed: 0,Id,node_start,node_finish,distance,speed,running_time
0,-2627062893189810184,10980432,2133368107,17.414917,32.0,2022-01-24 17:21:01
1,-2627062893189810184,10980433,5212387954,17.186539,26.0,2022-01-24 17:21:01
2,-2627062893189810184,10980445,5221700954,28.513481,26.0,2022-01-24 17:21:01
3,-2627062893189810184,10980498,10980445,154.266122,25.0,2022-01-24 17:21:01
4,-2627062893189810184,10980647,1986137911,8.542824,29.0,2022-01-24 17:21:01
...,...,...,...,...,...,...
480281,-8229597404562288405,4439629611,317187499,157.791878,19.0,2022-01-24 11:48:51
480282,-8229597404562288405,4768348190,7113286188,50.972612,21.0,2022-01-24 11:48:51
480283,-8229597404562288405,4768348195,4768348190,89.120989,22.0,2022-01-24 11:48:51
480284,-8229597404562288405,4768348532,4768348195,116.657109,25.0,2022-01-24 11:48:51


In [171]:
nodes = nodes.join(nodes_info['Cluster num'], on='node_start')
nodes = nodes.rename({'Cluster num': 'node_start_cluster'}, axis=1)
nodes = nodes.join(nodes_info['Cluster num'], on='node_finish')
nodes = nodes.rename({'Cluster num': 'node_finish_cluster'}, axis=1)

In [172]:
nodes

Unnamed: 0,Id,node_start,node_finish,distance,speed,running_time,node_start_cluster,node_finish_cluster
0,-2627062893189810184,10980432,2133368107,17.414917,32.0,2022-01-24 17:21:01,11,11
1,-2627062893189810184,10980433,5212387954,17.186539,26.0,2022-01-24 17:21:01,11,11
2,-2627062893189810184,10980445,5221700954,28.513481,26.0,2022-01-24 17:21:01,3,3
3,-2627062893189810184,10980498,10980445,154.266122,25.0,2022-01-24 17:21:01,3,3
4,-2627062893189810184,10980647,1986137911,8.542824,29.0,2022-01-24 17:21:01,3,3
...,...,...,...,...,...,...,...,...
480281,-8229597404562288405,4439629611,317187499,157.791878,19.0,2022-01-24 11:48:51,1,1
480282,-8229597404562288405,4768348190,7113286188,50.972612,21.0,2022-01-24 11:48:51,1,1
480283,-8229597404562288405,4768348195,4768348190,89.120989,22.0,2022-01-24 11:48:51,1,1
480284,-8229597404562288405,4768348532,4768348195,116.657109,25.0,2022-01-24 11:48:51,1,1


In [176]:
input = pd.read_csv('final_df_start_center_finish_pca.csv', index_col='Unnamed: 0')

In [177]:
input

Unnamed: 0,Id,running_time,completed_time,route_distance_km,delta_time,is_afternoon,is_evening,is_morning,is_night,morning_peak_time,...,center_cluster_26,center_cluster_27,center_cluster_28,center_cluster_29,start_pca0,start_pca1,finish_pca0,finish_pca1,center_pca0,center_pca1
0,-4773019581999572651,2022-01-24 18:30:21,2022-01-24 18:44:43,3.740,862.0,0,1,0,0,0,...,0.001009,0.965341,0.000146,0.000343,0.022674,0.002687,-0.003298,-0.006111,0.009688,-0.001712
1,-7575630690398473489,2022-01-24 06:53:53,2022-01-24 07:06:26,3.526,753.0,0,0,1,0,0,...,0.002630,0.028300,0.000603,0.002082,0.020041,-0.007313,-0.004006,0.015119,0.008018,0.003903
2,-6264582368520213833,2022-01-24 10:00:59,2022-01-24 10:15:58,5.071,899.0,0,0,1,0,0,...,0.009700,0.232100,0.001619,0.003763,-0.002792,-0.006090,0.031984,-0.002009,0.014596,-0.004049
3,5964315354301636538,2022-01-24 14:28:05,2022-01-24 14:35:08,2.867,423.0,1,0,0,0,0,...,0.001445,0.002135,0.000686,0.001092,0.133338,-0.016324,0.123455,0.004813,0.128397,-0.005755
4,1372379574816145639,2022-01-24 11:57:29,2022-01-24 12:06:29,3.751,540.0,0,0,1,0,0,...,0.001769,0.002392,0.001492,0.896434,-0.024061,0.013013,-0.041836,0.037430,-0.032948,0.025221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4992,7096714159023973792,2022-01-24 21:10:38,2022-01-24 21:23:53,7.397,795.0,0,0,0,1,0,...,0.035864,0.103321,0.003310,0.005211,0.010001,-0.005469,-0.039235,0.014532,-0.014617,0.004531
4993,-3836026425968071806,2022-01-24 15:10:27,2022-01-24 15:20:21,1.948,594.0,1,0,0,0,0,...,0.214723,0.037090,0.004636,0.005904,-0.018507,-0.040103,-0.018507,-0.040103,-0.018507,-0.040103
4994,2926216435675216636,2022-01-24 13:57:04,2022-01-24 14:03:18,2.547,374.0,1,0,0,0,0,...,0.020623,0.014552,0.026120,0.036224,0.008734,0.017353,-0.000601,0.040633,0.004066,0.028993
4995,-6677307480063489707,2022-01-24 08:46:13,2022-01-24 08:55:34,3.013,561.0,0,0,1,0,1,...,0.031106,0.018309,0.011790,0.017896,0.002130,-0.018799,0.014537,-0.005426,0.008333,-0.012113


In [182]:
import datetime

In [199]:
def generate_calc_mean_speed_func(i):
    cluster_data = nodes[(nodes['node_start_cluster'] == i) | (nodes['node_finish_cluster'] == i)]
    def calc_mean_speed_last_hour(row):
        end_time = row['running_time']
        end_time_str = end_time
        end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        if end_time.hour >= 1:
            start_time = datetime.datetime(end_time.year, end_time.month, end_time.day, end_time.hour - 1, end_time.minute, end_time.second)
        else:
            start_time = datetime.datetime(end_time.year, end_time.month, end_time.day - 1, 23, end_time.minute, end_time.second)

        valid_data = cluster_data[(nodes['running_time'] < end_time_str) & (nodes['running_time'] > str(start_time))]
        
        return 0 if len(valid_data) == 0 else valid_data['speed'].mean()
    
    return calc_mean_speed_last_hour

In [203]:
def generate_calc_calc_taxi_num_func(i):
    cluster_data = nodes[(nodes['node_start_cluster'] == i) | (nodes['node_finish_cluster'] == i)]
    def calc_taxi_num_last_hour(row):
        end_time = row['running_time']
        end_time_str = end_time
        end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        if end_time.hour >= 1:
            start_time = datetime.datetime(end_time.year, end_time.month, end_time.day, end_time.hour - 1, end_time.minute, end_time.second)
        else:
            start_time = datetime.datetime(end_time.year, end_time.month, end_time.day - 1, 23, end_time.minute, end_time.second)

        valid_data = cluster_data[(nodes['running_time'] < end_time_str) & (nodes['running_time'] > str(start_time))]
        
        return len(valid_data['Id'].unique())
    
    return calc_taxi_num_last_hour

In [273]:
global time_data
def generate_calc_mean_speed_taxi_num_func(i):
    cluster_data = nodes[(nodes['node_start_cluster'] == i) | (nodes['node_finish_cluster'] == i)]
    
    start = nodes['running_time'].astype('datetime64[ns]').min()
    start_without_secs = datetime.datetime(start.year, start.month, start.day, start.hour, start.minute, 0)
    # end = nodes['running_time'].astype('datetime64[ns]').max()
    global time_data
    time_data = pd.DataFrame([[str(start_without_secs), 0, 0]], columns=['time', 'mean_speed', 'num_vehicles'])

    time_data = time_data.set_index('time')

    def calc_mean_speed(row):
        global time_data
        end_time = row['running_time']
        end_time_str = end_time
        end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        end_time_without_secs = datetime.datetime(end_time.year, end_time.month, end_time.day, end_time.hour, end_time.minute, 0)
        if end_time_without_secs not in time_data.index:
            if end_time.hour >= 1:
                start_time = datetime.datetime(end_time.year, end_time.month, end_time.day, end_time.hour - 1, end_time.minute, 0)
            else:
                start_time = datetime.datetime(end_time.year, end_time.month, end_time.day - 1, 23, end_time.minute, 0)

            valid_data = cluster_data[(nodes['running_time'] < end_time_str) & (nodes['running_time'] > str(start_time))]

            new_row = pd.DataFrame({'time': [str(end_time_without_secs)],
                                    'mean_speed': [0 if len(valid_data) == 0 else valid_data['speed'].mean()],
                                    'num_vehicles': [len(valid_data['Id'].unique())]})
            new_row = new_row.set_index('time')

            time_data = pd.concat([time_data, new_row], ignore_index=False)
        
        return time_data.loc[str(end_time_without_secs), 'mean_speed']
    
    def calc_taxi_num_last_hour(row):
        global time_data
        end_time = row['running_time']
        end_time_str = end_time
        end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
        end_time_without_secs = datetime.datetime(end_time.year, end_time.month, end_time.day, end_time.hour, end_time.minute, 0)
        if end_time_without_secs not in time_data.index:
            if end_time.hour >= 1:
                start_time = datetime.datetime(end_time.year, end_time.month, end_time.day, end_time.hour - 1, end_time.minute, 0)
            else:
                start_time = datetime.datetime(end_time.year, end_time.month, end_time.day - 1, 23, end_time.minute, 0)

            valid_data = cluster_data[(nodes['running_time'] < end_time_str) & (nodes['running_time'] > str(start_time))]

            new_row = pd.DataFrame({'time': [str(end_time_without_secs)],
                                    'mean_speed': [0 if len(valid_data) == 0 else valid_data['speed'].mean()],
                                    'num_vehicles': [len(valid_data['Id'].unique())]})
            
            new_row = new_row.set_index('time')
            
            time_data = pd.concat([time_data, new_row], ignore_index=False)

        return time_data.loc[str(end_time_without_secs), 'num_vehicles']
    
    return (calc_mean_speed, calc_taxi_num_last_hour)

In [274]:
for i in tqdm.tqdm(range(6, 7)):
    calc_mean_speed, calc_taxi_num_last_hour = generate_calc_mean_speed_taxi_num_func(i)
    input[f'cluster_{i}_mean_speed_last_hour'] = input.apply(calc_mean_speed, axis = 1)
    input[f'cluster_{i}_num_taxis_last_hour'] = input.apply(calc_taxi_num_last_hour, axis = 1)
    # input[] = input.apply(generate_calc_calc_taxi_num_func(i), axis = 1)


Boolean Series key will be reindexed to match DataFrame index.

  0%|          | 0/1 [00:00<?, ?it/s]

                     mean_speed  num_vehicles
time                                         
2022-01-24 00:30:00           0             0





KeyError: "None of ['time'] are in the columns"

In [None]:
input['cluster_0_num_vehicles_last_hour'].mean()

33.132484041937154

In [221]:
print(nodes['running_time'].astype('datetime64[ns]').min())

2022-01-24 00:30:04
