In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import pickle

import fiona
from itertools import chain

import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from mpl_toolkits.basemap import Basemap
from shapely.geometry import Point, Polygon, MultiPoint
from descartes import PolygonPatch

import matplotlib.colors as mpl_colors
from random import randint
import time

from geopy.distance import vincenty

from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Setting up environment

In [None]:
# Size of the saved figures in inches: a 7.22 x 5.25 base scaled up 3x.
img_width_inches, img_height_inches = 7.22 * 3, 5.25 * 3

# Location of the stadium used by the "closest point" filter.
# NOTE(review): the values are in (longitude, latitude) order, the same order
# in which route points are passed to the Basemap projection when plotting.
stadium_coords = (30.28725, 59.95271)

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE(security): `pickle.load` can execute arbitrary code embedded in the
    file, so this must only be used on files produced by a trusted source.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Per-station records; elements [0] and [1] of each record are used as the
# station's coordinates further down.
station_dict = _load_pickle('./data_routes_pickle/cell_id_lac_info')

# presumably one user id per route -- TODO confirm (not used in this notebook)
routes_user_id = _load_pickle('./data_routes_pickle/routes_user_id')

# One sequence of coordinate pairs per route.
routes_coord = _load_pickle('./data_routes_pickle/routes_coord')

# Note the name change: the 'routes_cell_id' file is bound to `routes_station`.
routes_station = _load_pickle('./data_routes_pickle/routes_cell_id')

### Reading map (really long operation)

In [None]:
# NOTE: this cell uses the Basemap instance `m`, which is only created in the
# "Creating basemap" cell further down -- run that cell first.
#
# Load the city shapefile; the polygons become available as `m.city_polygons`
# (drawbounds=False: nothing is drawn at this point).
m.readshapefile('data_maps_input/spb', 'city_polygons', drawbounds=False)

# One shapely Polygon per shapefile ring.
df_map = pd.DataFrame(
    {'poly': [Polygon(ring) for ring in m.city_polygons]})

# Matching matplotlib patches: black fill with a thin white outline.
df_map['patches'] = df_map['poly'].map(
    lambda poly: PolygonPatch(
        poly, fc='black', ec='white', lw=0.55, alpha=1., zorder=2))

### Creating basemap, setting up scale etc.

In [None]:
# Only the bounding box (minx, miny, maxx, maxy) of the shapefile is needed.
# The `with` block closes the file even if reading the bounds fails.
with fiona.open('data_maps_input/spb.shp') as shp:
    bds = shp.bounds

ll = bds[0], bds[1]   # lower-left corner
ur = bds[2], bds[3]   # upper-right corner
coords = list(chain(ll, ur))
w, h = coords[2] - coords[0], coords[3] - coords[1]

# Despite the name this is negative, so every corner below moves *inward* by
# a third of the extent, i.e. the map is zoomed in on the centre of the box.
zoom_out_frac = -0.333

# Both longitude corners additionally get +0.06 * w, which shifts the window
# towards larger longitudes.
m = Basemap(
    projection='tmerc',
    lon_0=30.5,
    lat_0=60.,
    ellps='WGS84',
    llcrnrlon=coords[0] + w * 0.06 - zoom_out_frac * w,
    llcrnrlat=coords[1] - zoom_out_frac * h,
    urcrnrlon=coords[2] + w * 0.06 + zoom_out_frac * w,
    urcrnrlat=coords[3] + zoom_out_frac * h,
    lat_ts=0,
    resolution='i',
    suppress_ticks=True)

In [None]:
# Project every station (elements [0] and [1] of its `station_dict` record)
# through the Basemap instance and wrap the result in a shapely Point.
map_points = pd.Series([
    Point(m(station_dict[key][0], station_dict[key][1]))
    for key in station_dict])

# All projected stations as a single multi-point geometry.
station_points = MultiPoint(map_points.tolist())

# Filtering

### Resetting current routes

In [None]:
# Start the filtering pipeline from the full, unfiltered route list.
# This binds a second name to the same list object (no copy is made); the
# filter cells below rebind `routes_coord_cur` to their results.
routes_coord_cur = routes_coord

### Filtering by total distance

In [None]:
# Total length of every route in miles: the sum of the geodesic distances
# between consecutive points. A route with fewer than two points gets 0.0.
#
# Route points are stored as (lon, lat) -- the order in which they are passed
# to the Basemap projection `m(x, y)` when plotting -- whereas geopy expects
# (lat, lon). Each point is therefore swapped before it is measured.
dists_total = [
    sum(
        (vincenty((point1[1], point1[0]), (point2[1], point2[0])).miles
         for point1, point2 in zip(path, path[1:])),
        0.0)
    for path in routes_coord]

In [None]:
# Routes whose total length (in miles) reaches this bound are discarded.
threshold_route_total = 10

# Positions, within the full `routes_coord` list, of the routes that are short
# enough. `dists_total` is the per-route total computed in the previous cell.
dist_total_idxs = [
    i for i, dist in enumerate(dists_total) if dist < threshold_route_total]

# The same selection applied to the per-route station data.
routes_station_filtered_dist_total = [routes_station[i] for i in dist_total_idxs]

def routes_coord_filter_dist_total(routes_coord):
    """Return the routes of `routes_coord` that passed the total-distance filter.

    `routes_coord` must be index-aligned with the list `dists_total` was
    computed from; the selected positions come from `dist_total_idxs`.

    Raises:
        ValueError: if `routes_coord` has a different length than
            `dists_total`, which means the stored indices refer to another
            list (e.g. this cell was re-run on already filtered routes).
    """
    if len(routes_coord) != len(dists_total):
        raise ValueError(
            "routes_coord has {} routes but distances were computed for {}; "
            "reset the current routes before applying this filter".format(
                len(routes_coord), len(dists_total)))
    return [routes_coord[i] for i in dist_total_idxs]

routes_coord_cur = routes_coord_filter_dist_total(routes_coord_cur)

### Filtering by any segment distance

In [None]:
# Maximum allowed distance, in miles, between two consecutive points of a
# route; read by `filter_dist_any` below.
threshold_route_any = 3

def filter_dist_any(routes_coord, threshold=None, distance_miles=None):
    """Drop every route that contains a hop longer than the threshold.

    A "hop" is the distance between two consecutive points of a route. Routes
    with fewer than two points have no hops and are always kept.

    Args:
        routes_coord: sequence of routes, each a sequence of points stored as
            (lon, lat) pairs.
        threshold: maximum allowed hop length in miles. Defaults to the
            module-level `threshold_route_any`.
        distance_miles: callable taking two (lat, lon) tuples and returning
            their distance in miles. Defaults to geopy's `vincenty`.

    Returns:
        A new list with the routes that have no hop longer than `threshold`,
        in their original order. The number of removed routes is printed.
    """
    def _vincenty_miles(point_a, point_b):
        return vincenty(point_a, point_b).miles

    limit = threshold_route_any if threshold is None else threshold
    measure = _vincenty_miles if distance_miles is None else distance_miles

    def has_long_hop(path):
        # Points are stored as (lon, lat) but distances are measured on
        # (lat, lon), hence the swap. `any` stops at the first long hop.
        return any(
            measure((start[1], start[0]), (end[1], end[0])) > limit
            for start, end in zip(path, path[1:]))

    kept = [path for path in routes_coord if not has_long_hop(path)]
    removed_path_count = len(routes_coord) - len(kept)

    print("Removed " + str(removed_path_count) + " routes out of " + str(len(routes_coord)))

    return kept

# Apply the per-hop filter to the current working set and rebind
# `routes_coord_cur` to the routes that survived.
routes_coord_cur = filter_dist_any(routes_coord_cur)

### Filtering by distance from the stadium to the closest route point

In [None]:
# A route is kept only if at least one of its points lies closer to the
# stadium than this distance, in miles; read by `filter_closest` below.
threshold_route_closest = 0.5

def filter_closest(routes_coord, threshold=None, stadium=None, distance_miles=None):
    """Keep only the routes that pass close to the stadium.

    A route is kept when at least one of its points is strictly closer to the
    stadium than the threshold. Empty routes are removed.

    Args:
        routes_coord: sequence of routes, each a sequence of points stored as
            (lon, lat) pairs.
        threshold: distance in miles a point must be under to count as close.
            Defaults to the module-level `threshold_route_closest`.
        stadium: stadium location as a (lon, lat) pair. Defaults to the
            module-level `stadium_coords`.
        distance_miles: callable taking two (lat, lon) tuples and returning
            their distance in miles. Defaults to geopy's `vincenty`.

    Returns:
        A new list with the routes that come close to the stadium, in their
        original order. The number of removed routes is printed.
    """
    def _vincenty_miles(point_a, point_b):
        return vincenty(point_a, point_b).miles

    limit = threshold_route_closest if threshold is None else threshold
    stadium_lon_lat = stadium_coords if stadium is None else stadium
    measure = _vincenty_miles if distance_miles is None else distance_miles

    # Coordinates are stored as (lon, lat) but distances are measured on
    # (lat, lon), so both the stadium and every point are swapped.
    stadium_lat_lon = (stadium_lon_lat[1], stadium_lon_lat[0])

    def passes_stadium(path):
        # `any` stops at the first point that is close enough.
        return any(
            measure(stadium_lat_lon, (point[1], point[0])) < limit
            for point in path)

    kept = [path for path in routes_coord if passes_stadium(path)]
    removed_path_count = len(routes_coord) - len(kept)

    print("Removed " + str(removed_path_count) + " routes out of " + str(len(routes_coord)))

    return kept

# Apply the stadium-proximity filter to the current working set and rebind
# `routes_coord_cur` to the routes that survived.
routes_coord_cur = filter_closest(routes_coord_cur)

# Plotting current route data

In [None]:
# Plot every route in `routes_coord_cur` on top of the station scatter and
# save the result as a timestamped PNG.
# A fresh figure is created here, so no preceding `plt.clf()` is needed (it
# would only create and show an extra empty figure).
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# `.geoms` iterates the member points on both Shapely 1.x and 2.x; iterating
# a MultiPoint directly no longer works in Shapely 2.
station_geoms = list(station_points.geoms)
m.scatter(
    [geom.x for geom in station_geoms],
    [geom.y for geom in station_geoms],
    5, marker='.', lw=.0,
    facecolor='#33ccff', edgecolor='w',
    alpha=0.9, antialiased=True, zorder=3)

# Cycle through all named matplotlib colours, one per route.
colors = list(mpl_colors.cnames.keys())

paths = routes_coord_cur

for j, path in enumerate(paths):
    if len(path) < 2:
        continue  # a route needs two points to form a segment

    # Project the whole route in one call and draw it as a single polyline
    # above the stations (zorder 4 > 3).
    xs, ys = m([point[0] for point in path], [point[1] for point in path])
    ax.plot(xs, ys, linestyle='-', color=colors[j % len(colors)], zorder=4)

print('Plotting done')

# Optional: uncomment to draw the city polygons loaded into `df_map`.
# ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True))

fig.set_size_inches(img_width_inches, img_height_inches)

# The figure's own face colour is passed explicitly so the black background
# is kept in the file. (`alpha` is not a savefig option and is not passed.)
fig.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, facecolor=fig.get_facecolor())

# Clustering experiments

In [None]:
# Number of routes, taken from the front of the flattened route list, that
# are fed to the clustering and drawn in the cluster plot.
routes_cluster_count = 150

### Flattening routes (e.g. for point-based clustering methods)

In [None]:
# Turn each route from a sequence of coordinate pairs into one flat list of
# numbers: [[a0, b0], [a1, b1], ...] -> [a0, b0, a1, b1, ...].
routes_coord_cur_flat = [
    list(chain.from_iterable(route)) for route in routes_coord_cur]

### k-means clustering

In [None]:
# Cluster the first `routes_cluster_count` flattened routes into 4 groups.
# The fixed random_state makes the run repeatable.
# NOTE(review): this presumably requires every flattened route to have the
# same length -- confirm against the data.
kmeans = KMeans(random_state=42, n_clusters=4).fit(
    routes_coord_cur_flat[:routes_cluster_count])

# Cluster label of each of those routes, in the same order.
idxs = kmeans.predict(routes_coord_cur_flat[:routes_cluster_count])
idxs

### Clustering results plotting

In [None]:
# Draw the clustered routes, coloured by their k-means label, on top of the
# station scatter and save the result as a timestamped PNG.
# A fresh figure is created here, so no preceding `plt.clf()` is needed (it
# would only create and show an extra empty figure).
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# `.geoms` iterates the member points on both Shapely 1.x and 2.x; iterating
# a MultiPoint directly no longer works in Shapely 2.
station_geoms = list(station_points.geoms)
dev = m.scatter(
    [geom.x for geom in station_geoms],
    [geom.y for geom in station_geoms],
    5, marker='.', lw=.25,
    facecolor='#33ccff', edgecolor='w',
    alpha=0.9, antialiased=True, zorder=3)

# One colour per cluster label (labels wrap around if there are more).
colors = ['red', 'white', 'blue', 'yellow', 'green', 'cyan']

all_coords = list(routes_coord_cur_flat[:routes_cluster_count])
for j, pathcoords in enumerate(all_coords):
    # A flattened route is [a0, b0, a1, b1, ...]. Split it back into its two
    # coordinate sequences, ignoring a dangling unpaired value.
    n_points = len(pathcoords) // 2
    if n_points < 2:
        continue  # a route needs two points to form a segment

    # Every point is included, so the last segment of the route is drawn too.
    xs, ys = m(pathcoords[0:2 * n_points:2], pathcoords[1:2 * n_points:2])
    ax.plot(xs, ys, linestyle='-', color=colors[idxs[j] % len(colors)])

print('Plotting done')

# Optional: uncomment to draw the city polygons loaded into `df_map`.
# ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True))

fig.set_size_inches(img_width_inches, img_height_inches)

# The figure's own face colour is passed explicitly so the black background
# is kept in the file. (`alpha` is not a savefig option and is not passed.)
fig.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, facecolor=fig.get_facecolor())

### Cluster centroids plotting (for k-means etc.)

In [None]:
# Draw each k-means cluster centre as a route of its own on top of the
# station scatter and save the result as a timestamped PNG.
# A fresh figure is created here, so no preceding `plt.clf()` is needed (it
# would only create and show an extra empty figure).
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# One colour per cluster (colours wrap around if there are more clusters).
colors = ['red', 'white', 'blue', 'yellow', 'green', 'cyan']

# `.geoms` iterates the member points on both Shapely 1.x and 2.x; iterating
# a MultiPoint directly no longer works in Shapely 2.
station_geoms = list(station_points.geoms)
dev = m.scatter(
    [geom.x for geom in station_geoms],
    [geom.y for geom in station_geoms],
    5, marker='.', lw=.25,
    facecolor='#33ccff', edgecolor='w',
    alpha=0.9, antialiased=True, zorder=3)

for j, pathcoords in enumerate(kmeans.cluster_centers_):
    # A centre has the same flat layout as the routes it was fitted on:
    # [a0, b0, a1, b1, ...]. Split it back into its two coordinate
    # sequences, ignoring a dangling unpaired value.
    n_points = len(pathcoords) // 2
    if n_points < 2:
        continue  # a route needs two points to form a segment

    # Every point is included, so the last segment of the centre is drawn too.
    xs, ys = m(pathcoords[0:2 * n_points:2], pathcoords[1:2 * n_points:2])
    ax.plot(xs, ys, linestyle='-', color=colors[j % len(colors)])

print('Plotting done')

# Optional: uncomment to draw the city polygons loaded into `df_map`.
# ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True))

fig.set_size_inches(img_width_inches, img_height_inches)

# The figure's own face colour is passed explicitly so the black background
# is kept in the file. (`alpha` is not a savefig option and is not passed.)
fig.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, facecolor=fig.get_facecolor())