In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import pickle

from itertools import chain
from itertools import product
from itertools import groupby
from operator import itemgetter

import fiona
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from mpl_toolkits.basemap import Basemap
from shapely.geometry import Point, Polygon, MultiPoint
from descartes import PolygonPatch

import matplotlib.colors as mpl_colors
from random import randint
import time

from geopy.distance import vincenty

from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Setting up environment

In [None]:
# Size (in inches) applied to every saved figure via fig.set_size_inches.
img_width_inches = 7.22 * 3
img_height_inches = 5.25 * 3

# Stadium position as (longitude, latitude) -- the same order the route points
# use when they are projected with m(point[0], point[1]).
stadium_coords = 30.28725, 59.95271


def dist_to_stadium(point):
    """Return the distance in miles between ``point`` and the stadium.

    ``point`` is a (longitude, latitude) pair, like ``stadium_coords``.
    geopy reads coordinate pairs as (latitude, longitude), so both pairs are
    swapped before being handed to ``vincenty``; passing them unswapped
    measures the distance between two different places on the globe.
    """
    # NOTE(review): `vincenty` no longer exists in geopy >= 2.0 (replaced by
    # `geodesic`) -- confirm the pinned geopy version.
    stadium_lon, stadium_lat = stadium_coords
    return vincenty((stadium_lat, stadium_lon), (point[1], point[0])).miles

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can run arbitrary code -- only load files that
    # were produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Station lookup; entries are indexed with [0] and [1] to get the coordinates.
station_dict = _load_pickle('./data_routes_pickle/cell_id_lac_info')

# Per-route data; the three lists are indexed in parallel by route number.
routes_user_id = _load_pickle('./data_routes_pickle/routes_user_id')
routes_coord = _load_pickle('./data_routes_pickle/routes_coord')
routes_station = _load_pickle('./data_routes_pickle/routes_cell_id')

### Reading map (really long operation)

In [None]:
# NOTE(review): `m` is the Basemap built in the "Creating basemap" cell, which
# appears *after* this one -- on a fresh kernel that cell has to be run first.
m.readshapefile('data_maps_input/spb', 'city_polygons', drawbounds=False)

# One shapely polygon per shape read from the shapefile.
df_map = pd.DataFrame(
    {'poly': [Polygon(ring) for ring in m.city_polygons]})

# Matching matplotlib patches: black fill with a thin grey outline.
df_map['patches'] = [
    PolygonPatch(poly, fc='black', ec='grey', lw=0.55, alpha=1., zorder=2)
    for poly in df_map['poly']]

### Creating basemap, setting up scale etc.

In [None]:
# Only the bounding box of the shapefile is needed here.
with fiona.open('data_maps_input/spb.shp') as shp:
    bds = shp.bounds

ll = bds[0], bds[1]
ur = bds[2], bds[3]
coords = [*ll, *ur]
west, south, east, north = coords
w, h = east - west, north - south

# Negative value: the map corners below move inward, i.e. the view is zoomed in.
zoom_out_frac = -0.46

# Both longitude corners are additionally shifted east by 6% of the width.
m = Basemap(
    projection='tmerc',
    ellps='WGS84',
    lon_0=30.5,
    lat_0=60.,
    lat_ts=0,
    llcrnrlon=west + (east - west) * 0.06 - zoom_out_frac * w,
    llcrnrlat=south - zoom_out_frac * h,
    urcrnrlon=east + (east - west) * 0.06 + zoom_out_frac * w,
    urcrnrlat=north + zoom_out_frac * h,
    resolution='i',
    suppress_ticks=True)

In [None]:
# Project every station (first two entries of its record) into map coordinates.
map_points = pd.Series([
    Point(m(station_dict[key][0], station_dict[key][1]))
    for key in station_dict])

# Same points bundled into a single geometry for the scatter plots below.
station_points = MultiPoint(list(map_points.values))

# Filtering

### Current routes resetting

In [None]:
# Point the "current" working lists back at the unfiltered data. These are
# aliases, not copies; the filter cells below rebind them to new lists.
routes_coord_cur, routes_user_id_cur = routes_coord, routes_user_id

### Filtering by total distance

In [None]:
# Total length in miles of every unfiltered route: the sum of the distances
# between consecutive points. Routes with fewer than two points get 0.0.
# Route points are (longitude, latitude) while geopy expects
# (latitude, longitude), so each pair is swapped before measuring.
dists_total = []
for path in routes_coord:
    dists_total.append(sum(
        (vincenty((start[1], start[0]), (end[1], end[0])).miles
         for start, end in zip(path, path[1:])),
        0.0))

In [None]:
# Positions (in the unfiltered route list) of routes shorter than 10 miles.
# The lengths come from `dists_total`, computed in the previous cell.
dist_total_idxs = [i for i, dist in enumerate(dists_total) if dist < 10]

routes_station_filtered_dist_total = [routes_station[i] for i in dist_total_idxs]


def routes_coord_filter_dist_total(routes_coord):
    """Return the entries of ``routes_coord`` at the positions in ``dist_total_idxs``.

    The positions refer to the unfiltered route list, so ``routes_coord`` must
    still be in that original order and length.
    """
    return [routes_coord[i] for i in dist_total_idxs]


# Filter the user ids with the same positions so they stay aligned with the
# routes; the later filters pair routes and user ids by index.
# NOTE: run the "Current routes resetting" cell before re-running this one.
routes_user_id_cur = [routes_user_id_cur[i] for i in dist_total_idxs]
routes_coord_cur = routes_coord_filter_dist_total(routes_coord_cur)

### Filtering by any segment distance

In [None]:
# Longest allowed distance (miles) between two consecutive points of a route.
threshold_route_any = 3


def filter_dist_any(routes_coord, routes_user_id):
    """Drop every route that contains a segment longer than ``threshold_route_any``.

    routes_coord: list of routes, each a sequence of (longitude, latitude) points.
    routes_user_id: user ids, indexed in parallel with ``routes_coord``.

    Prints how many routes were removed and returns the pair
    ``(kept_routes, kept_user_ids)``. Routes with fewer than two points have
    no segments and are always kept.
    """
    def has_long_segment(path):
        # geopy expects (latitude, longitude); route points are stored the
        # other way round, so swap each pair. any() stops at the first hit.
        return any(
            vincenty((start[1], start[0]), (end[1], end[0])).miles > threshold_route_any
            for start, end in zip(path, path[1:]))

    kept_routes = []
    kept_user_ids = []

    for i in range(len(routes_coord)):
        path = routes_coord[i]
        user_id = routes_user_id[i]

        if not has_long_segment(path):
            kept_routes.append(path)
            kept_user_ids.append(user_id)

    removed_path_count = len(routes_coord) - len(kept_routes)
    print("Removed " + str(removed_path_count) + " routes out of " + str(len(routes_coord)))

    return kept_routes, kept_user_ids

# Apply the per-segment filter to the current routes and their user ids.
routes_coord_cur, routes_user_id_cur = filter_dist_any(routes_coord_cur, routes_user_id_cur)

### Filtering by stadium-closest point distance

In [None]:
# A route is kept only if one of its points is closer than this (miles) to the stadium.
threshold_route_closest = 0.1


def filter_closest(routes_coord, routes_user_id):
    """Keep only the routes that pass close to the stadium.

    routes_coord: list of routes, each a sequence of points accepted by
        ``dist_to_stadium``.
    routes_user_id: user ids, indexed in parallel with ``routes_coord``.

    A route is kept when at least one point is closer to the stadium than
    ``threshold_route_closest``; empty routes are therefore removed. Prints
    how many routes were removed and returns ``(kept_routes, kept_user_ids)``.
    """
    kept_routes = []
    kept_user_ids = []

    for route_idx in range(len(routes_coord)):
        path = routes_coord[route_idx]
        user_id = routes_user_id[route_idx]

        # any() stops at the first close point instead of scanning the whole
        # route, and no longer reuses the outer loop index.
        if any(dist_to_stadium(point) < threshold_route_closest for point in path):
            kept_routes.append(path)
            kept_user_ids.append(user_id)

    removed_path_count = len(routes_coord) - len(kept_routes)
    print("Removed " + str(removed_path_count) + " routes out of " + str(len(routes_coord)))

    return kept_routes, kept_user_ids

# Apply the stadium-proximity filter to the current routes and their user ids.
routes_coord_cur, routes_user_id_cur = filter_closest(routes_coord_cur, routes_user_id_cur)

### Removing cycles

In [None]:
def remove_cycles(routes_coord, routes_user_id):
    """Drop every route that revisits a point after a detour of 3+ steps.

    routes_coord: list of routes, each a sequence of coordinate pairs.
    routes_user_id: user ids, indexed in parallel with ``routes_coord``.

    Consecutive repeats of a point are ignored when looking for cycles. A
    route counts as cyclic when some point shows up again at least three
    positions later and the point two positions after it is different (so a
    plain A-B-A-B oscillation is not reported). Kept routes are returned with
    their points converted to tuples but otherwise unchanged. Prints how many
    routes were removed and returns ``(kept_routes, kept_user_ids)``.
    """
    def has_cycle(points):
        count = len(points)
        return any(
            points[start] == points[start + span]
            and points[start] != points[start + 2]
            for span in range(3, count)
            for start in range(count - span))

    acyclic_routes = []
    acyclic_user_ids = []
    removed_path_count = 0

    for idx, raw_route in enumerate(routes_coord):
        # Tuples are hashable and compare by value, which groupby relies on.
        as_tuples = [tuple(point) for point in raw_route]
        owner = routes_user_id[idx]

        # Collapse runs of identical consecutive points before the search.
        collapsed = [point for point, _ in groupby(as_tuples)]

        if has_cycle(collapsed):
            removed_path_count += 1
            continue

        acyclic_routes.append(as_tuples)
        acyclic_user_ids.append(owner)

    print("Removed " + str(removed_path_count) + " routes out of " + str(len(routes_coord)))

    return acyclic_routes, acyclic_user_ids

# Drop cyclic routes from the current routes and their user ids.
routes_coord_cur, routes_user_id_cur = remove_cycles(routes_coord_cur, routes_user_id_cur)

In [None]:
def remove_dups(routes_coord):
    """Collapse repeated points and back-and-forth oscillations in every route.

    For each route, runs of identical consecutive points are first reduced to
    one point. Then every stretch that alternates between two points
    (A, B, A, B, ...) is shortened:

    * if the route ends inside the stretch, the stretch becomes ``A, B``;
    * otherwise only the point in front of the first "other" point is kept,
      preceded by ``A`` when that point is ``B``; when that point is ``A``
      and the stretch opened the route, ``B`` is put in front instead.

    Returns a new list with one cleaned route per input route, in order.
    """
    cleaned_routes = []

    for raw_route in routes_coord:
        points = [point for point, _ in groupby(raw_route)]
        total = len(points)
        collapsed = []

        pos = 0
        while pos < total:
            oscillates = pos < total - 2 and points[pos] == points[pos + 2]
            if not oscillates:
                collapsed.append(points[pos])
                pos += 1
                continue

            first, second = points[pos], points[pos + 1]

            # Position of the first point that belongs to neither side of the
            # oscillation, or None when it lasts until the end of the route.
            exit_pos = None
            for probe in range(pos + 1, total):
                if points[probe] != first and points[probe] != second:
                    exit_pos = probe
                    break

            if exit_pos is None:
                collapsed += [first, second]
                break

            if points[exit_pos - 1] == first:
                if pos == 0:
                    collapsed.append(second)
            else:
                collapsed.append(first)

            # Resume at the last oscillation point; it is appended on the next pass.
            pos = exit_pos - 1

        # The shortening can leave equal neighbours behind; merge them again.
        cleaned_routes.append([point for point, _ in groupby(collapsed)])

    return cleaned_routes

# Clean every current route; the number and order of routes is unchanged.
routes_coord_cur = remove_dups(routes_coord_cur)

In [None]:
# Every route with its first 24 points dropped (routes of 24 points or fewer
# become empty lists).
# NOTE(review): the reason for the cut-off 24 is not visible here -- confirm.
routes_coord_cur_late = [route[24:] for route in routes_coord_cur]

In [None]:
# Persist the current (filtered) routes.
# NOTE(review): the file name hard-codes "any_3" and "closest_0.1", which look
# like the values of threshold_route_any / threshold_route_closest -- keep the
# name in sync by hand if those thresholds change.
with open('data_routes_pickle/routes_coord_f_any_3_closest_0.1_nocycles', 'wb') as f:
    pickle.dump(routes_coord_cur, f)

In [None]:
# Split every current route at the points lying within 0.05 miles of the
# stadium. A route whose first or last point is that close is taken whole
# (and not examined further); an interior hit contributes the part up to and
# including the hit plus the part starting at the hit. Routes with no such
# point contribute nothing.
# NOTE(review): a route with several interior hits is split once per hit, so
# the resulting pieces overlap -- confirm this is intended.
routes_split = []
for route in routes_coord_cur:
    last_idx = len(route) - 1
    for i in range(len(route)):
        if dist_to_stadium(route[i]) < 0.05:
            if i == 0 or i == last_idx:
                routes_split.append(route)
                break
            routes_split += [route[:i + 1], route[i:]]

# Report the number of pieces before and after dropping the ones with fewer
# than three points (the second print used to repeat the first number).
print(len(routes_split))
routes_coord_cur = list(filter(lambda route: len(route) > 2, routes_split))
print(len(routes_coord_cur))

# Plotting current route data

In [None]:
plt.clf()
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# All stations as small light-blue dots.
m.scatter(
    [geom.x for geom in list(station_points)],
    [geom.y for geom in list(station_points)],
    5, marker='.', lw=.0,
    facecolor='#33ccff', edgecolor='w',
    alpha=0.9, antialiased=True, zorder=3)

# Line colours, cycled per route. Defined here because this cell reads
# `colors`, which was otherwise only assigned in the clustering cells further
# down (NameError when the notebook is run top to bottom).
colors = ['red', 'white', 'blue', 'yellow', 'green', 'cyan', 'purple']

# Only the first 100 current routes are drawn.
routes_plot = routes_coord_cur[0:100]

for j, route_plot in enumerate(routes_plot):
    cur_color = colors[j % len(colors)]

    # Draw the route segment by segment so overlapping segments blend.
    for point1, point2 in zip(route_plot, route_plot[1:]):
        x1, y1 = m(point1[0], point1[1])
        x2, y2 = m(point2[0], point2[1])
        plt.plot([x1, x2], [y1, y2],
                 linestyle='-', color=cur_color, zorder=4, alpha=0.5)

print('Plotting done')

fig.set_size_inches(img_width_inches, img_height_inches)

# NOTE(review): `alpha` is not a documented savefig argument and recent
# matplotlib versions may reject it -- confirm against the installed version.
plt.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, alpha=True, facecolor=fig.get_facecolor())

# Data manipulation

In [None]:
# Number of times each coordinate pair occurs across all unfiltered routes.
station_rating_map = {}
for route in routes_coord:
    for raw_coord in route:
        cur_coord = tuple(raw_coord)  # tuples are hashable, so usable as keys
        station_rating_map[cur_coord] = station_rating_map.get(cur_coord, 0) + 1

In [None]:
# Coordinate tuples (despite the "idxs" in the name) ordered from most to
# least visited.
most_visited_station_idxs = sorted(
    station_rating_map,
    key=lambda coord: station_rating_map[coord],
    reverse=True)

# Visit counts of the 200 most visited coordinates.
[station_rating_map[idx] for idx in most_visited_station_idxs[:200]]

In [None]:
plt.clf()
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# Marker size per station: a tenth of its visit count, 0 for stations that
# never occur in a route. Built in station_dict order, the same order used
# for station_points.
ratings = []
for station_key in station_dict:
    station_xy = (station_dict[station_key][0], station_dict[station_key][1])
    if station_xy in station_rating_map:
        ratings.append(station_rating_map[station_xy] / 10)
    else:
        ratings.append(0)

station_geoms = list(station_points)
m.scatter(
    [geom.x for geom in station_geoms],
    [geom.y for geom in station_geoms],
    s=ratings, marker='.', lw=.0,
    facecolor='#33ccff', edgecolor='w',
    alpha=0.9, antialiased=True, zorder=3)

print('Plotting done')

# City polygons go underneath the stations.
ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True))

fig.set_size_inches(img_width_inches, img_height_inches)

plt.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, alpha=True, facecolor=fig.get_facecolor())

# Clustering experiments

In [None]:
# Number of routes to cluster. In the cells shown below it is only referenced
# from commented-out code.
routes_cluster_count = 150

In [None]:
# Load the precomputed matrix and convert it to a numpy array.
# NOTE(review): presumably a square route-by-route similarity matrix -- confirm
# which routes, and in which order, its rows correspond to.
# NOTE(review): unpickling can run arbitrary code; only load trusted files.
with open('data_routes_pickle/sim_matrix_sim_segments_1', 'rb') as f:
    sim_matrix = np.array(pickle.load(f))

In [None]:
# Display the matrix (numpy abbreviates the output of large arrays).
sim_matrix

In [None]:
# Position (row, column) and value of the largest entry. Both are returned as
# one tuple because a cell only displays its last expression; as two separate
# statements the position was computed and then discarded.
np.unravel_index(np.argmax(sim_matrix), sim_matrix.shape), np.amax(sim_matrix)

In [None]:
# Replace every off-diagonal entry x by 250 - x, in place; the diagonal is
# left untouched. The loop bounds used to come from `sim_matrix1`, a name that
# is not defined anywhere in this notebook; the matrix's own shape is used
# instead.
# NOTE: this mutates sim_matrix -- running the cell twice restores the
# original values.
# NOTE(review): the meaning of the constant 250 is not visible here -- confirm.
off_diagonal = ~np.eye(*sim_matrix.shape, dtype=bool)
sim_matrix[off_diagonal] = 250 - sim_matrix[off_diagonal]

In [None]:
# (row, column) positions of the `max_points_count` largest entries of
# sim_matrix, largest first, one position per output row.
max_points_count = 10
max_idxs_raveled = np.argsort(-sim_matrix, axis=None)[:max_points_count]
max_idxs_unravled = np.unravel_index(max_idxs_raveled, sim_matrix.shape)
max_sim_route_idxs = np.transpose(max_idxs_unravled)
max_sim_route_idxs

### Flattening routes (e.g. for point-based clustering methods)

In [None]:
# Each route as one flat list: [x0, y0, x1, y1, ...].
routes_coord_cur_flat = [list(chain.from_iterable(route)) for route in routes_coord_cur]

### k-means clustering

In [None]:
# Two-cluster k-means with a fixed seed. Each row of sim_matrix is used as the
# feature vector of one sample; `idxs` receives one cluster label per row.
kmeans = KMeans(n_clusters=2, random_state=42)
# Alternative kept for reference: cluster the flattened route coordinates.
# kmeans.fit(routes_coord_cur_flat[:routes_cluster_count])
# idxs = kmeans.predict(routes_coord_cur_flat[:routes_cluster_count])
idxs = kmeans.fit_predict(sim_matrix)
idxs

In [None]:
# Print the silhouette score of k-means for 2 to 14 clusters.
# After the loop `kmeans` and `idxs` hold the 14-cluster result, which is what
# the plotting cells below pick up when the notebook is run in order.
# NOTE(review): sim_matrix is used both as k-means features and as the
# precomputed distance matrix for the score -- confirm it holds distances here.
for cluster_count in range(2, 15):
    kmeans = KMeans(random_state=42, n_clusters=cluster_count)
    idxs = kmeans.fit_predict(sim_matrix)
    score = silhouette_score(sim_matrix, idxs, metric='precomputed')
    print(score)

### Clustering results plotting

In [None]:
colors = ['red', 'white', 'blue', 'yellow', 'green', 'cyan', 'purple']

# One figure per cluster label 0..max(idxs). `num_clusters_cur` was read here
# without being defined anywhere in the notebook, so it is derived from the
# current labels.
num_clusters_cur = int(np.max(idxs)) + 1

for cluster_num in range(num_clusters_cur):
    fig = plt.figure(facecolor='black')
    ax = fig.add_subplot(111, frame_on=False)

    # All stations as small light-blue dots.
    m.scatter(
        [geom.x for geom in list(station_points)],
        [geom.y for geom in list(station_points)],
        5, marker='.', lw=.25,
        facecolor='#33ccff', edgecolor='w',
        alpha=0.9, antialiased=True, zorder=3)

    cluster_color = colors[cluster_num % len(colors)]

    # idxs[j] is taken to be the label of routes_coord_cur[j].
    # NOTE(review): confirm idxs was computed for exactly these routes.
    for j, pathcoords in enumerate(routes_coord_cur):
        # Decide once per route, not once per segment.
        if idxs[j] != cluster_num:
            continue

        for point1, point2 in zip(pathcoords, pathcoords[1:]):
            x1, y1 = m(point1[0], point1[1])
            x2, y2 = m(point2[0], point2[1])
            plt.plot([x1, x2], [y1, y2], linestyle='-', color=cluster_color)

    print('Plotting done')

    fig.set_size_inches(img_width_inches, img_height_inches)

    # The cluster number is part of the file name: the timestamp has
    # one-second resolution, so figures saved within the same second would
    # otherwise overwrite each other.
    plt.savefig(
        "data_maps_output/routes {} cluster {}.png".format(int(time.time()), cluster_num),
        dpi=200, alpha=True, facecolor=fig.get_facecolor())

### Cluster centroids plotting (for k-means etc.)

In [None]:
plt.clf()
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

colors = ['red', 'white', 'blue', 'yellow', 'green', 'cyan', 'purple']

# All stations as small light-blue dots.
m.scatter(
    [geom.x for geom in list(station_points)],
    [geom.y for geom in list(station_points)],
    5, marker='.', lw=.25,
    facecolor='#33ccff', edgecolor='w',
    alpha=0.9, antialiased=True, zorder=3)

cluster_total = len(kmeans.cluster_centers_)

# Mean flattened route per cluster, accumulated over the first 100 routes.
# NOTE(review): the element-wise sum only works when all of these routes have
# the same number of points as the first one -- confirm.
centroids = np.zeros((cluster_total, len(routes_coord_cur_flat[0])))
routes_per_cluster = [0] * cluster_total
for j, pathcoords in enumerate(routes_coord_cur_flat[:100]):
    centroids[idxs[j]] += pathcoords
    routes_per_cluster[idxs[j]] += 1

# Clusters without routes keep an all-zero centroid (dividing by 1, not 0).
centroids = [
    centroid / max(routes_per_cluster[i], 1)
    for i, centroid in enumerate(centroids)]

for j, centroid in enumerate(centroids):
    if routes_per_cluster[j] == 0:
        continue  # nothing meaningful to draw for an empty cluster

    # A centroid is [x0, y0, x1, y1, ...]; i walks over the start of every
    # segment. The bound is len - 2 so the final segment is drawn as well
    # (with len - 4 the last segment of every centroid was skipped).
    for i in range(0, len(centroid) - 2, 2):
        x1, y1 = m(centroid[i], centroid[i + 1])
        x2, y2 = m(centroid[i + 2], centroid[i + 3])
        plt.plot([x1, x2], [y1, y2], linestyle='-', color=colors[j % len(colors)])

print('Plotting done')

fig.set_size_inches(img_width_inches, img_height_inches)
plt.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, alpha=True, facecolor=fig.get_facecolor())