In [None]:
%matplotlib inline

In [None]:
import os
import pickle
from math import floor
from random import shuffle
from statistics import variance
import math

from itertools import chain
from itertools import product
from itertools import groupby
from operator import itemgetter

import pandas as pd
import numpy as np

import fiona
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from mpl_toolkits.basemap import Basemap
from shapely.geometry import Point, Polygon, MultiPoint
from descartes import PolygonPatch

import matplotlib.colors as mpl_colors
from random import randint
import time

from geopy.distance import vincenty

# Setting up environment

In [None]:
# Size of the saved figures in inches: a 7.22 x 5.25 base size scaled up 3x.
img_width_inches, img_height_inches = 3 * 7.22, 3 * 5.25

# Stadium location stored as (longitude, latitude) -- the same index order in which
# route points are handed to the Basemap projection (``m(point[0], point[1])``).
stadium_coords = 30.28725, 59.95271
def dist_to_stadium(point):
    """Return the distance in miles between ``point`` and the stadium.

    Args:
        point: indexable pair read as ``(longitude, latitude)``.

    Returns:
        The ``.miles`` value of the geopy distance between the two locations.

    geopy interprets a coordinate pair as ``(latitude, longitude)``, so both
    pairs are swapped before the call; passing them unswapped measures the
    distance between two different places on the globe.
    """
    return vincenty(
        (stadium_coords[1], stadium_coords[0]),
        (point[1], point[0])).miles

def remove_dups_conseq(route):
    """Collapse each run of consecutive equal items in ``route`` into one item.

    Non-adjacent repeats are kept: ``[1, 1, 2, 1]`` becomes ``[1, 2, 1]``.
    Always returns a new list.
    """
    return [run_value for run_value, _run in groupby(route)]

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
    with open(path, 'rb') as f:
        return pickle.load(f)

# Indexed by key; elements [0] and [1] of each entry are later used as map x / y.
station_dict = _load_pickle('./data_routes_pickle/cell_id_lac_info')
# routes_user_id[i] is read together with routes_coord[i] by the filters below.
routes_user_id = _load_pickle('./data_routes_pickle/routes_user_id')
routes_coord = _load_pickle('./data_routes_pickle/routes_coord')
routes_station = _load_pickle('./data_routes_pickle/routes_cell_id')

### Reading map (really long operation)

In [None]:
m.readshapefile(
    'data_maps_input/spb',
    'city_polygons',
    drawbounds=False)

df_map = pd.DataFrame({
    'poly': [Polygon(polygon_coords) for polygon_coords in m.city_polygons]})

In [None]:
df_map['patches'] = df_map['poly'].map(lambda x: PolygonPatch(
    x,
    fc='black',
    ec='grey', lw=0.55, alpha=0.6,
    zorder=2))

### Creating basemap, setting up scale etc.

In [None]:
shp = fiona.open('data_maps_input/spb.shp')
bds = shp.bounds
shp.close()

ll = bds[0], bds[1]
ur = bds[2], bds[3]
coords = list(chain(ll, ur))
w, h = coords[2] - coords[0], coords[3] - coords[1]
zoom_out_frac = -0.3

m = Basemap(
    projection='tmerc',
    lon_0=30.5,
    lat_0=60.,
    ellps='WGS84',
    llcrnrlon=coords[0] + (coords[2] - coords[0]) * 0.06 - zoom_out_frac * w,
    llcrnrlat=coords[1] - zoom_out_frac * h,
    urcrnrlon=coords[2] + (coords[2] - coords[0]) * 0.06 + zoom_out_frac * w,
    urcrnrlat=coords[3] + zoom_out_frac * h,
    lat_ts=0,
    resolution='i',
    suppress_ticks=True)

In [None]:
# Project every station into map coordinates: elements [0] and [1] of each
# station_dict entry are passed to the Basemap instance as x and y.
map_points = pd.Series([
    Point(m(station_dict[key][0], station_dict[key][1]))
    for key in station_dict])

# Same points, in station_dict iteration order, as a single MultiPoint.
station_points = MultiPoint(list(map_points.values))

# Filtering

### Current routes resetting

In [None]:
# Point the working-set names back at the full loaded data.
# No copy is made: both names refer to the same list objects as the originals.
routes_coord_cur, routes_user_id_cur = routes_coord, routes_user_id

### Filtering by any segment distance

In [None]:
# Maximum allowed length, in miles, of a single hop between consecutive route points.
threshold_route_any = 5

def filter_dist_any(routes_coord, routes_user_id):
    """Drop every route that contains a hop longer than ``threshold_route_any`` miles.

    Args:
        routes_coord: sequence of routes; each route is a sequence of points
            read as ``(longitude, latitude)`` (the order used by
            ``stadium_coords`` and by the Basemap projection calls in this notebook).
        routes_user_id: sequence where ``routes_user_id[i]`` belongs to
            ``routes_coord[i]``.

    Returns:
        ``(kept_routes, kept_user_ids)``: two new lists, still aligned index by
        index, holding only the routes whose every hop is within the threshold.

    Side effects:
        Prints how many routes were removed.
    """
    def hop_is_long(point1, point2):
        # geopy reads a pair as (latitude, longitude), so swap the (lon, lat) points.
        hop_miles = vincenty((point1[1], point1[0]), (point2[1], point2[0])).miles
        return hop_miles > threshold_route_any

    kept_routes = []
    kept_user_ids = []
    for idx, route in enumerate(routes_coord):
        user_id = routes_user_id[idx]
        # any() stops at the first over-long hop instead of measuring the whole route.
        if any(hop_is_long(route[j], route[j + 1]) for j in range(len(route) - 1)):
            continue
        kept_routes.append(route)
        kept_user_ids.append(user_id)

    removed_route_count = len(routes_coord) - len(kept_routes)
    print("Removed " + str(removed_route_count) + " routes out of " + str(len(routes_coord)))

    return kept_routes, kept_user_ids

# Rebind the working-set names to the filtered routes and their matching user ids.
routes_coord_cur, routes_user_id_cur = filter_dist_any(routes_coord_cur, routes_user_id_cur)

### Filtering by stadium-closest point distance

In [None]:
# A point closer to the stadium than this many miles counts as "at the stadium".
threshold_route_closest = 0.05

def filter_closest(routes_coord, routes_user_id):
    """Keep only the routes that have at least one point at the stadium.

    A point is at the stadium when ``dist_to_stadium(point)`` is below
    ``threshold_route_closest``.

    Args:
        routes_coord: sequence of routes; each route is a sequence of points.
        routes_user_id: sequence where ``routes_user_id[i]`` belongs to
            ``routes_coord[i]``.

    Returns:
        ``(kept_routes, kept_user_ids)``: two new lists, still aligned index by index.

    Side effects:
        Prints how many routes were removed.
    """
    kept_routes = []
    kept_user_ids = []

    for idx, route in enumerate(routes_coord):
        user_id = routes_user_id[idx]
        # One close point is enough, so stop measuring at the first hit.
        if any(dist_to_stadium(point) < threshold_route_closest for point in route):
            kept_routes.append(route)
            kept_user_ids.append(user_id)

    removed_route_count = len(routes_coord) - len(kept_routes)
    print("Removed " + str(removed_route_count) + " routes out of " + str(len(routes_coord)))

    return kept_routes, kept_user_ids

# Rebind the working-set names to the routes that touch the stadium and their user ids.
routes_coord_cur, routes_user_id_cur = filter_closest(routes_coord_cur, routes_user_id_cur)

### Removing cycles (w/ length >= 3) 

In [None]:
def remove_cycles(routes_coord, routes_user_id):
    """Drop every route that returns to an earlier point after 3 or more steps.

    Each route is converted to a list of tuples and stripped of consecutive
    duplicates. In that stripped route a cycle is a position ``s`` whose point
    shows up again ``span >= 3`` positions later, unless the point two positions
    after ``s`` is that same point (a plain back-and-forth, which is not counted here).

    Returns ``(kept_routes, kept_user_ids)``. Kept routes are the tuple-converted
    routes, still including their consecutive duplicates. Prints how many routes
    were removed.
    """
    kept_routes, kept_user_ids = [], []

    for idx, raw_route in enumerate(routes_coord):
        # Tuples are hashable and comparable, which groupby-based dedup relies on.
        route_as_tuples = [tuple(point) for point in raw_route]
        owner_id = routes_user_id[idx]

        deduped = remove_dups_conseq(route_as_tuples)
        size = len(deduped)

        has_cycle = any(
            deduped[s] == deduped[s + span] and deduped[s] != deduped[s + 2]
            for span in range(3, size)
            for s in range(size - span))

        if not has_cycle:
            kept_routes.append(route_as_tuples)
            kept_user_ids.append(owner_id)

    dropped = len(routes_coord) - len(kept_routes)
    print("Removed " + str(dropped) + " routes out of " + str(len(routes_coord)))

    return kept_routes, kept_user_ids

# Rebind the working-set names to the cycle-free routes (now lists of tuples) and their user ids.
routes_coord_cur, routes_user_id_cur = remove_cycles(routes_coord_cur, routes_user_id_cur)

### Cutting all the dups

In [None]:
def cut_dups(routes_coord):
    """
    Cutting all the dups (even w/ length = 2)

    Every route is first stripped of consecutive duplicates. Then each stretch
    where a point equals the point two positions later (an "A, B, A, ..."
    back-and-forth between two points) is collapsed, and consecutive duplicates
    are stripped once more.

    Returns a new list with exactly one processed route per input route, in the
    same order; no route is dropped.

    TODO Consider splitting such 2-length dups (offshoots)
    """
    
    routes_coord_no2cycles_intl = []
    
    # NOTE(review): this counter is never updated or read.
    removed_route_count = 0
    for i in range(len(routes_coord)):
        route_unique = remove_dups_conseq(routes_coord[i])
        
        final_route = []
        
        j = 0
        while j < len(route_unique):
            # The last two positions cannot start an A, B, A pattern; any other
            # position is copied as-is when the point two steps ahead differs.
            if j >= len(route_unique) - 2 or route_unique[j] != route_unique[j + 2]:
                final_route.append(route_unique[j])
                j += 1
            else:
                # A = route_unique[j], B = route_unique[j + 1]. Scan forward for the
                # first point that is neither A nor B, i.e. where the back-and-forth ends.
                k = j + 1
                while k < len(route_unique):
                    if (route_unique[k] != route_unique[j] and
                        route_unique[k] != route_unique[j + 1]):
                        if route_unique[k - 1] == route_unique[j]:
                            # The stretch ends on A: B is kept only when the route
                            # itself starts with this stretch.
                            if j == 0:
                                final_route.append(route_unique[j + 1])
                        else:
                            # The stretch ends on B: emit A now, B follows below.
                            final_route.append(route_unique[j])
                        # Resume at the last point of the stretch; the outer loop
                        # decides what to do with it on its next iteration.
                        j = k - 1
                        break
                    k += 1
                # The back-and-forth runs to the end of the route: keep A, B once and stop.
                if k == len(route_unique):
                    final_route += [route_unique[j], route_unique[j + 1]]
                    break
        
        # Collapsing can leave equal neighbours behind, so dedup again.
        final_route = remove_dups_conseq(final_route)
        routes_coord_no2cycles_intl.append(final_route)
    return routes_coord_no2cycles_intl

# Route count before and after: cut_dups returns one route per input route,
# so both prints show the same number.
print(len(routes_coord_cur))
routes_coord_cur = cut_dups(routes_coord_cur)
print(len(routes_coord_cur))

### Splitting routes on to/from the stadium (optional length filtering)

In [None]:
def split_routes_at_stadium(routes, threshold):
    """Cut routes at their stadium points so that every piece starts at the stadium.

    A point is a stadium point when ``dist_to_stadium(point) < threshold``.
    Points are scanned in order:

    * stadium point first in the route: the route is kept as-is and the scan stops;
    * stadium point last in the route: the reversed route is kept and the scan stops;
    * stadium point in the middle: the part up to it (reversed) and the part from
      it onwards are both kept, and the scan continues.

    Routes without a stadium point contribute nothing.

    NOTE(review): because the scan continues after a middle hit, a route with
    several stadium points yields overlapping pieces -- confirm this is intended.
    """
    pieces = []
    for route in routes:
        last_idx = len(route) - 1
        for idx, point in enumerate(route):
            if dist_to_stadium(point) < threshold:
                if idx == 0:
                    pieces.append(route)
                    break
                elif idx == last_idx:
                    pieces.append(route[::-1])
                    break
                pieces += [route[:idx + 1][::-1], route[idx:]]
    return pieces

# Use the same "at the stadium" threshold as filter_closest, so that a route which
# passed that filter is always split here.
routes_split = split_routes_at_stadium(routes_coord_cur, threshold_route_closest)

# NOTE(review): routes_user_id_cur is not updated in this cell, so from here on it
# no longer lines up index by index with routes_coord_cur.
print(len(routes_coord_cur))
# Optional length filter: keep only pieces with more than two points.
routes_coord_cur = list(filter(lambda route: len(route) > 2, routes_split))
print(len(routes_coord_cur))

### Route angle determining

In [None]:
def vector_diff(point1, point2):
    """Return ``point1 - point2`` as a two-element list ``[dx, dy]``.

    Both arguments only need ``.x`` and ``.y`` attributes.
    """
    dx = point1.x - point2.x
    dy = point1.y - point2.y
    return [dx, dy]

def angle_diff_points(point0, point1, point2):
    """Return the angle in radians, measured at ``point0``, between ``point1`` and ``point2``.

    Each point's elements ``[0]`` and ``[1]`` are projected with the Basemap
    instance ``m`` first, so the angle is measured in map coordinates.

    Returns:
        A value in ``[0, pi]``.
        NOTE(review): when ``point1`` or ``point2`` projects onto ``point0`` the
        division below is by zero; with numpy scalars that presumably yields NaN
        (plus a RuntimeWarning) rather than an exception -- confirm.
    """
    map_point0 = Point(m(point0[0], point0[1]))
    map_point1 = Point(m(point1[0], point1[1]))
    map_point2 = Point(m(point2[0], point2[1]))

    vec1 = vector_diff(map_point1, map_point0)
    vec2 = vector_diff(map_point2, map_point0)
    cos_angle = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    # Rounding can push the cosine of (anti-)parallel vectors marginally outside
    # [-1, 1], which makes math.acos raise "math domain error"; clamp it first.
    return math.acos(np.clip(cos_angle, -1.0, 1.0))

def angle_diff_max(route):
    """Return the widest angle, seen from ``route[0]``, between any two later points.

    Every pair ``route[i]``, ``route[j]`` with ``1 <= i < j`` is passed to
    ``angle_diff_points``; the largest result is returned (radians). Routes with
    fewer than three points return 0. A result that does not compare greater
    than the current maximum (NaN included) is ignored.
    """
    widest = 0
    for i in range(1, len(route) - 1):
        for j in range(i + 1, len(route)):
            angle = angle_diff_points(route[0], route[i], route[j])
            if angle > widest:
                widest = angle

    return widest

In [None]:
def cumul_point(route):
    """Return the mean ``[x, y]`` of all points of ``route`` except the first one.

    Only elements ``[0]`` and ``[1]`` of each point are used. A one-point route
    raises ``ZeroDivisionError`` because there is nothing to average.
    """
    total_x, total_y = 0, 0
    for point in route[1:]:
        total_x, total_y = total_x + point[0], total_y + point[1]

    tail_len = len(route) - 1
    return [total_x / tail_len, total_y / tail_len]

### Filtering on max angle between points of the route start

In [None]:
# Keep only the routes whose widest angle (as returned by angle_diff_max, in
# radians) stays below 1.2; the route count is printed before and after.
print(len(routes_coord_cur))
routes_coord_cur = [route for route in routes_coord_cur if angle_diff_max(route) < 1.2]
print(len(routes_coord_cur))

### Route validity checking

In [None]:
# A valid route has exactly one stadium point and that point is the first one.
# For every route that breaks this rule, print the index of its last stadium
# point (-1 when it has none). The threshold is the one shared with
# filter_closest, so this check agrees with the rest of the pipeline.
for route in routes_coord_cur:
    stadium_idxs = [
        idx for idx, point in enumerate(route)
        if dist_to_stadium(point) < threshold_route_closest]
    stad_points_count = len(stadium_idxs)
    stad_point_idx = stadium_idxs[-1] if stadium_idxs else -1
    if stad_points_count != 1 or stad_point_idx != 0:
        print(stad_point_idx)

### Backup utility cells

In [None]:
# Snapshot the working set. These are shallow copies: the outer lists are new,
# the routes inside them are shared with the working set.
routes_coord_cur_back = list(routes_coord_cur)
routes_user_id_cur_back = list(routes_user_id_cur)

In [None]:
# Restore the working set from the snapshot ...
routes_coord_cur, routes_user_id_cur = routes_coord_cur_back, routes_user_id_cur_back

# ... and immediately take fresh shallow copies, so that the snapshot and the
# working set are separate outer lists again.
routes_coord_cur_back, routes_user_id_cur_back = list(routes_coord_cur), list(routes_user_id_cur)

### Saving final filtered/split routes

In [None]:
# The same route list is written twice: once under a name describing the filter
# settings and once under a fixed name.
for out_path in ('data_routes_pickle/routes_coord_any_3_contains_nocycles_angle_1.2_1k',
                 'data_routes_pickle/routes_coord_cur'):
    with open(out_path, 'wb') as f:
        pickle.dump(routes_coord_cur, f)

In [None]:
# Display the number of routes currently in the working set.
len(routes_coord_cur)

# Plotting current route data

In [None]:
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# All stations as small dots underneath the routes.
station_geoms = list(station_points)
m.scatter([geom.x for geom in station_geoms], [geom.y for geom in station_geoms],
          5, marker='.', lw=.25, facecolor='#33ccff', edgecolor='w', alpha=0.9, antialiased=True, zorder=3)

# One named matplotlib color per route, cycling when the names run out.
colors = list(mpl_colors.cnames.keys())

# Only the first 10 routes of the working set are drawn.
for j, route_coords in enumerate(routes_coord_cur[:10]):
    # Draw the route hop by hop in map coordinates.
    for point1, point2 in zip(route_coords, route_coords[1:]):
        x1, y1 = m(point1[0], point1[1])
        x2, y2 = m(point2[0], point2[1])
        plt.plot([x1, x2], [y1, y2], linestyle='-', color=colors[j % len(colors)], zorder=4, alpha=1)

    # Blue line from the route's first point to the mean of its remaining points.
    cumul_p = cumul_point(route_coords)
    x_start, y_start = m(route_coords[0][0], route_coords[0][1])
    x_mean, y_mean = m(cumul_p[0], cumul_p[1])
    plt.plot([x_start, x_mean],
             [y_start, y_mean], linestyle='-', color='b', zorder=4, alpha=1)

print('Plotting done')

# Uncomment to add the city polygon patches from df_map to the axes.
# ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True))

fig.set_size_inches(img_width_inches, img_height_inches)

# `alpha` is not a savefig parameter, so it is no longer passed; the figure's own
# face color is given explicitly so the saved image keeps it.
plt.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, facecolor=fig.get_facecolor())

# Data manipulation

### Station popularity computing & plotting

In [None]:
# Count how many times each coordinate appears across all loaded routes
# (the unfiltered `routes_coord`, not the working set).
station_rating_map = {}
for route in routes_coord:
    for point in route:
        coord_key = tuple(point)
        station_rating_map[coord_key] = station_rating_map.get(coord_key, 0) + 1

# Coordinates ordered from most to least visited.
most_visited_station_idxs = sorted(
    station_rating_map, key=station_rating_map.get, reverse=True)

In [None]:
fig = plt.figure(facecolor='black')
ax = fig.add_subplot(111, frame_on=False)

# Marker size per station: a tenth of its visit count, 0 for stations that never
# appear in a route. station_dict is iterated in the same order that was used to
# build station_points, so ratings[i] belongs to the i-th plotted point.
ratings = []
for key in station_dict:
    cur_station_coords = station_dict[key][0], station_dict[key][1]
    ratings.append(station_rating_map.get(cur_station_coords, 0) / 10)

station_geoms = list(station_points)
m.scatter([geom.x for geom in station_geoms], [geom.y for geom in station_geoms],
          ratings, marker='.', lw=.25, facecolor='#33ccff', edgecolor='w', alpha=0.9, antialiased=True, zorder=3)

print('Plotting done')

# Uncomment to add the city polygon patches from df_map to the axes.
# ax.add_collection(PatchCollection(df_map['patches'].values, match_original=True))

fig.set_size_inches(img_width_inches, img_height_inches)

# `alpha` is not a savefig parameter, so it is no longer passed; the figure's own
# face color is given explicitly so the saved image keeps it.
plt.savefig(
    "data_maps_output/routes {}.png".format(int(time.time())),
    dpi=200, facecolor=fig.get_facecolor())