<a href="https://colab.research.google.com/github/jfkback/scooter-flow-prediction-clustering/blob/master/basic_flow_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import folium
import colorsys
import math

import matplotlib as plt
import seaborn as sns
import pandas as pd
import numpy as np
import branca.colormap as cm

from google.colab import files
from sklearn.cluster import DBSCAN
from google.colab import drive
from folium import plugins
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime, TimestampedGeoJson
from folium.features import ColorLine
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

In [0]:
# Mount Google Drive under /content/gdrive so the CSV paths used below resolve.
drive.mount('/content/gdrive', force_remount=True)

In [0]:
# Trip table read from the DC-6 trip-vector CSV on the mounted Drive.
df = pd.read_csv('/content/gdrive/My Drive/processed_scooter_data/DC-6/DC-6_14-trip-vector.csv')

In [0]:
# Metro station entrance table read from the mounted Drive.
df_metro = pd.read_csv('/content/gdrive/My Drive/processed_scooter_data/Metro_Station_Entrances_in_DC.csv')

In [0]:
# Metro bus stop table read from the mounted Drive.
df_bus = pd.read_csv('/content/gdrive/My Drive/processed_scooter_data/Metro_Bus_Stops.csv')

In [0]:
# NOTE: this is an alias, not a copy -- df2 and df are the same DataFrame
# object, so any in-place change made through one name is visible through the
# other.
df2 = df

In [0]:
def _flipped_min_max(values):
  """Scale values into [0, 1] with the maximum mapped to 0 and the minimum to 1.

  A constant column has no spread to scale by; it is mapped to all zeros
  instead of dividing by zero and producing NaN.
  """
  values = np.asarray(values, dtype=float)
  high = values.max()
  span = high - values.min()
  if span == 0:
    return np.zeros_like(values)
  return (high - values) / span


def get_X(df):
  """Build the per-trip feature rows used for clustering.

  Args:
    df: DataFrame with the columns 'start_lat', 'start_lon', 'end_lat',
      'end_lon', 'vector_x' and 'vector_y'.

  Returns:
    A list with one 7-tuple per row of df:
    (vector_x, vector_y, start_lat, start_lon, dot, end_lat, end_lon).
    The four coordinates are min-max scaled over df (flipped, so the largest
    value becomes 0). vector_x / vector_y are passed through unscaled. dot is
    the cosine between the trip vector and the +x axis, remapped from [-1, 1]
    to [0, 1].

  Raises:
    ValueError: if df has no rows (max/min of an empty column).
  """
  start_lat = _flipped_min_max(df['start_lat'].values)
  start_lon = _flipped_min_max(df['start_lon'].values)
  end_lat = _flipped_min_max(df['end_lat'].values)
  end_lon = _flipped_min_max(df['end_lon'].values)

  vector_x = df['vector_x'].values
  vector_y = df['vector_y'].values

  # Cosine to the +x axis is just x / |v|. A zero-length vector has no
  # direction, so its cosine is left at 0 (dot == 0.5, the neutral midpoint)
  # rather than becoming NaN. NaN inputs still propagate because NaN != 0.
  norm = np.hypot(vector_x, vector_y)
  cos_x = np.divide(vector_x, norm, out=np.zeros_like(norm), where=norm != 0)
  dot = (1 + cos_x) / 2

  return list(zip(vector_x.tolist(), vector_y.tolist(),
                  start_lat.tolist(), start_lon.tolist(),
                  dot.tolist(), end_lat.tolist(), end_lon.tolist()))

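Runs trajectory clustering: DBSCAN is applied repeatedly to the trips that are still unclustered, with a larger `eps` on each pass, until at least 60% of all trips belong to a cluster.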

In [0]:
# Iterative DBSCAN: each pass clusters the trips that are still unassigned,
# using a slightly larger eps than the previous pass, until at least 60% of
# the trips in df2 carry a cluster label.
#
# Results:
#   df_clustered   -- every trip that received a label, plus 'cluster_batch'
#                     (the pass in which it was clustered). 'labels' is only
#                     unique within one cluster_batch.
#   df_unclustered -- trips still marked as noise (-1) after the last pass.
min_samples = 3
clustered_batches = []
num_clustered = 0
# Work on a copy so the 'labels' column is not written into df2 itself.
df_unclustered = df2.copy()
num_batch = 0

# Fewer than min_samples remaining trips can never form a cluster, so stop
# instead of looping forever. The length check comes first so an empty df2
# never reaches the division.
while len(df_unclustered) >= min_samples and num_clustered / len(df2) < .6:
  X = get_X(df_unclustered)
  eps = .005 + (num_batch) * .002
  clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
  labels = clustering.labels_
  print(len(df_unclustered), len(X))
  df_unclustered['labels'] = labels
  is_noise = df_unclustered['labels'] == -1
  # Explicit copies keep the column assignments below off filtered views.
  df_labeled = df_unclustered[~is_noise].copy()
  df_labeled['cluster_batch'] = num_batch
  clustered_batches.append(df_labeled)
  num_clustered += len(df_labeled)
  df_unclustered = df_unclustered[is_noise].copy()
  num_batch += 1

# DataFrame.append was removed in pandas 2.0; concatenate all batches once.
df_clustered = pd.concat(clustered_batches) if clustered_batches else pd.DataFrame()

In [0]:
# Continuous violet-to-red colour scale defined over the range [0, 1].
linear_rainbow = cm.LinearColormap(
    ['violet', 'indigo', 'blue', 'green', 'yellow', 'orange', 'red'],
    vmin=0,
    vmax=1,
)

# The same scale split into 50 discrete colour steps.
step_rainbow = linear_rainbow.to_step(n=50)

In [0]:
#@title Mapper { display-mode: "form" }
class Mapper:
  """Thin convenience wrapper around a single folium.Map.

  Every add_* method draws onto the wrapped map (self.map). Colour-by-value
  is supported through a `cmap` callable plus a `hue` sequence: each hue value
  is scaled to [0, 1] over the whole sequence (largest value -> 0, smallest
  -> 1) and passed to cmap to obtain the colour.
  """

  def __init__(self, lat=38.9072, lon=-77.0369, tile_type='cartodbdark_matter', zoom=13):
    """Create the folium map centred on (lat, lon) with the given tiles and zoom."""
    self.map = folium.Map(
      location=[lat, lon],
      tiles=tile_type,
      zoom_start=zoom
    )

  def get_map(self):
    """Return the wrapped folium map."""
    return self.map

  def set_map(self, new_map):
    """Replace the wrapped folium map with new_map."""
    self.map = new_map

  def add_heatmap_time(self, data):
    """Add a HeatMapWithTime layer built from data."""
    HeatMapWithTime(
      data=data,
    ).add_to(self.map)

  def add_point(self, coord, color='white', radius=10, opacity=1):
    """Draw one filled circle at (coord[0], coord[1]).

    opacity is applied to both the outline and the fill.
    """
    folium.Circle(
      location=(coord[0], coord[1]),
      color=color,
      radius=radius,
      fill=True,
      opacity=opacity,
      fill_opacity=opacity,
    ).add_to(self.map)

  @staticmethod
  def _bounds(values):
    """Return (min, max) of values, or None when values is None or empty."""
    if values is None or len(values) == 0:
      return None
    return min(values), max(values)

  @staticmethod
  def _scale_inverted(value, low, high):
    """Map value from [low, high] onto [1, 0]; a zero-width range maps to 1."""
    if high == low:
      return 1
    return (high - value) / (high - low)

  def _normalize_hue(self, hue, curr):
    """Scale curr against the range of hue: max(hue) -> 0, min(hue) -> 1.

    Returns 1 when every value in hue is the same.
    """
    return self._scale_inverted(curr, min(hue), max(hue))

  def add_points(self, coords, color='white', cmap=None, hue=None,
                 radius_list=None, radius=10, opacity=1):
    """Draw one circle per entry of coords.

    Args:
      coords: iterable of (lat, lon)-style pairs, forwarded to add_point.
      color: colour used for every point unless cmap and hue are both given.
      cmap: callable mapping a number in [0, 1] to a colour.
      hue: sequence parallel to coords; scaled and passed to cmap.
      radius_list: sequence parallel to coords. When given, each value is
        min-max scaled over the list to [0, 1] and multiplied by 120, and
        `radius` is ignored. If all values are equal the scaled value is 1.
      radius: fixed radius used when radius_list is None.
      opacity: outline and fill opacity of every circle.
    """
    use_cmap = cmap is not None and hue is not None
    # The bounds do not change per point, so compute them once up front
    # instead of rescanning hue / radius_list for every coordinate.
    hue_bounds = self._bounds(hue) if use_cmap else None
    radius_bounds = self._bounds(radius_list)

    for i, coord in enumerate(coords):
      if use_cmap:
        color = cmap(self._scale_inverted(hue[i], *hue_bounds))
      if radius_list is not None:
        radius_low, radius_high = radius_bounds
        if radius_high == radius_low:
          # No spread to scale by; avoid the 0/0 division.
          scaled = 1
        else:
          scaled = (radius_list[i] - radius_low) / (radius_high - radius_low)
        radius = scaled * 120
      self.add_point(coord, color, radius, opacity)

  def save_map(self, file_name='map'):
    """Save the map as '<file_name>.html' and trigger a Colab download of it."""
    self.map.save(file_name + '.html')
    files.download(file_name + '.html')

  def add_geojson(self, features):
    """Add a GeoJson layer from a list of GeoJSON feature dicts."""
    folium.GeoJson({
      'type': 'FeatureCollection',
      'features': features
    }).add_to(self.map)

  def add_geojson_no_feature(self, json):
    """Add a GeoJson layer from an already complete GeoJSON object."""
    folium.GeoJson(
        data=json
    ).add_to(self.map)

  def add_line(self, points, color='white', opacity=1, weight=2, tooltip=None):
    """Draw one polyline through points."""
    folium.PolyLine(
      points,
      color=color,
      weight=weight,
      opacity=opacity,
      tooltip=tooltip,
    ).add_to(self.map)

  def add_lines(self, list_points, color='white', opacity=1, weight=2, tooltip=""):
    """Draw one polyline per entry of list_points, all with the same style.

    The default tooltip here is an empty string, whereas add_line defaults
    to None.
    """
    for points in list_points:
      self.add_line(list(points), color, opacity, weight, tooltip)

  def _make_geojson_polygon(self, coordinates, color='white', key=None):
    """Return a GeoJSON Polygon feature with a single ring of coordinates.

    color is stored under properties.style.color and key under
    properties.key.
    """
    return {
      'type': 'Feature',
      'properties': {
          'style': {
              'color': color
          },
          'key': key
      },
      'geometry': {
        'type': 'Polygon',
        'coordinates': [
          coordinates
        ]
      }
    }

  def _make_geojson_polygons(self, list_of_coordinates, color='white', keys=None):
    """Return a FeatureCollection with one polygon per coordinate ring.

    keys, when given and non-empty, is parallel to list_of_coordinates;
    otherwise every feature gets the key ''.
    """
    features = [
      self._make_geojson_polygon(coordinates, color, keys[x] if keys else '')
      for x, coordinates in enumerate(list_of_coordinates)
    ]
    return {
      'type': 'FeatureCollection',
      'features': features
    }

  def add_polygon(self, coordinates, color='white'):
    """Draw a single polygon."""
    self.add_geojson([self._make_geojson_polygon(coordinates, color)])

  def add_polygons(self, list_of_coordinates, color='white'):
    """Draw one polygon per entry of list_of_coordinates."""
    geojson = self._make_geojson_polygons(list_of_coordinates, color)
    self.add_geojson_no_feature(geojson)

  def add_flow_lines(self, list_points, list_weights=None, color='white',
                     opacity=.4, hue=None, cmap=None):
    """Draw one polyline per entry of list_points, styled per line.

    Args:
      list_points: sequence of point lists, one per line.
      list_weights: sequence parallel to list_points; the line weight is half
        the given value. Without it every line has weight 2.
      color: colour used unless cmap and hue are both given.
      opacity: opacity of every line.
      hue: sequence parallel to list_points; used for colouring via cmap and,
        when list_weights is also given, as the line's tooltip.
      cmap: callable mapping a number in [0, 1] to a colour.
    """
    use_cmap = cmap is not None and hue is not None
    # Computed once rather than once per line.
    hue_bounds = self._bounds(hue) if use_cmap else None

    for x, points in enumerate(list_points):
      weight = 2
      tooltip = None
      if use_cmap:
        color = cmap(self._scale_inverted(hue[x], *hue_bounds))
      if list_weights is not None:
        weight = list_weights[x] * .5
        # Weights may be supplied without hue; only then is there no tooltip.
        if hue is not None:
          tooltip = hue[x]
      self.add_line(points, color, weight=weight,
                    opacity=opacity, tooltip=tooltip)

  def add_choropath(self, geo_data, data, columns, key_on, color='OrRd',
    bins=6):
    """Add a folium Choropleth layer; color is passed as its fill_color."""
    folium.Choropleth(
      geo_data=geo_data,
      data=data,
      columns=columns,
      key_on=key_on,
      bins=bins,
      fill_color=color
    ).add_to(self.map)

  def add_choropath_coords(self, coords, data, columns, key_on,
    fill_color='OrRd', color='white', keys=None, bins=6):
    """Build polygons from coords (and optional keys) and add them as a choropleth."""
    geo_data = self._make_geojson_polygons(coords, color, keys)
    self.add_choropath(geo_data, data, columns, key_on, fill_color, bins)

In [0]:
# Keep only the bus stops whose X and Y fall strictly inside the bounding box
# below (X and Y are presumably longitude and latitude -- confirm against the
# bus stop CSV).
df_bus_small = df_bus.loc[
    (df_bus['X'] > -77.054437)
    & (df_bus['X'] < -76.759645)
    & (df_bus['Y'] > 38.762328)
    & (df_bus['Y'] < 39.186582)
]

10347

In [0]:
# Draw every trip of one cluster (first pass, label 3) as a red line, together
# with the start points of the five busiest aggregated flows.
# NOTE: df_temp is built by the aggregation cell further down in this
# notebook; that cell has to be run before this one.
temp = df_clustered[(df_clustered['cluster_batch'] == 0) & (df_clustered['labels'] == 3)]
s_points = list(zip(temp['start_lat'], temp['start_lon']))
e_points = list(zip(temp['end_lat'], temp['end_lon']))
c = list(zip(s_points, e_points))
df_temp_sorted = df_temp.sort_values('trip_amount', ascending=False)[:5]

m = Mapper(tile_type='cartodbpositron')
# One circle per row: add_point expects a single (lat, lon) pair, not a pair
# of whole columns.
m.add_points(list(zip(df_temp_sorted['start_lat'], df_temp_sorted['start_lon'])))
m.add_lines(c, color='red')
m.get_map()

In [0]:
# Drop the entrances whose NAME contains 'ELEV'. na=False keeps rows with a
# missing NAME instead of yielding NaN in the mask (which `~` cannot negate),
# and regex=False matches the text literally.
df_metro = df_metro[~df_metro['NAME'].str.contains('ELEV', na=False, regex=False)]

In [0]:
# Aggregate the clustered trips into one "flow" per (cluster_batch, labels,
# hour) group, keep the 200 busiest flows between hours 7 and 19, and draw
# them coloured by hour together with the metro entrances.
df3 = df_clustered[df_clustered['labels'] != -1]
df3 = df3.reset_index()
groupby = df3.groupby(['cluster_batch', 'labels', 'hour'])

df_temp = pd.DataFrame()
df_temp['start_lat'] = groupby['start_lat'].mean()
df_temp['start_lon'] = groupby['start_lon'].mean()
df_temp['end_lat'] = groupby['end_lat'].mean()
df_temp['end_lon'] = groupby['end_lon'].mean()
df_temp['start_lat_std'] = groupby['start_lat'].std()
df_temp['start_lon_std'] = groupby['start_lon'].std()
df_temp['end_std'] = (groupby['end_lat'].std() + groupby['end_lon'].std()) / 2
df_temp['trip_amount'] = groupby['bike_id'].count().astype(int)
# Read the real key values from the index. index.codes holds positions into
# the sorted level values, which only coincide with the values themselves
# when every integer from 0 upwards is present (e.g. no hour is missing).
df_temp['cluster_batch'] = df_temp.index.get_level_values('cluster_batch')
df_temp['hour'] = df_temp.index.get_level_values('hour')

df_temp = df_temp[(df_temp['hour'] > 6) & (df_temp['hour'] < 20)]
df_temp = df_temp.sort_values('trip_amount', ascending=False)
df_temp = df_temp[:200]

# Each flow is a (start, end) pair of mean coordinates.
coords = list(zip(list(zip(df_temp['start_lat'], df_temp['start_lon'])), list(zip(df_temp['end_lat'], df_temp['end_lon']))))
hue = df_temp['hour'].values.astype(int).tolist()
m = Mapper(tile_type='cartodbpositron')
# Line colour and tooltip come from the hour; line weight from the trip count.
m.add_flow_lines(coords, hue=hue, cmap=step_rainbow, 
                 list_weights=df_temp['trip_amount'].values.astype(int).tolist(),
                 opacity=.3)
m.add_points(list(zip(df_metro['Y'], df_metro['X'])), color='black', opacity=.5)
# End-point circles are sized by the spread (std) of the trip end locations.
m.add_points(list(zip(df_temp['end_lat'], df_temp['end_lon'])), 
             radius_list=df_temp['end_std'].values.tolist(), opacity=.5, 
             hue=hue, cmap=step_rainbow)
m.get_map()

In [0]:
# Write the current map to 'map.html' (the default file name) and download it.
m.save_map()

In [0]:
# Map every labelled trip from the first clustering pass, one line per trip,
# coloured by its cluster label.
df3 = df_clustered[
    (df_clustered['labels'] != -1) & (df_clustered['cluster_batch'] == 0)
].reset_index()

# Each trip becomes a ((start_lat, start_lon), (end_lat, end_lon)) pair.
coords = [
    ((s_lat, s_lon), (e_lat, e_lon))
    for s_lat, s_lon, e_lat, e_lon in zip(
        df3['start_lat'], df3['start_lon'], df3['end_lat'], df3['end_lon'])
]

hue = df3['labels'].tolist()
m = Mapper()
m.add_points(list(zip(df3['end_lat'], df3['end_lon'])), opacity=.5)
m.add_flow_lines(coords, hue=hue, cmap=step_rainbow, opacity=.5)
m.get_map()