In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import folium
import folium.plugins as plugins
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import geopy.distance

In [2]:
# Show more rows when printing lists
pd.options.display.max_rows = 50

## Methods

### Load file

In [3]:
#get DateTime on the correct format (in accordance with excel file)
def load_file(file_name):
    df = pd.read_csv(file_name)
    
    # change date format
    df['DateTime'] = pd.TimedeltaIndex(df['DateTime'], unit='d') + dt.datetime(1899,12,30)
    
    # reformat coordinates to dot-separated floats
    df['Long'] = df['Long'].apply(lambda x: float(str(x).replace(',','.')))
    df['Lat'] = df['Lat'].apply(lambda x: float(str(x).replace(',','.')))
    
    # remove outliers
    #df = df[(df['Long'] <= 11) & (df['Long'] >= 10.62)]
    # only keep trip if it has a start and an end
    df = df[df.duplicated('Trip ID', keep=False)]
    return df

### Remove incomplete trips

In [4]:
def remove_incomplete_trips(df):
    df = df[df.duplicated('Trip ID', keep=False)]
    return df

### Get start and end data

In [5]:
def get_start_data(df):
    return df.loc[df['TripStage'] == 'Start']

def get_end_data(df):
    return df.loc[df['TripStage'] == 'End']

### Scatter plot

In [37]:
def plot_scatter(df, n, m):
    
    # mark each station as a point
    for index, row in df.sample(n=n).iterrows():
        folium.CircleMarker([row['Lat'], row['Long']],
                            radius=3,
                            popup=str(row),
                            fill = True,
                            color= "#3db7e4" if row['TripStage'] == "Start" else "#e43d43", # divvy color
                           ).add_to(m)
    return m

### Grid creator (returns polygons as geojson objects in a list)

In [13]:
#https://www.jpytr.com/post/analysinggeographicdatawithfolium/
def get_geojson_grid(upper_right, lower_left, lat_dim=8, lon_dim=15):
    """Returns a grid of geojson rectangles, and computes the exposure in each section of the grid based on the vessel data.

    Parameters
    ----------
    upper_right: array_like
        The upper right hand corner of "grid of grids" (the default is the upper right hand [lat, lon] of the USA).

    lower_left: array_like
        The lower left hand corner of "grid of grids"  (the default is the lower left hand [lat, lon] of the USA).

    n: integer
        The number of rows/columns in the (n,n) grid.

    Returns
    -------

    list
        List of "geojson style" dictionary objects   
    """

    all_boxes = []

    lat_steps = np.linspace(lower_left[0], upper_right[0], lat_dim+1)
    lon_steps = np.linspace(lower_left[1], upper_right[1], lon_dim+1)

    lat_stride = lat_steps[1] - lat_steps[0]
    lon_stride = lon_steps[1] - lon_steps[0]
    
    zone_counter = 1

    for lat in lat_steps[:-1]:
        for lon in lon_steps[:-1]:
           
            
            # Define dimensions of box in grid
            upper_left = [lon, lat + lat_stride]
            upper_right = [lon + lon_stride, lat + lat_stride]
            lower_right = [lon + lon_stride, lat]
            lower_left = [lon, lat]
                

            # Define json coordinates for polygon
            coordinates = [
                upper_left,
                upper_right,
                lower_right,
                lower_left,
                upper_left
            ]

            geo_json = {"type": "FeatureCollection",
                        "properties":{
                            "lower_left": lower_left,
                            "upper_right": upper_right,
                        },
                        "features":[],
                        "Number": zone_counter}


            grid_feature = {
                "type":"Feature",
                "geometry":{
                    "type":"Polygon",
                    "coordinates": [coordinates],
                }
            }

            geo_json["features"].append(grid_feature)

            all_boxes.append(geo_json)
            
            zone_counter+=1

    return all_boxes

### Plot n samples on the predefined grid

In [52]:
def plot_samples_on_grid(dataframe, n, excluded_zones):
    
    lower_left = [59.855331, 10.601628]
    upper_right = [59.973287, 10.950989]
    
    
    center = [lower_left[0]+(upper_right[0]-lower_left[0])/2,lower_left[1]+(upper_right[1]-lower_left[1])/2]

    #grid_size = int(np.sqrt(len(world.pNodes)))
    grid_size = (26,40)
    
    map = folium.Map(center, zoom_start = 10.8)
    

    grid = get_geojson_grid(lower_left, upper_right , lat_dim=grid_size[0], lon_dim=grid_size[1])
    
    count = 1
    for i, geo_json in enumerate(grid):
        if (str(len(grid)-i) not in excluded_zones):
            gj = folium.GeoJson(geo_json,
                    style_function=lambda feature: {#'fillColor': color,
                                                    'color':'#df80e8',
                                                    'weight': 2,
                                                    'dashArray': '5, 5',
                                                    'fillOpacity': 0.1}).add_to(map)
            popup_string = "Node "+str(count)
            count += 1
            gj.add_child(folium.Popup(popup_string, max_width=400))

    map = plot_scatter(dataframe, n, map)      
            
    display(map)

## Assign zone to each row, remove trips outside zones

In [9]:
file_name = "./data/2019_vy.csv"

#Size of zone area
lower_left = [59.855331, 10.601628]
upper_right = [59.973287, 10.950989]
center = [lower_left[0]+(upper_right[0]-lower_left[0])/2,lower_left[1]+(upper_right[1]-lower_left[1])/2]

data = load_file(file_name)
data_copy = data.copy()

#Start data and end data
start_data = get_start_data(data_copy)
end_data = get_end_data(data_copy)

excluded_zones = []

#List of polygons in grid
zones = get_geojson_grid(upper_right, lower_left, lat_dim=26, lon_dim=40)

# For each zone, mark all trips in that zone with the zone number
for zone in zones:
    upper_left = zone['features'][0]['geometry']['coordinates'][0][0]    
    upper_right = zone['features'][0]['geometry']['coordinates'][0][1]    
    lower_right = zone['features'][0]['geometry']['coordinates'][0][2]    
    lower_left = zone['features'][0]['geometry']['coordinates'][0][3]
    
    
    data_copy.loc[
        (data_copy.Long < float(upper_right[0]))
        & (data_copy.Long > float(upper_left[0]))
        & (data_copy.Lat > float(lower_left[1]))
        & (data_copy.Lat < float(upper_left[1])),'Zone'] = str(zone['Number']).strip()

    
    # Calculate new field "No. trips zone", which indicates how many starts and ends that have happened in that zone 
    data_copy.loc[
        data_copy.Zone == str(zone['Number']).strip(), 'No. trips zone'] = len(data_copy.loc[data_copy.Zone == str(zone['Number']).strip()])
    
    # add unused zones to excluded zones
    if (len(data_copy.loc[data_copy.Zone == str(zone['Number']).strip()]) <= 150):
        excluded_zones.append(str(zone['Number']).strip())
        
        
# remove starts and ends that happened outside of the defined zones       
data_copy = data_copy[~data_copy['Zone'].isin(excluded_zones)]
        
# remove incomplete trips, so that the the system becomes "closed", meaning that all starts and ends happen within the defined area
data_copy = remove_incomplete_trips(data_copy)
        

In [15]:
data_copy.head(500)

Unnamed: 0,DateTime,Trip ID,TripStage,Long,Lat,Car ID,Trip duration (min),Zone,No. trips zone
0,2018-12-31 11:39:21.888,1,Start,10.73382,59.92934,V5a15A3226E5F7,1403.350000,656,2482.0
1,2019-01-01 11:02:43.008,1,End,10.74900,59.91063,V5a15A3226E5F7,1403.350000,497,6666.0
2,2019-01-01 08:18:08.352,2,Start,10.73561,59.93056,V5a15A3226E89D,24.716667,656,2482.0
3,2019-01-01 08:42:50.976,2,End,10.80917,59.94132,V5a15A3226E89D,24.716667,744,999.0
4,2019-01-01 14:34:39.360,3,Start,10.72983,59.92513,V5a15A3226E999,1067.866667,615,3072.0
...,...,...,...,...,...,...,...,...,...
505,2019-01-06 08:56:43.872,253,End,10.75123,59.92292,V5a15A3226DFD7,9.066667,578,6889.0
506,2019-01-06 08:51:35.424,254,Start,10.75138,59.92283,V5a15A3226DF81,244.166667,578,6889.0
507,2019-01-06 12:55:45.408,254,End,10.71548,59.91042,V5a15A3226DF81,244.166667,494,5977.0
508,2019-01-06 08:54:24.768,255,Start,10.73688,59.92248,V5a15A3226E58B,43.083333,576,4316.0


In [16]:
print("Nr of excluded zones: " + str(len(excluded_zones)))
print("Nr of included zones: " + str(str(1040-len(excluded_zones))))

Nr of excluded zones: 786
Nr of included zones: 254


In [28]:
#nr of cars
print("Nr of cars in the system: " + str(data_copy['Car ID'].nunique()))

#Number of trips
count_new = len(data_copy.index)/2
count_old = len(data.index)/2
print("Number of trips in the system: " + str(count_new))
print("Number of trips in the original data: " + str(count_old))

Nr of cars in the system: 255
Number of trips in the system: 146198.0
Number of trips in the original data: 149405.0


## Plot scatter plot

In [28]:
#plot_heat_map(start_data)
plot_scatter(data_copy, 1000, center)

## Plot n samples on grid 

In [53]:
plot_samples_on_grid(data_copy,400, excluded_zones)