In [1]:
import pandas as pd
import plotly.express as px
import json
import csv
import numpy
import geopy.distance
from sklearn.cluster import DBSCAN

In [2]:
dataset = pd.read_csv("taxi_partial.csv")

In [3]:
len(dataset)

320

In [4]:
# Remove corrupted data

# Remove tracks marked as having missing data
dataset_without_missing_data = dataset[dataset.MISSING_DATA == False]
print(len(dataset_without_missing_data))

# Remove tracks with an empty polyline
dataset_filtered = dataset[dataset.POLYLINE != "[]"]
print(len(dataset_filtered))

# Remove tracks consisting of less than 4 coordinate points
mask = numpy.core.defchararray.count(dataset_filtered.POLYLINE.values.astype(str), '[') >= 4
dataset_filtered = pd.DataFrame(dataset_filtered.values[mask], dataset_filtered.index[mask], dataset_filtered.columns)
print(len(dataset_filtered))


320
320
312


In [5]:
# Remove unnecessary columns and create list of coordinates rather than list of polylines. 

def create_list_of_coordinates (dataset):
    polylines = pd.DataFrame()
    lon, lat, id = [], [], []

    for i in range(len(dataset)):
        try:
            lst = json.loads(dataset["POLYLINE"].iloc[i])
            longi, latte = zip(*lst)
            lon = lon+ list(longi)
            lat=lat+list(latte)
            id = id+[dataset["TRIP_ID"].iloc[i] for j in range(len(lst))]
        except:
            print(dataset["TRIP_ID"].iloc[i])

    polylines["lat"]=lat
    polylines["lon"]=lon
    polylines["id"]=id
    return polylines

polylines = create_list_of_coordinates(dataset_filtered)
len(polylines)

14424

In [7]:
# Save list of polylines without the faulty rows. 
pd.DataFrame.to_csv(polylines, "./partial_preprocessed.csv", index=False)


In [6]:
# Fill in waypoints in the tracks to make alignments more accurate
# This functions fills in waypoints between two points


def fill_waypoints(point1, point2):
    vector = [point1[0] - point2[0], point1[1] - point2[1]]
    
    dist = geopy.distance.distance((point1[0], point1[1]), (point2[0], point2[1])).m
    n = int(dist / 20)
    coords = []
    coords.append([point1[0], point1[1]])
    for i in range(n):
        coords.append([coords[i][0] - (vector[0] / n), coords[i][1] - (vector[1] / n)])
    return coords

In [7]:
# Adds wayspoints between every point on a track

def add_points(track):
    full_track = []
    for line in range(len(track) - 1):
        coords = fill_waypoints(track[line], track[line + 1])
        full_track.extend(coords)
    return full_track


In [8]:
# Adds points for every track in a dataset

ids = dataset_filtered["TRIP_ID"].values.tolist()


def add_points_for_every_track(dataset):
    ids = dataset["TRIP_ID"].values.tolist()
    extended_dataset = pd.DataFrame(columns=["TRIP_ID", "POLYLINE"])
    for id in range(len(dataset)):
        coords = add_points(dataset["POLYLINE"].iloc[id])
        extended_dataset.loc[id] = ids[id], str(coords)
    return extended_dataset

In [13]:
extended_polyline = create_list_of_coordinates(extended_dataset)


In [92]:
pd.DataFrame.to_csv(extended_polyline, "./partial_added_points.csv", index=False)


In [9]:
# DBSCAN to filter out coordinates that are not close to other points.


def dbscan(polylines):
    dbscan = DBSCAN(eps=0.0005, min_samples=10)
    dbscan.fit(polylines[["lat", "lon"]])
    polylines["DBSCAN_labels"] = dbscan.labels_
    return polylines

In [10]:
# Divide the tracks into subtracks where coordintes have been marked as noise by DBSCAN


def divide_into_subtracks(polylines):
    new_ids = []
    j = 0
    for i in range(len(dataset_filtered.index)):
        id = dataset_filtered.iloc[i].TRIP_ID

        while j < len(polylines.index):
            if polylines.iloc[j].DBSCAN_labels == -1:
                new_ids = new_ids + [-1]
                j += 1
                continue
            elif (
                polylines.iloc[j].DBSCAN_labels > -1
                and polylines.iloc[j - 1].DBSCAN_labels == -1
                and polylines.iloc[j].id == polylines.iloc[j - 1].id
            ):
                new_ids = new_ids + [
                    str(polylines.iloc[j].id)
                    + str(polylines.iloc[j].DBSCAN_labels)
                    + str(j)
                ]
            else:
                if j == 0:
                    new_ids = new_ids + ["T1"]
                elif polylines.iloc[j].id != polylines.iloc[j - 1].id:
                    new_ids = new_ids + [polylines.iloc[j].id]
                else:
                    new_ids = new_ids + [new_ids[-1]]
            j += 1

    polylines["new_ids"] = new_ids
    return polylines

In [11]:
# Remove all noise coordinates


def create_subtracks(polylines):
    poly_with_dbscan = dbscan(polylines)
    subtracks = divide_into_subtracks(polylines)
    polylines_subtracks = subtracks[subtracks.DBSCAN_labels > -1]
    return polylines_subtracks

In [12]:
# Remove points outside of city centre


def remove_points_outside_city_centre(polylines):
    remove_lat1 = polylines[polylines.lat > 41.136271]
    remove_lat2 = remove_lat1[remove_lat1.lat < 41.176857]
    remove_lon1 = remove_lat2[remove_lat2.lon < -8.575663]
    polylines_subtracks_filtered = remove_lon1[remove_lon1.lon > -8.651527]
    return polylines_subtracks_filtered

In [71]:
fig = px.line_mapbox(polylines_subtracks_filtered, lat="lat", lon="lon", color="new_ids", zoom=3, height=700)

fig.update_layout(mapbox_style="stamen-terrain", mapbox_zoom=12, mapbox_center_lat = 41.14,
    margin={"r":0,"t":0,"l":0,"b":0})

fig.show()

In [58]:
# Save list of subtrack ids

ids = polylines_subtracks["new_ids"]
ids_subtracks = ids.drop_duplicates()
pd.DataFrame.to_csv(ids_subtracks, "./partial_subtracks_ids.csv", index=False)

In [37]:
# Save the list of subtracks to CSV file.

pd.DataFrame.to_csv(
    polylines_subtracks,
    "./partial_subtracks_with_extra_points_before_dividing.csv",
    index=False,
)

In [72]:
# Save list of subtracks where points outside of the city centre is removed

pd.DataFrame.to_csv(
    polylines_subtracks_filtered,
    "./partial_subtracks_with_extra_points_before_dividing_only_city_centre.csv",
    index=False,
)

In [52]:
# Create and save list of all original track-ids

ids = dataset_filtered["TRIP_ID"]
pd.DataFrame.to_csv(ids, "./partial_ids.csv", index=False)

In [19]:
# Create dataset for Needleman-Wunsch where subtracks are created before extra points are added

city_centre = remove_points_outside_city_centre(polylines)
subtracks = create_subtracks(city_centre)
ids = subtracks["new_ids"]
ids = ids.drop_duplicates()
subtracks_polyline = pd.DataFrame(columns=["TRIP_ID", "POLYLINE"])
for i in range(len(ids)):
    track = subtracks[subtracks.new_ids == ids.iloc[i]]
    polyline = list(
        map(
            lambda x: list(x),
            zip(track["lon"].values.tolist(), track["lat"].values.tolist()),
        )
    )

    subtracks_polyline.loc[i] = [ids.iloc[i], polyline]

subtracks_added_points = add_points_for_every_track(subtracks_polyline)
# Remove tracks with an empty polyline
subtracks_filtered = subtracks_added_points[subtracks_added_points.POLYLINE != "[]"]

# Remove tracks consisting of less than 4 coordinate points
mask = (
    numpy.core.defchararray.count(subtracks_filtered.POLYLINE.values.astype(str), "[")
    >= 4
)
subtracks_filtered = pd.DataFrame(
    subtracks_filtered.values[mask],
    subtracks_filtered.index[mask],
    subtracks_filtered.columns,
)

subtracks_coords = create_list_of_coordinates(subtracks_filtered)

print(subtracks_coords)

             lat       lon          id
0      41.148522 -8.585676          T1
1      41.148639 -8.585712          T1
2      41.148855 -8.585685          T1
3      41.148855 -8.585685          T1
4      41.148927 -8.585730          T1
...          ...       ...         ...
23491  41.152545 -8.629740  T324238722
23492  41.152617 -8.629776  T324238722
23493  41.152635 -8.629803  T324238722
23494  41.152608 -8.629776  T324238722
23495  41.152617 -8.629767  T324238722

[23496 rows x 3 columns]


In [20]:
pd.DataFrame.to_csv(
    subtracks_coords,
    "./partial_subtracks_with_extra_points_after_dividing_only_city_centre.csv",
    index=False,
)

In [21]:
ids = subtracks_filtered["TRIP_ID"]
ids_subtracks = ids.drop_duplicates()
pd.DataFrame.to_csv(
    ids_subtracks, "./partial_subtracks_ids_add_points_after_dividing.csv", index=False
)