In [1]:
import math
import os

import numpy as np
import pandas as pd

def haversine_rad(lat1, lng1, lat2, lng2):
    """Return the haversine (great-circle) distance between two points.

    Scalar counterpart of ``vector_haversine_rad``; uses the ``math`` module,
    so every argument must be a single number.

    Parameters
    ----------
    lat1, lng1 : float
        Latitude and longitude of the first point, in radians.
    lat2, lng2 : float
        Latitude and longitude of the second point, in radians.

    Returns
    -------
    float
        Distance along a sphere of radius 6378137.0 (the WGS-84 equatorial
        radius in metres), i.e. the result is in metres.
    """
    dlng = lng2 - lng1
    dlat = lat2 - lat1

    a = math.sin(dlat / 2.0) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlng / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    # The stdlib spells the inverse sine `asin`; `math.arcsin` does not exist.
    c = 2.0 * math.asin(math.sqrt(a))
    m = 6378137.0 * c
    return m

def vector_haversine_rad(lat1, lng1, lat2, lng2):
    """Element-wise haversine distance for arrays of coordinates in radians.

    Each argument may be a NumPy array (or a scalar); the computation uses
    NumPy ufuncs throughout, so inputs are combined element-wise.

    Parameters
    ----------
    lat1, lng1 : array_like
        Latitudes / longitudes of the starting points, in radians.
    lat2, lng2 : array_like
        Latitudes / longitudes of the end points, in radians.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Distances on a sphere of radius 6378137.0, one per input pair.
    """
    half_dlng = (lng2 - lng1) / 2.0
    half_dlat = (lat2 - lat1) / 2.0

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlng/2)
    lat_term = np.sin(half_dlat) ** 2
    lng_term = np.cos(lat1) * np.cos(lat2) * np.sin(half_dlng) ** 2
    central_angle = 2.0 * np.arcsin(np.sqrt(lat_term + lng_term))

    return 6378137.0 * central_angle

In [2]:
def _preprocess_taxi_file(src_path, dst_path, column_types):
    """Clean one raw taxi trace at ``src_path`` and write it to ``dst_path``."""
    df = pd.read_csv(src_path, dtype=column_types, names=['id', 'date', 'longitude', 'latitude'])
    df = df.drop_duplicates()
    # Order the fixes BEFORE measuring step distances: `dist` is the distance
    # from the previous row, which is only meaningful for the final row order.
    # NOTE(review): 'date' is compared as a string; this is chronological only
    # if the timestamp format sorts lexicographically -- confirm against the data.
    df = df.sort_values(by=['date'], kind='stable')

    rad_lat = np.radians(df['latitude'].to_numpy())
    rad_lng = np.radians(df['longitude'].to_numpy())

    # First row has no predecessor, so its distance stays 0. Filling a
    # pre-sized array (instead of np.insert) also keeps an empty frame valid.
    dist = np.zeros(len(df))
    dist[1:] = vector_haversine_rad(rad_lat[:-1], rad_lng[:-1], rad_lat[1:], rad_lng[1:])

    df['rad_lat'] = rad_lat
    df['rad_lng'] = rad_lng
    df['dist'] = dist
    df.to_csv(dst_path, index=False)


def preprocess_data(num_taxis=10357, input_dir="data", output_dir="csv"):
    """Convert raw per-taxi traces into date-ordered CSVs with step distances.

    For every taxi id in ``1..num_taxis`` this reads ``<input_dir>/<id>.txt``
    (header-less CSV with columns id, date, longitude, latitude), drops exact
    duplicate rows, sorts by ``date`` and writes
    ``<output_dir>/taxi_<id>.csv`` with three extra columns:

    * ``rad_lat`` / ``rad_lng`` -- latitude / longitude converted to radians
    * ``dist`` -- haversine distance from the previous row (0 for the first)

    Processing is best-effort: a taxi whose file cannot be processed is
    reported on stdout (id plus the error) and the loop moves on.

    Parameters
    ----------
    num_taxis : int, optional
        Highest taxi id to process; ids start at 1. Defaults to 10357.
    input_dir : str, optional
        Directory holding the ``<id>.txt`` inputs. Defaults to ``"data"``.
    output_dir : str, optional
        Directory for the ``taxi_<id>.csv`` outputs; created if missing.
        Defaults to ``"csv"``.

    Returns
    -------
    None
    """
    # Builtin float/str instead of np.float_/np.string_, which were removed in NumPy 2.0.
    column_types = {'id': np.int_, 'date': str, 'latitude': float, 'longitude': float}

    os.makedirs(output_dir, exist_ok=True)

    for taxi in range(1, num_taxis + 1):
        src_path = os.path.join(input_dir, str(taxi) + ".txt")
        dst_path = os.path.join(output_dir, "taxi_" + str(taxi) + ".csv")

        try:
            _preprocess_taxi_file(src_path, dst_path, column_types)
        except Exception as exc:
            # Deliberately keep going, but say why this taxi failed. Catching
            # Exception (not a bare except) lets KeyboardInterrupt stop the run.
            print(taxi, repr(exc))

In [3]:
# Run the preprocessing pass defined above: reads the data/<id>.txt inputs and
# writes the csv/taxi_<id>.csv outputs. Ids of taxis that fail are printed.
preprocess_data()