In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import ast
from datetime import datetime

## Preprocessing

In [19]:
# These are all of the files you are given
data = pd.read_csv("train.csv")
csv_df = pd.read_csv('metaData_taxistandsID_name_GPSlocation.csv', sep=',')



In [20]:
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

def parse_time(x):
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.date.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

In [22]:
def apply_preprocessing(data):
    data = data[data['MISSING_DATA'] == False].copy()
    data.drop(['TRIP_ID','MISSING_DATA','CALL_TYPE','ORIGIN_CALL'], axis=1,inplace=True)
    data['TAXI_ID']=data['TAXI_ID'].astype(str).str[-3:]
    
    data = pd.merge(data, csv_df, left_on='ORIGIN_STAND', right_on='ID', how='left')
    data.drop(['Descricao','ID','ORIGIN_STAND'], axis=1, inplace=True)
    
    data["LEN"] = data["POLYLINE"].apply(polyline_to_trip_duration)
    
    data = data[(data['LEN'] <= 900) & (data['LEN'] >= 30)]
    
    first_longitude = data['POLYLINE'].apply(lambda x: float((x.split(',')[0])[2:]))
    data['Longitude'] = data['Longitude'].fillna(pd.Series(first_longitude))
    
    first_latitude = data['POLYLINE'].apply(lambda x: float((x.split(',')[1])[:-1]))
    data['Latitude'] = data['Latitude'].fillna(pd.Series(first_latitude))
    
    data[["YR", "MON", "DAY", "HR", "WK"]] = data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
    
    data = data.drop('TIMESTAMP', axis=1)
    data = data.drop('POLYLINE', axis=1)
    data = data.drop('YR', axis=1)
    data = data.drop('MON', axis=1)
    data = data.drop('DAY', axis=1)
    
    bins = [-1, 4, 8, 12, 16, 20, 24]
    labels = [0, 1, 2, 3, 4, 5]

    data['HOUR_GROUP'] = pd.cut(data['HR'], bins=bins, labels=labels, include_lowest=True)
    data.drop(['HR'], axis=1, inplace=True)
    
    data['TAXI_ID'] = data['TAXI_ID'].astype(int)
    data['DAY_TYPE'] = data['DAY_TYPE'].astype('category')
    
    return data
final_data = apply_preprocessing(data)

In [23]:
final_data.dtypes

TAXI_ID          int64
DAY_TYPE      category
Latitude        object
Longitude      float64
LEN              int64
WK               int64
HOUR_GROUP    category
dtype: object

In [3]:
data = data[data['MISSING_DATA'] == False]
data = data.drop('TRIP_ID', axis=1)
data = data.drop('MISSING_DATA', axis=1)
data = data.drop('CALL_TYPE', axis=1)
data = data.drop('ORIGIN_CALL', axis=1)

In [4]:
data['TAXI_ID']=data['TAXI_ID'].astype(str).str[-3:]

In [5]:
data.head()

Unnamed: 0,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE
0,,589,1372636858,A,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,7.0,596,1372637303,A,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,,320,1372636951,A,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,,520,1372636854,A,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,,337,1372637091,A,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [6]:

data = pd.merge(data, csv_df, left_on='ORIGIN_STAND', right_on='ID', how='left')
# Drop unnecessary columns
data.drop(['Descricao'], axis=1, inplace=True)
data.drop(['ID'], axis=1, inplace=True)
data.drop(['ORIGIN_STAND'], axis=1, inplace=True)

# Display the merged DataFrame
data.head()

Unnamed: 0,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,Latitude,Longitude
0,589,1372636858,A,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",,
1,596,1372637303,A,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",41.1599801853,-8.641984
2,320,1372636951,A,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",,
3,520,1372636854,A,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",,
4,337,1372637091,A,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",,


In [7]:
data["LEN"] = data["POLYLINE"].apply(polyline_to_trip_duration)

In [8]:
threshold = 900
minimum_len = 30

data = data[(data['LEN'] <= threshold) & (data['LEN'] >= minimum_len)]

In [9]:
first_longitude = data['POLYLINE'].apply(lambda x: (x.split(',')[0])[2:])
data['Longitude'] = data['Longitude'].fillna(pd.Series(first_longitude))

In [10]:
first_latitude = data['POLYLINE'].apply(lambda x: (x.split(',')[1])[:-1])
data['Latitude'] = data['Latitude'].fillna(pd.Series(first_latitude))

In [11]:
data[["YR", "MON", "DAY", "HR", "WK"]] = data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [None]:
data = data.drop('TIMESTAMP', axis=1)
data = data.drop('POLYLINE', axis=1)
data = data.drop('YR', axis=1)
data = data.drop('MON', axis=1)
data = data.drop('DAY', axis=1)

In [None]:
bins = [-1, 4, 8, 12, 16, 20, 24]
labels = [0, 1, 2, 3, 4, 5]

data['HOUR_GROUP'] = pd.cut(data['HR'], bins=bins, labels=labels, include_lowest=True)
data.drop(['HR'], axis=1, inplace=True)
data