# Install

# pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116

# pip install rtdl

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.model_selection
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.dlpack import to_dlpack, from_dlpack
# import rtdl
import urllib
import PIL
import requests
import datetime
import holidays
import os
import gc
import cudf as cf
import cupy as cp
from tqdm import tqdm



In [2]:
# Run tensor work on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# device = torch.device("cpu")

# Data Preprocess

## Remove data outside of boundary

In [3]:
def select_within_boundary(df, boundary) -> "cf.Series":
    """Build a row mask for trips that start and end inside ``boundary``.

    Args:
        df: Frame with ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.
        boundary: Mapping with ``longitude_min``, ``longitude_max``,
            ``latitude_min`` and ``latitude_max`` (bounds are inclusive).

    Returns:
        An element-wise boolean mask (one value per row of ``df``), not a
        single ``bool``: True where both endpoints lie inside the box.
    """
    within = None
    for endpoint in ("pickup", "dropoff"):
        for axis in ("longitude", "latitude"):
            values = df[f"{endpoint}_{axis}"]
            in_range = (values >= boundary[f"{axis}_min"]) & (
                values <= boundary[f"{axis}_max"]
            )
            within = in_range if within is None else within & in_range

    return within


def select_in_boundary(df: cf.DataFrame) -> cf.DataFrame:
    """Keep only the trips whose pickup and dropoff fall inside a fixed box.

    The box spans longitude -74.5 to -72.8 and latitude 40.5 to 41.8; the
    row mask itself is computed by ``select_within_boundary``.
    """
    bounding_box = dict(
        longitude_min=-74.5,
        longitude_max=-72.8,
        latitude_min=40.5,
        latitude_max=41.8,
    )
    inside = select_within_boundary(df, bounding_box)

    return df[inside]

## Drop data on water

In [4]:
# Build the boolean land mask that drop_on_water() indexes (True = land).
# `import urllib` / `import PIL` alone do not load these submodules, so import
# them explicitly instead of relying on another package having done so.
import urllib.request

import PIL.Image

# Decode the image while the HTTP response is open, then release the socket.
with urllib.request.urlopen("https://imgur.com/XGHkdoK.png") as mask_url:
    mask_pixels = np.array(PIL.Image.open(mask_url))[:, :, 0]

# PIL returns integer pixels (0-255 for an 8-bit image). Comparing those raw
# values against 0.92 is the same as "non-zero", so rescale to [0, 1] first.
# NOTE(review): assumes 0.92 was meant as a fraction of full brightness — confirm.
if np.issubdtype(mask_pixels.dtype, np.integer):
    mask_pixels = mask_pixels / np.iinfo(mask_pixels.dtype).max
mask = mask_pixels > 0.92

# Append one False column on the right and one False row at the bottom, so the
# largest pixel index used by drop_on_water() (shape - 1) is an added False cell.
mask = np.pad(mask, ((0, 1), (0, 1)), constant_values=False)
mask = cp.asarray(mask)


def drop_on_water(df: cf.DataFrame) -> cf.DataFrame:
    """Keep only trips whose pickup and dropoff both fall on a True mask cell.

    Coordinates are mapped linearly onto the pixel grid of the module-level
    ``mask`` (indexed as ``mask[y, x]``), using a fixed bounding box of
    longitude -74.5 to -72.8 and latitude 40.5 to 41.8. Rows are expected to
    lie inside that box already; nothing here clips out-of-range points.

    Args:
        df: Frame with ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns:
        The rows of ``df`` for which both endpoints hit a True mask cell.
    """

    def lonlat_to_xy(longitude, latitude, x_range, y_range, boundary):
        longitude_range = boundary["longitude_max"] - boundary["longitude_min"]
        latitude_range = boundary["latitude_max"] - boundary["latitude_min"]

        x = x_range * (longitude - boundary["longitude_min"]) / longitude_range
        # Row 0 is the top of the image, so larger latitudes map to smaller y.
        y = (
            y_range
            - y_range * (latitude - boundary["latitude_min"]) / latitude_range
        )

        # int32, not uint8: uint8 wraps around for any index above 255 and
        # would silently read the wrong pixel on a mask larger than 256 px.
        return (x.astype("int32"), y.astype("int32"))

    boundary = {
        "longitude_min": -74.5,
        "longitude_max": -72.8,
        "latitude_min": 40.5,
        "latitude_max": 41.8,
    }
    x_range = mask.shape[1] - 1
    y_range = mask.shape[0] - 1

    pickup_x, pickup_y = lonlat_to_xy(
        df.loc[:, "pickup_longitude"],
        df.loc[:, "pickup_latitude"],
        x_range,
        y_range,
        boundary,
    )

    dropoff_x, dropoff_y = lonlat_to_xy(
        df.loc[:, "dropoff_longitude"],
        df.loc[:, "dropoff_latitude"],
        x_range,
        y_range,
        boundary,
    )

    on_land = mask[pickup_y, pickup_x] & mask[dropoff_y, dropoff_x]
    return df[on_land]

In [5]:
def drop_same_pick_drop(df: cf.DataFrame):
    """Remove trips whose pickup and dropoff coordinates are exactly equal.

    A row is dropped only when both the longitude and the latitude match.
    """
    same_longitude = df["pickup_longitude"] == df["dropoff_longitude"]
    same_latitude = df["pickup_latitude"] == df["dropoff_latitude"]
    no_movement = same_longitude & same_latitude

    return df[~no_movement]


## Feature engineering

In [6]:
def get_lat_lon(df: cf.DataFrame, unit="deg"):
    """Return the pickup and dropoff coordinates of ``df`` as CuPy arrays.

    Args:
        df: Frame with ``pickup_latitude``, ``pickup_longitude``,
            ``dropoff_latitude`` and ``dropoff_longitude`` columns.
        unit: ``"deg"`` returns the column values unchanged, ``"rad"`` passes
            them through ``cp.radians``, ``"mile"`` scales latitude by 69.172
            and longitude by 50 (the miles-per-degree factors used here).

    Returns:
        Tuple ``(lat1, lon1, lat2, lon2)``: pickup latitude, pickup
        longitude, dropoff latitude, dropoff longitude.

    Raises:
        ValueError: If ``unit`` is not one of the supported values. Unknown
            units used to fall through silently as degrees.
    """
    if unit not in ("deg", "rad", "mile"):
        raise ValueError(
            f"unit must be 'deg', 'rad' or 'mile', got {unit!r}"
        )

    columns = (
        "pickup_latitude",
        "pickup_longitude",
        "dropoff_latitude",
        "dropoff_longitude",
    )
    # Copy first so the in-place scaling below can never touch df's own data.
    lat1, lon1, lat2, lon2 = (df[name].copy().to_cupy() for name in columns)

    if unit == "rad":
        lat1, lon1, lat2, lon2 = map(cp.radians, [lat1, lon1, lat2, lon2])
    elif unit == "mile":
        # 1 degree of latitude = 69.172 miles, 1 degree of longitude = 50 miles
        lat1 *= 69.172
        lon1 *= 50
        lat2 *= 69.172
        lon2 *= 50

    return lat1, lon1, lat2, lon2


def cal_rotated_coordinate(lat1, lon1, lat2, lon2, angle_deg=29.0) -> tuple:
    """Rotate the pickup and dropoff points by a fixed angle.

    Each ``(lat, lon)`` pair is multiplied by the matrix
    ``[[cos t, sin t], [-sin t, cos t]]`` with ``t = -radians(angle_deg)``.
    The work is done with torch on the module-level ``device``.

    Args:
        lat1, lon1: 1-D arrays with the first point of every row.
        lat2, lon2: 1-D arrays with the second point of every row.
        angle_deg: Rotation angle in degrees; defaults to the 29 degrees
            that was previously hard-coded.

    Returns:
        Tuple ``(lat1, lon1, lat2, lon2)`` of rotated 1-D arrays, converted
        back with ``cp.asarray``.
    """
    lat1, lon1, lat2, lon2 = map(
        lambda x: torch.as_tensor(x).to(device), [lat1, lon1, lat2, lon2]
    )
    p1 = torch.column_stack([lat1, lon1])
    p2 = torch.column_stack([lat2, lon2])

    # Build the matrix in the inputs' dtype: a fixed float32 matrix makes
    # einsum raise a dtype mismatch as soon as the coordinates are float64.
    dtype = p1.dtype if p1.is_floating_point() else torch.get_default_dtype()
    p1 = p1.to(dtype)
    p2 = p2.to(dtype)

    theta = -np.radians(angle_deg)
    cos_t = float(np.cos(theta))
    sin_t = float(np.sin(theta))
    rot = torch.tensor(
        [[cos_t, sin_t], [-sin_t, cos_t]], dtype=dtype, device=device
    )

    # Rotate row by row, then split the two result columns apart.
    lat1, lon1 = torch.hsplit(torch.einsum("ij, mj -> mi", rot, p1), 2)
    lat2, lon2 = torch.hsplit(torch.einsum("ij, mj -> mi", rot, p2), 2)
    lat1, lon1, lat2, lon2 = map(
        lambda x: cp.asarray(x.squeeze(1)),
        [lat1, lon1, lat2, lon2],
    )

    return lat1, lon1, lat2, lon2


def get_rotated_coordinate(df: pd.DataFrame):
    """Add the four rotated coordinate columns to ``df`` and return it.

    The coordinates are read in degrees, rotated by
    ``cal_rotated_coordinate`` and written back as new columns. ``df`` is
    modified in place.
    """
    rotated = cal_rotated_coordinate(*get_lat_lon(df, "deg"))
    column_names = (
        "rotated_pickup_latitude",
        "rotated_pickup_longitude",
        "rotated_dropoff_latitude",
        "rotated_dropoff_longitude",
    )

    for name, values in zip(column_names, rotated):
        df.loc[:, name] = values

    return df

### Add distance (in miles)

The Manhattan distance is computed after rotating the coordinates by 29 degrees, so that the axes line up with the real street grid.

In [7]:
def get_euclidean(df: cf.DataFrame):
    """Return the straight-line pickup-to-dropoff distance for every row.

    Coordinates are taken from ``get_lat_lon`` in its ``"mile"`` unit, so
    the result is in miles.
    """
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = get_lat_lon(df, "mile")
    offsets = cp.column_stack(
        [pickup_lat - dropoff_lat, pickup_lon - dropoff_lon]
    )
    return cp.linalg.norm(offsets, axis=1)


def cal_haversine_distance(lat1, lon1, lat2, lon2):
    """Return the haversine central angle between two sets of points.

    All four inputs are used directly inside sin/cos, so they must already
    be in radians. The result is an angle in radians; multiply it by a
    sphere radius to get a distance.
    """
    half_dlat = (lat1 - lat2) / 2.0
    half_dlon = (lon1 - lon2) / 2.0

    lat_term = cp.sin(half_dlat) ** 2
    lon_term = cp.cos(lat1) * cp.cos(lat2) * cp.sin(half_dlon) ** 2
    haversine = lat_term + lon_term

    return 2 * cp.arcsin(cp.sqrt(haversine))


def get_haversine_distance(df: cf.DataFrame):
    """Return the haversine pickup-to-dropoff distance in miles per row."""
    # 3959 scales the central angle (radians) to miles.
    radius_miles = 3959
    central_angle = cal_haversine_distance(*get_lat_lon(df, "rad"))
    return central_angle * radius_miles


def get_correct_manhattan(df: cf.DataFrame):
    """Return the Manhattan distance measured along the rotated axes.

    Coordinates are taken in the ``"mile"`` unit, rotated by
    ``cal_rotated_coordinate`` and the absolute differences along both
    rotated axes are summed.
    """
    rotated = cal_rotated_coordinate(*get_lat_lon(df, "mile"))
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = rotated

    lat_leg = cp.abs(pickup_lat - dropoff_lat)
    lon_leg = cp.abs(pickup_lon - dropoff_lon)

    return (lat_leg + lon_leg).ravel()


### Add temperature and precipitation 

In [8]:
def get_historical_temp_precipitation(
    latitude=40.71,
    longitude=-74.01,
    start_date="2009-01-01",
    end_date="2015-12-31",
    timeout=60,
):
    """Download hourly apparent temperature and precipitation from Open-Meteo.

    Args:
        latitude: Latitude of the location to query.
        longitude: Longitude of the location to query.
        start_date: First day of the requested range (``YYYY-MM-DD``).
        end_date: Last day of the requested range (``YYYY-MM-DD``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``{"apparent_temperature": {time: value}, "precipitation": {...}}``,
        i.e. one ``time -> value`` mapping per weather column.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = (
        "https://archive-api.open-meteo.com/v1/archive"
        f"?latitude={latitude}&longitude={longitude}"
        f"&start_date={start_date}&end_date={end_date}"
        "&hourly=apparent_temperature,precipitation"
    )
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error rather than later with a confusing KeyError.
    response.raise_for_status()
    data = response.json()

    df_tmp = cf.DataFrame(data["hourly"])
    df_tmp["time"] = cf.to_datetime(df_tmp["time"])

    return df_tmp.set_index("time").to_dict()


def convert_time(x: pd.Series) -> pd.Series:
    """Rebuild timestamps from their year, month, day and hour parts only.

    Minutes and anything finer are not carried over, so every value ends up
    on the start of its hour.
    """
    accessor = x.dt
    parts = [accessor.year.rename("year")]
    for field in ("month", "day", "hour"):
        parts.append(getattr(accessor, field).rename(field).astype("uint8"))

    return cf.to_datetime(cf.concat(parts, axis=1))


def add_temp_precipitation(df: pd.DataFrame):
    """Attach the hourly weather readings that match each pickup time.

    Pickup times are reduced to the hour by ``convert_time`` and looked up
    in the module-level ``temp_dict``. The two new float32 columns are
    written into ``df`` in place, and ``df`` is returned.
    """
    pickup_hour = convert_time(df["pickup_datetime"])

    for column in ("apparent_temperature", "precipitation"):
        readings = pickup_hour.map(temp_dict[column])
        df.loc[:, column] = readings.astype("float32")

    return df


# Fetch the weather lookup once at cell execution (performs an HTTP request);
# add_temp_precipitation() reads it as a module-level global.
temp_dict = get_historical_temp_precipitation()


## Split time and add holiday

In [9]:
# add time information
def add_time_and_holiday_info(df: cf.DataFrame) -> cf.DataFrame:
    """Add ``year``, ``weekday``, ``hour`` and ``is_holiday`` columns.

    ``is_holiday`` is 1 when the calendar date of ``pickup_datetime`` is a
    US holiday according to the ``holidays`` package, else 0. ``df`` is
    modified in place and returned.

    The previous check passed hour-level timestamps to
    ``isin(holidays.US())``. That calendar is keyed by dates and only filled
    for years that are requested, so the flag was effectively always 0.
    Dates are now compared as ``YYYYMMDD`` integers against a calendar built
    for the years present in the data.
    """
    pickup = df["pickup_datetime"]
    years = pickup.dt.year

    # Add time information
    df.loc[:, "year"] = years
    df.loc[:, "weekday"] = pickup.dt.weekday.astype("uint8")
    df.loc[:, "hour"] = pickup.dt.hour.astype("uint8")

    # Add holiday information
    date_key = (
        years.astype("int32") * 10000
        + pickup.dt.month.astype("int32") * 100
        + pickup.dt.day.astype("int32")
    )
    if len(df):
        # Request every year in the data explicitly; an unparameterised
        # calendar starts out empty.
        span = range(int(years.min()), int(years.max()) + 1)
        holiday_keys = [
            day.year * 10000 + day.month * 100 + day.day
            for day in holidays.US(years=span)
        ]
    else:
        holiday_keys = []

    df.loc[:, "is_holiday"] = date_key.isin(holiday_keys).astype("uint8")

    return df


### Add pickup/dropoff airport features

In [10]:
def check_nearby_airports(df, range=1):
    """Flag pickups and dropoffs that lie close to a New York airport.

    Adds two uint8 columns, ``airport_nearby_pickup`` and
    ``airport_nearby_dropoff``, holding 0 (no airport within ``range``),
    1 (JFK), 2 (EWR) or 3 (LaGuardia). If a point is within ``range`` of
    more than one airport, the higher code wins. ``df`` is modified in place
    and returned.

    The coordinates are converted to radians before calling
    ``cal_haversine_distance``; passing degrees (as before) made the
    computed distances far too large, so almost no trip was ever flagged.

    Args:
        df: Frame with the pickup/dropoff latitude and longitude columns,
            in degrees.
        range: Search radius around each airport, in miles.
    """
    import math

    # 3959 scales the haversine central angle (radians) to miles.
    radius_miles = 3959

    # code -> (longitude, latitude) in degrees
    airports = {
        # JFK airport coordinates, see https://www.travelmath.com/airport/JFK
        1: (-73.7822222222, 40.6441666667),
        # Newark Liberty International Airport, see https://www.travelmath.com/airport/EWR
        2: (-74.175, 40.69),
        # LaGuardia Airport, see https://www.travelmath.com/airport/LGA
        3: (-73.87, 40.77),
    }

    for trip_end in ("pickup", "dropoff"):
        latitude = cp.radians(df[f"{trip_end}_latitude"].to_cupy())
        longitude = cp.radians(df[f"{trip_end}_longitude"].to_cupy())

        codes = cp.zeros(len(df), dtype="uint8")
        for code, (airport_lon, airport_lat) in airports.items():
            miles = (
                cal_haversine_distance(
                    latitude,
                    longitude,
                    math.radians(airport_lat),
                    math.radians(airport_lon),
                )
                * radius_miles
            )
            codes[miles < range] = code

        df[f"airport_nearby_{trip_end}"] = codes

    return df

# All Preprocess

In [11]:
def date_format(df: cf.DataFrame) -> cf.DataFrame:
    """Return a copy of ``df`` with ``pickup_datetime`` parsed as UTC.

    Only the first 16 characters of each string (``YYYY-MM-DD HH:MM``) are
    parsed, so seconds and any trailing text are ignored. The input frame is
    left untouched.
    """
    parsed = df.copy()

    minute_text = parsed["pickup_datetime"].str.slice(0, 16)
    parsed["pickup_datetime"] = cf.to_datetime(
        minute_text, utc=True, format="%Y-%m-%d %H:%M"
    )

    return parsed


def clean_data(df: cf.DataFrame) -> cf.DataFrame:
    """Return a filtered copy of ``df`` with a fresh 0..n-1 index.

    Rows are removed, in this order, when the fare is not strictly positive,
    when any value is missing, when an endpoint is outside the bounding box,
    when an endpoint is on water, or when pickup and dropoff coincide.
    """
    cleaned = df.copy()

    # Keep strictly positive fares, then discard rows with missing values.
    cleaned = cleaned[cleaned.fare_amount > 0]
    cleaned = cleaned.dropna()

    # Location filters: bounding box, land mask, identical endpoints.
    for row_filter in (select_in_boundary, drop_on_water, drop_same_pick_drop):
        cleaned = row_filter(cleaned)

    return cleaned.reset_index(drop=True)


def engineering(df: cf.DataFrame):
    """Return a copy of ``df`` with every engineered feature added.

    Steps, in order: rotated coordinates, the three distance columns,
    weather, time/holiday fields and airport flags. The step number (1-7) is
    printed after each step as a progress marker.
    """
    featured = df.copy()

    # Step 1: rotated coordinates
    featured = get_rotated_coordinate(featured)
    print(1)

    # Steps 2-4: distance columns
    distance_columns = (
        ("euclidean", get_euclidean),
        ("haversine", get_haversine_distance),
        ("correct_manhattan", get_correct_manhattan),
    )
    for step, (column, compute) in enumerate(distance_columns, start=2):
        featured.loc[:, column] = compute(featured)
        print(step)

    # Steps 5-7: weather, time/holiday fields, airport flags
    enrichers = (
        add_temp_precipitation,
        add_time_and_holiday_info,
        check_nearby_airports,
    )
    for step, enrich in enumerate(enrichers, start=5):
        featured = enrich(featured)
        print(step)

    return featured


In [12]:
# train_type = {
#     "fare_amount": "float32",
#     "pickup_datetime": "str",
#     "pickup_longitude": "float32",
#     "pickup_latitude": "float32",
#     "dropoff_longitude": "float32",
#     "dropoff_latitude": "float32",
#     "passenger_count": "uint8",
# }

# use_cols = list(train_type.keys())


In [13]:
# dir = "/kaggle/input/new-york-city-taxi-fare-prediction/"
# df_list = []
# with pd.read_csv(
#     os.path.join(dir, "train.csv"),
#     dtype=train_type,
#     usecols=use_cols,
#     chunksize=1_000_000,
# ) as reader:
#     for df_chunk in tqdm(reader):
#         df_chunk = date_format(df_chunk)
#         df_chunk = clean_data(df_chunk)
#         df_chunk = engineering(df_chunk)
#         df_list.append(df_chunk)
#         gc.collect()

In [14]:
# Load the training data from the feather file into a cuDF frame.
df_train = cf.read_feather("/kaggle/input/taxi-prediction-compress-train/compress_train.feather")



In [15]:
# Apply the row filters from clean_data() (fare, missing values, location).
df_train = clean_data(df_train)

In [16]:
# Add the engineered feature columns; the step numbers printed by
# engineering() show up below as progress output.
df_train = engineering(df_train)
df_train.head()

1
2
3
4
5
6
7


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,rotated_pickup_latitude,rotated_pickup_longitude,rotated_dropoff_latitude,...,haversine,correct_manhattan,apparent_temperature,precipitation,year,weekday,hour,is_holiday,airport_nearby_pickup,airport_nearby_dropoff
0,4.5,2009-06-15 17:26:00,-73.844315,40.721317,-73.841614,40.712276,1,71.4161,-44.843605,71.406891,...,0.640716,0.797852,18.9,0.0,2009,0,17,0,0,0
1,16.9,2010-01-05 16:52:00,-74.016045,40.711304,-73.979271,40.782005,1,71.490601,-44.998665,71.534607,...,5.250763,7.364746,-8.9,0.0,2010,1,16,0,0,0
2,5.7,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2,71.51815,-44.945301,71.512909,...,0.863538,1.173096,26.1,0.0,2011,3,0,0,0,0
3,7.7,2012-04-21 04:30:00,-73.987129,40.733143,-73.99157,40.758091,1,71.495682,-44.962784,71.519653,...,1.739627,2.259766,13.8,0.0,2012,5,4,0,0,0
4,5.3,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1,71.516953,-44.92923,71.525177,...,1.242127,1.704468,-2.5,0.0,2010,1,7,0,0,0


In [17]:
# Save the cleaned, feature-enriched frame to the Kaggle working directory.
df_train.to_feather("/kaggle/working/compress_train_with_new.feather")



# FT Transformer

In [18]:
# df_train.head()

In [19]:
# df_test.head()

## Premodel process function

### Numerical, category, target split

In [20]:
# def num_cat_split(df, type="train"):
#     numerical_idx = [
#         "pickup_longitude",
# #         "pickup_latitude",
#         "dropoff_longitude",
# #         "dropoff_latitude",
# #         "passenger_count",
#         "distance",
# #         "precipitation",
# #         "apparent_temperature",
#         "abs_loni_diff",
#         "abs_lati_diff",
#     ]

#     category_idx = [
#         "year",
#         "weekday",
#         "hour",
#         "is_holiday",
#         "airport_nearby_pickup",
#         "airport_nearby_dropoff",
#     ]

#     df_num = df.loc[:, numerical_idx].copy()
#     df_cat = df.loc[:, category_idx].copy()
#     if type == "train":
#         df_y = df.loc[:, "fare_amount"].copy()

#     return (df_num, df_cat, df_y) if type == "train" else (df_num, df_cat)

### Category data encoding

In [21]:
# def cat_encoding(df_cat: cf.DataFrame):
#     for index, val in df_cat.items():
#         df_cat.loc[:, index] = LabelEncoder().fit_transform(val.to_numpy())

#     return df_cat.copy()

### Train, validate split

In [22]:
# def train_valid_spilt(df_num, df_cat, df_y):
#     x_num = df_num.to_numpy().astype("float32")
#     x_cat = df_cat.to_numpy().astype("int64")
#     y = df_y.to_numpy().astype("float32")

#     X_num = {}
#     X_cat = {}
#     Y = {}

#     (
#         X_num["train"],
#         X_num["val"],
#         X_cat["train"],
#         X_cat["val"],
#         Y["train"],
#         Y["val"],
#     ) = sklearn.model_selection.train_test_split(
#         x_num, x_cat, y, train_size=0.7, shuffle=True
#     )
    
#     X_num = {k: cp.asarray(v) for k, v in X_num.items()}
#     X_cat = {k: cp.asarray(v) for k, v in X_cat.items()}
#     Y = {k: cp.asarray(v) for k, v in Y.items()}

#     return X_num, X_cat, Y

### Target standardize

In [23]:
# def target_standardize(Y: dict):
#     y_mean = Y["train"].mean().item()
#     y_std = Y["train"].std().item()
#     return y_mean, y_std, {k: ((v - y_mean) / y_std) for k, v in Y.items()}

## Premodel process

In [24]:
# def premodel_process(df_train: pd.DataFrame, df_test: pd.DataFrame):
#     df_train = df_train.copy()
#     df_test = df_test.copy()

#     # Numerical, category data split
#     df_num_train, df_cat_train, df_y_train = num_cat_split(df_train)
#     df_num_test, df_cat_test = num_cat_split(df_test, type="test")

#     # Category data encoding
#     df_cat_train = cat_encoding(df_cat_train)
#     df_cat_test = cat_encoding(df_cat_test)

#     # Train, validate split and turn to numpy array
#     X_num, X_cat, Y = train_valid_spilt(df_num_train, df_cat_train, df_y_train)

#     X_num["test"] = df_num_test.values.astype("float32")
#     X_cat["test"] = df_cat_test.values.astype("int64")

#     # Numerical data standardize
#     standardizer = StandardScaler().fit(X_num["train"].get())
#     X_num = {k: standardizer.transform(v.get()) for k, v in X_num.items()}

#     # Target standardize
#     y_mean, y_std, Y = target_standardize(Y)

#     cat_cardinalities = rtdl.data.get_category_sizes(X_cat["train"])

#     # Ndarray to tensor
#     X_num = {k: torch.as_tensor(v, device=device) for k, v in X_num.items()}
#     X_cat = {k: torch.as_tensor(v, device=device) for k, v in X_cat.items()}
#     Y = {k: torch.as_tensor(v, device=device) for k, v in Y.items()}

#     return X_num, X_cat, Y, y_mean, y_std, cat_cardinalities


In [25]:
# X_num, X_cat, Y, y_mean, y_std, cat_cardinalities = premodel_process(
#     df_train, df_test
# )

In [26]:
# del df_train, df_test
# gc.collect()

### Pack data in dataset

In [27]:
# class TaxiDataset(Dataset):
#     def __init__(self, x_num, x_cat, y) -> None:
#         self.x_num = x_num
#         self.x_cat = x_cat
#         self.y = y

#     def __len__(self):
#         return len(self.y)

#     def __getitem__(self, index):
#         return self.x_num[index], self.x_cat[index], self.y[index]


### Model and Evaluator

In [28]:
# model = rtdl.FTTransformer.make_default(
#     n_num_features=X_num["train"].shape[1],
#     cat_cardinalities=cat_cardinalities,
#     last_layer_query_idx=[
#         -1
#     ],  # it makes the model faster and does NOT affect its output
#     d_out=1,
# )


# model.to(device)

# optimizer = model.make_default_optimizer()
# loss_fn = F.smooth_l1_loss


# class EarlyStopper:
#     def __init__(self, patience=100, min_delta=10e-5) -> None:
#         self.patience = patience
#         self.min_delta = min_delta
#         self.counter = 0
#         self.min_val_score = np.inf

#     def early_stop(self, val_score) -> bool:
#         if val_score < self.min_val_score:
#             self.min_val_score = val_score
#             self.counter = 0

#         elif val_score >= (self.min_val_score + self.min_delta):
#             self.counter += 1
#             if self.counter > self.patience:
#                 return True

#         return False


In [29]:
# @torch.no_grad()
# def evaluate(dataloader: DataLoader):
#     model.eval()

#     target = dataloader.dataset.y
#     pred = []
#     for iteration, data in enumerate(dataloader):
#         x_num_batch = data[0]
#         x_cat_batch = data[1]

#         pred.append(model(x_num_batch, x_cat_batch).squeeze(1))

#     pred = torch.cat(pred)

#     return sklearn.metrics.mean_squared_error(pred.cpu().numpy(), target.cpu().numpy()) ** 0.5 + y_std


## Set dataloader

In [30]:
# batch_size = 128

# val_dataset = TaxiDataset(X_num["val"], X_cat["val"], Y["val"])
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# train_dataset = TaxiDataset(X_num["train"], X_cat["train"], Y["train"])
# train_dataloader = DataLoader(
#     train_dataset, batch_size=batch_size, shuffle=True
# )

# early_stopper = EarlyStopper(min_delta=10e-6)

In [31]:
# # Pre train score
# evaluate(val_dataloader)


In [32]:
# n_epoch = 10
# report_frequency = len(train_dataloader) // 5

# for epoch in range(1, n_epoch + 1):
#     for iteration, data in enumerate(train_dataloader):
#         model.train()
#         optimizer.zero_grad()

#         x_num_batch = data[0]
#         x_cat_batch = data[1]
#         y_batch = data[2]

#         loss = loss_fn(model(x_num_batch, x_cat_batch).squeeze(1), y_batch)
#         loss.backward()
#         optimizer.step()

#         if iteration % report_frequency == 0:
#             print(
#                 f"(epoch) {epoch} (batch) {iteration} (loss) {loss.item():.4f}"
#             )

#     val_score = evaluate(val_dataloader)
#     print(f"Epoch {epoch:03d} | Validation score: {val_score:.4f}")
#     if early_stopper.early_stop(val_score):
#         print(f"Early stop!")
#         break

### Get Prediction

In [33]:
# @torch.no_grad()
# def prediction():
#     model.eval()
#     y_pred = model(X_num["test"], X_cat["test"])
#     return (y_pred * y_std + y_mean).cpu().detach().numpy()

In [34]:
# cf.concat(
#     [df_keys, cf.Series(prediction().ravel(), name="fare_amount")], axis=1
# ).to_csv("/kaggle/working/output.csv", index=False)
