# New York City Taxi Fare Prediction 

## ML Training and Prediction in Liten Cache
* The Liten database can serve data in batches and maintain versions using a multi-version control system
* It integrates data from different sources, so Data Science/ML engineers need not worry about where the data comes from
* Cognitive Services - Trained models performing predictive actions for target markets
* ML Operations 
    * Data serving in batches
    * Model, Features, Embeddings versions and registering
    * Deployment with single statements on different clouds

The XGBoost example here shows how to:
* train using XGBoost on pandas DataFrames
* predict using models in Liten

Import pandas, xgboost, arrow and liten

In [None]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm

import pyarrow as pa
from pyarrow import csv
import litendb as ten

## Data cleaning & Feature engineering Methods

In [None]:
# Scratch cell: build an empty DataFrame and call dropna() on it. The dropna()
# result is not assigned, so `df` itself stays the empty frame.
df = pd.DataFrame()
df.dropna()

In [None]:
def remove_outliers(df):
    """Drop incomplete rows and rows outside the accepted value ranges.

    Rows containing any missing value are removed first. A remaining row is
    kept only when every column listed below lies inside its inclusive
    ``[low, high]`` range.

    Args:
        df: DataFrame holding the fare, passenger-count and coordinate columns.

    Returns:
        The filtered DataFrame (original index labels are preserved).
    """
    complete = df.dropna()

    # Inclusive bounds per column.
    # to select coordinates: https://www.openstreetmap.org/export
    bounds = (
        ('fare_amount', 0, 500),
        ('passenger_count', 0, 6),
        ('pickup_longitude', -75, -73),
        ('dropoff_longitude', -75, -73),
        ('pickup_latitude', 40, 42),
        ('dropoff_latitude', 40, 42),
    )

    keep = None
    for column, low, high in bounds:
        in_range = complete[column].between(low, high)
        keep = in_range if keep is None else keep & in_range

    return complete[keep]

def manhattan(pickup, dropoff):
    """Return the L1 (Manhattan) distance between two (longitude, latitude) pairs.

    Each argument is a 2-item sequence. The items may be scalars or
    array-likes that support subtraction; the result is in the same units
    as the inputs.
    """
    (start_long, start_lat), (end_long, end_lat) = pickup, dropoff
    delta_long = np.abs(end_long - start_long)
    delta_lat = np.abs(end_lat - start_lat)
    return delta_long + delta_lat

def extract_distance_features(df):
    """Return a copy of ``df`` with coordinate-based distance features added.

    Added columns:
      * ``abs_diff_longitude`` / ``abs_diff_latitude``: absolute coordinate
        differences between dropoff and pickup.
      * ``distance``: Manhattan distance between pickup and dropoff.
      * ``pickup_distance_to_<name>`` / ``dropoff_distance_to_<name>``:
        Manhattan distance from each reference point in ``coordinates``
        to the pickup / dropoff location.

    Args:
        df: DataFrame with ``pickup_longitude``, ``pickup_latitude``,
            ``dropoff_longitude`` and ``dropoff_latitude`` columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Work on a copy so the caller's frame is left untouched. This also avoids
    # pandas' SettingWithCopyWarning when ``df`` is a filtered slice of
    # another frame (e.g. the result of ``remove_outliers``).
    df = df.copy()

    df['abs_diff_longitude'] = (df['dropoff_longitude'] - df['pickup_longitude']).abs()
    df['abs_diff_latitude'] = (df['dropoff_latitude'] - df['pickup_latitude']).abs()

    pickup = (df['pickup_longitude'], df['pickup_latitude'])
    dropoff = (df['dropoff_longitude'], df['dropoff_latitude'])
    df['distance'] = manhattan(pickup, dropoff)

    # Distances to nearby airports, and city center
    # https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
    # Each entry is (longitude, latitude).
    # NOTE(review): 'lgr' is presumably LaGuardia; the key is kept as-is
    # because it becomes part of the feature column names.
    coordinates = {
        'nyc': (-74.0063889, 40.7141667),
        'jfk': (-73.7822222222, 40.6441666667),
        'ewr': (-74.175, 40.69),
        'lgr': (-73.87, 40.77)
    }

    for name, coord in coordinates.items():
        df[f'pickup_distance_to_{name}'] = manhattan(coord, pickup)
        df[f'dropoff_distance_to_{name}'] = manhattan(coord, dropoff)

    return df

def extract_datetime_features(df):
    """Return a new frame with ``pickup_datetime`` replaced by calendar features.

    The first 16 characters of each ``pickup_datetime`` string
    (``YYYY-MM-DD HH:MM``) are parsed as UTC timestamps, and the columns
    ``year``, ``month``, ``day``, ``dayofweek`` and ``hour`` are derived
    from them.

    Args:
        df: DataFrame with a string ``pickup_datetime`` column.

    Returns:
        A new DataFrame without ``pickup_datetime`` and with the five
        calendar columns added. The input frame is not modified.
    """
    # Removing unnecessary information (everything after the minutes) from
    # the datetime string
    # https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
    pickup_datetime = df['pickup_datetime'].str.slice(0, 16)
    pickup_datetime = pd.to_datetime(pickup_datetime, utc=True, format='%Y-%m-%d %H:%M')

    # Drop first, then assign: both return new frames, so the caller's frame
    # never receives the derived columns as a side effect.
    return df.drop(columns='pickup_datetime').assign(
        year=pickup_datetime.dt.year,
        month=pickup_datetime.dt.month,
        day=pickup_datetime.dt.day,
        dayofweek=pickup_datetime.dt.dayofweek,
        hour=pickup_datetime.dt.hour,
    )

def extract_features(df):
    """Run the full feature pipeline: distance features, then datetime features."""
    with_distances = extract_distance_features(df)
    return extract_datetime_features(with_distances)

### Add arrow csv to liten cache
Create a new cache, add NY Taxi data to cache

In [None]:
# Create the Liten cache object; later cells register tables on it and read
# row slices back from it.
tc = ten.Cache()

In [None]:
import os
from huggingface_hub import snapshot_download

# Mirror the Hugging Face dataset repo "hkverma/<dataset>" into a local
# directory, skipping the download when that directory already exists.
dataset = 'nyyellowtaxi'
local_dir = f'./{dataset}/'
if not os.path.exists(local_dir):
    print(f'Downloading dataset into {local_dir} ...')
    snapshot_download(
        repo_id="hkverma/"+dataset,
        repo_type="dataset",
        local_dir=local_dir,
        local_dir_use_symlinks=False  # ensures real copies, not symlinks
    )
    print('Download Complete.')
else:
    print(f"Dataset already exists in {local_dir}, skipping download.")

# `nytaxi_data_dir` keeps its trailing slash because later cells build file
# paths from it by plain string concatenation.
nytaxi_data_dir = local_dir
nytaxi_train_file = os.path.join(nytaxi_data_dir, 'train.csv')

# Print the directory listing explicitly: a bare expression in the middle of
# a cell is evaluated but never displayed.
print(os.listdir(nytaxi_data_dir))

In [None]:
# Read the training CSV with explicit column types (no type inference for the
# listed columns), then register the Arrow table in the Liten cache.
convert_schema = csv.ConvertOptions(
    column_types=dict(
        fare_amount='float32',
        pickup_datetime='string',
        pickup_longitude='float32',
        pickup_latitude='float32',
        dropoff_longitude='float32',
        dropoff_latitude='float32',
        passenger_count='uint8',
    )
)
train_pa = csv.read_csv(nytaxi_train_file, convert_options=convert_schema)
table_name = tc.add_table("nyt_train", train_pa, tc.FactTable)

In [None]:
# Display the cache's info() output.
# NOTE(review): presumably a summary of the cached tables — confirm against the Liten docs.
tc.info()

Create the validation pandas DataFrame for XGBoost

In [None]:
# Number of leading rows of the training table held out for validation.
val_size = 100

# Column dtypes and CSV path. Neither is read again in this cell: the
# validation rows come from the Liten cache, not from the CSV file.
dtypes = {
    'fare_amount': 'float32',
    'pickup_datetime': 'str',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'passenger_count': 'uint8',
}
input_file = nytaxi_data_dir + 'train.csv'

# Slice the validation rows out of the cache, clean them and build features.
val_df = tc.slice(table_name="nyt_train", offset=0, length=val_size).to_pandas()
val_df = extract_features(remove_outliers(val_df)).drop(columns='key')

# Split into feature matrix and target, then wrap them for XGBoost.
X_val = val_df.drop(columns='fare_amount')
y_val = val_df[['fare_amount']]

dval = xgb.DMatrix(X_val, label=y_val, feature_names=list(X_val.columns))

## Batch training 
Read batched data using Liten data tensor slices

In [None]:
# Booster parameters.
# * 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
# * 'verbosity': 0 replaces the deprecated 'silent': True.
# * 'verbose_eval' is an argument of xgb.train(), not a booster parameter,
#   so it is passed to the train() call below instead of living here.
params = {'learning_rate': 0.05,
          'max_depth': 7,
          'objective': 'reg:squarederror',
          'eval_metric': 'rmse',
          'subsample': 0.8,
          'gamma': 1,
          'verbosity': 0}
num_rounds = 16
model = None
batch_size = 1000
val_size = 100

# The first `val_size` rows are reserved for validation, so training starts
# right after them.
remaining_rows = train_pa.num_rows - val_size
offset = val_size

while remaining_rows > 0:
    # Never request rows past the end of the table on the final batch.
    batch_len = min(batch_size, remaining_rows)
    batch_df = tc.slice(table_name="nyt_train", offset=offset, length=batch_len).to_pandas()
    remaining_rows -= batch_len
    offset += batch_len

    batch_df = remove_outliers(batch_df)
    if batch_df.empty:
        # Every row of this batch was filtered out; there is nothing to train on.
        continue
    batch_df = extract_features(batch_df)

    batch_df = batch_df.drop(columns='key')
    X_train = batch_df.drop(columns='fare_amount')
    y_train = batch_df[['fare_amount']]
    dtrain = xgb.DMatrix(X_train, y_train, feature_names=X_train.columns.tolist())

    # Passing the previous booster as `xgb_model` continues training from it,
    # so each batch adds up to `num_rounds` boosting rounds.
    model = xgb.train(params, dtrain, num_rounds, early_stopping_rounds=5,
                      evals=[(dtrain, 'train'), (dval, 'eval')],
                      verbose_eval=True,
                      xgb_model=model)

In [None]:
# Plot the feature importance of the booster trained in the previous cell.
xgb.plot_importance(model)

## Predictions
Read test data from Liten and add the predicted column. Prediction is to be added as a Liten method as well.

In [None]:
# Read the test CSV with explicit column types (it has no fare_amount column
# entry here), then register the Arrow table in the Liten cache.
nytaxi_test_file = f'{nytaxi_data_dir}test.csv'
convert_schema = csv.ConvertOptions(
    column_types=dict(
        key='string',
        pickup_datetime='string',
        pickup_longitude='float32',
        pickup_latitude='float32',
        dropoff_longitude='float32',
        dropoff_latitude='float32',
        passenger_count='uint8',
    )
)
test_pa = csv.read_csv(nytaxi_test_file, convert_options=convert_schema)
table_name = tc.add_table("nyt_test", test_pa, tc.FactTable)

In [None]:
# Take the first 20 rows of the cached test table and build the same features
# used for training. `test_pa` is rebound to this 20-row slice.
test_pa = tc.slice(table_name="nyt_test", offset=0, length=20)
test_df = extract_features(test_pa.to_pandas())
test_df.dtypes

In [None]:
# Drop the row identifier, wrap the features for XGBoost and predict fares.
X_test = test_df.drop(columns='key')
dtest = xgb.DMatrix(data=X_test, feature_names=list(X_test.columns))
y_pred = model.predict(dtest)

In [None]:
# Convert the predictions to an Arrow array and attach them to the sliced test
# table as a new 'predicted_fare_amount' column. `test_pa` is rebound to the
# table returned by append_column.
pred_arr = pa.array(y_pred)
test_pa = test_pa.append_column('predicted_fare_amount', pred_arr)

In [None]:
# Display the test rows together with the predicted fares as a pandas DataFrame.
test_pa.to_pandas()