### Model training

The following approaches are considered:
- Multi-class classification problem with clustered data
- Multi-output regression problem with two outputs (longitude/latitude)

In terms of the data we have the following approaches:
- fixed sequence length - take N points from beginning of POLYLINE and N points from end of polyline 


Algorithms:
- Long short-term memory (LSTM) NN (multi-class classification and regression)
    - able to handle variable sequence length, therefore the total trip POLYLINE can be used
- Random forest (regression and classification)
    - can handle outliers well as dataset still contains outliers
    - runs efficiently on large data set
    
Metrics:
- Classification of clusters: AUC + Avg distance of last point to cluster center
- Regression: MAPE + Avg distance of last point to cluster center


In [1]:
import os,sys
import pandas as pd 
import numpy as np 
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Environment switch: True selects the SageMaker path prefix and the S3 parquet inputs,
# False selects the local project checkout and the local CSV inputs.
is_sagemaker_notebook = False

In [3]:
# Project root depends on where the notebook is running.
prefix = (
    "/home/ec2-user/SageMaker"
    if is_sagemaker_notebook
    else "/Users/Q619505/PycharmProjects/personal-projects/taxi-destination-prediction"
)

# Source directories holding the project's helper modules.
utils_path = f'{prefix}/src/utils/'
pp_path = f'{prefix}/src/preprocessing/'
model_path = f'{prefix}/src/modelling/'

# Make the helper modules importable; skip entries already present so that
# re-running the cell does not add duplicates to sys.path.
for module_dir in (utils_path, pp_path, model_path):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [4]:
from data_cleaning import create_fix_length_sequences, split_lat_lon
from geo_spatial import convert_polyline_to_geojson_format, convert_string_to_geojson, haversine_distance, pair_wise_haversine_distance
from model import MultioutputModel

In [5]:
# Load the encoded train/test sets: parquet files from S3 when running on SageMaker,
# otherwise the processed CSV files from the local project checkout.
if is_sagemaker_notebook:
    # NOTE(review): `n_cluster` is not defined in any cell visible in this notebook, so this
    # branch raises NameError unless it is set beforehand — confirm and define it in a config cell.
    train_data = pd.read_parquet(f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_train.parquet')
    test_data = pd.read_parquet(f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_test.parquet')
else:
    train_data = pd.read_csv(f'{prefix}/data/processed/train_data_encoded.csv', header=0,index_col=False)
    test_data = pd.read_csv(f'{prefix}/data/processed/test_data_encoded.csv', header=0, index_col=False)
    

### Multioutput NN model - 8 point sequence

- Time series of first 8 points and 8 last points - haversine distance between points 


Limit to only the necessary attributes

In [8]:
# Columns kept for the sequence model; the same selection is applied to train and test.
model_input_columns = [
    'trip_id',
    'final_point',
    'polyline',
    'n_coordinate_points',
    'total_distance_km',
    'total_flight_time_minutes',
]
X_train = train_data[model_input_columns]
X_test = test_data[model_input_columns]

NameError: name 'train_data' is not defined

In [11]:
# Run the project's polyline conversion on both splits and expose the converted
# column under the name 'sequence'.
X_train, X_test = (
    convert_polyline_to_geojson_format(frame, 'polyline').rename(columns={'polyline': 'sequence'})
    for frame in (X_train, X_test)
)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x10768e8d0>>
Traceback (most recent call last):
  File "/Users/Q619505/Library/Caches/pypoetry/virtualenvs/taxi-destination-prediction-tbwwnVrE-py3.11/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [54]:
# Build fixed-length sequences (parameter 8) from the 'sequence' column of both splits.
X_train, X_test = (
    create_fix_length_sequences(frame, 8, 'sequence')
    for frame in (X_train, X_test)
)

In [103]:
# Derive, per trip, the pairwise haversine distances of the transformed sequence.
for frame in (X_train, X_test):
    frame['distance_sequence'] = frame['sequence_transformed'].apply(
        lambda points: pair_wise_haversine_distance(points)
    )

In [105]:
# Separate targets (final point) from features (distance sequence).
# Targets are extracted first because the feature frame overwrites the same name.
y_train = X_train[['final_point']]
X_train = X_train[['distance_sequence']]

y_test = X_test[['final_point']]
X_test = X_test[['distance_sequence']]

In [143]:
# Expand each distance sequence into one column per step, named x_0, x_1, ...
X_train = pd.DataFrame(list(X_train['distance_sequence'])).add_prefix('x_')
X_test = pd.DataFrame(list(X_test['distance_sequence'])).add_prefix('x_')

In [146]:
# Persist the feature matrices so the modelling section can reload them
# without repeating the preprocessing above.
for frame, csv_name in (
    (X_train, 'X_train_distance_sequence.csv'),
    (X_test, 'X_test_distance_sequence.csv'),
):
    frame.to_csv(csv_name, header=True, index=False)

In [161]:
# `split_lat_lon` is a helper imported from `data_cleaning`, not a DataFrame method, so
# chaining it as `.split_lat_lon(...)` raises AttributeError on a plain pandas DataFrame.
# Call it as a function on the converted frame instead.
# NOTE(review): assumes the signature is split_lat_lon(frame, column_name) — confirm.
y_train = split_lat_lon(
    convert_polyline_to_geojson_format(y_train, 'final_point'),
    'final_point',
)

In [166]:
# Cast both target coordinate columns to float.
for coord_col in ('lon_final_point', 'lat_final_point'):
    y_train[coord_col] = y_train[coord_col].astype(float)

In [168]:
# Drop the 'final_point' column from the training targets.
y_train = y_train.drop(columns=['final_point'])

In [169]:
# `split_lat_lon` is a helper imported from `data_cleaning`, not a DataFrame method, so
# chaining it as `.split_lat_lon(...)` raises AttributeError on a plain pandas DataFrame.
# Call it as a function on the converted frame instead.
# NOTE(review): assumes the signature is split_lat_lon(frame, column_name) — confirm.
y_test = split_lat_lon(
    convert_polyline_to_geojson_format(y_test, 'final_point'),
    'final_point',
)

In [171]:
# Cast both target coordinate columns to float.
for coord_col in ('lon_final_point', 'lat_final_point'):
    y_test[coord_col] = y_test[coord_col].astype(float)

In [172]:
# Drop the 'final_point' column from the test targets.
y_test = y_test.drop(columns=['final_point'])

In [174]:
# Persist the target frames next to the feature CSVs for the modelling section.
for frame, csv_name in (
    (y_train, 'y_train_distance_sequence.csv'),
    (y_test, 'y_test_distance_sequence.csv'),
):
    frame.to_csv(csv_name, header=True, index=False)

### Configure Model

In [6]:
# Seed passed to train_test_split below so the train/validation split is reproducible.
random_state = 20

In [7]:
# Reload the cached training features and targets written by the preprocessing section.
csv_options = dict(header=0, index_col=False)
X_train = pd.read_csv('X_train_distance_sequence.csv', **csv_options)
y_train = pd.read_csv('y_train_distance_sequence.csv', **csv_options)

In [8]:
# Hold out 20% of the training data for validation (seeded for reproducibility).
# This overwrites X_train / y_train, so re-running the cell without reloading the
# CSVs splits the already-reduced training set again.
split_frames = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=random_state,
)
X_train, X_val, y_train, y_val = split_frames

In [9]:
# Instantiate the project's multi-output model.
# NOTE(review): 15 presumably equals the number of pairwise distances between the
# 16 sampled points (first 8 + last 8) — confirm against the X_train column count.
multi_output_model = MultioutputModel(num_timesteps=15)

In [10]:
# Replace the shuffled index left by train_test_split with a fresh 0..n-1 index.
y_train, y_val = (
    targets.reset_index(drop=True)
    for targets in (y_train, y_val)
)

In [11]:
# Display the training targets (longitude/latitude columns) as a sanity check before fitting.
y_train

Unnamed: 0,lon_final_point,lat_final_point
0,-8.609418,41.148972
1,-8.612712,41.169357
2,-8.617572,41.143068
3,-8.605215,41.153616
4,-8.619597,41.168826
...,...,...
1093404,-8.606583,41.144670
1093405,-8.584398,41.168565
1093406,-8.631135,41.162778
1093407,-8.615880,41.145309


In [12]:
# Stop training once the validation loss has not improved for 3 consecutive epochs.
# Validation data is passed to training below, so the stopping criterion should track
# 'val_loss'; monitoring the training 'loss' keeps training going while the model overfits.
callback = EarlyStopping(monitor='val_loss', patience=3)

In [None]:
# Each output head gets its own target: y1_output <- longitude, y2_output <- latitude.
train_targets = {
    'y1_output': y_train.lon_final_point,
    'y2_output': y_train.lat_final_point,
}
val_targets = {
    'y1_output': y_val.lon_final_point,
    'y2_output': y_val.lat_final_point,
}

# Fit for at most 10 epochs in batches of 120, with the early-stopping callback attached.
multi_output_model.train(
    X_train,
    train_targets,
    epochs=10,
    validation_data=(X_val, val_targets),
    batch_size=120,
    callbacks=[callback],
)

Epoch 1/10
[1m9112/9112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 11ms/step - loss: 133.2710 - y1_output_loss: 6.2331 - y1_output_root_mean_squared_error: 2.0851 - y2_output_loss: 127.0359 - y2_output_root_mean_squared_error: 9.2819 - val_loss: 0.0028 - val_y1_output_loss: 0.0019 - val_y1_output_root_mean_squared_error: 0.0437 - val_y2_output_loss: 8.8524e-04 - val_y2_output_root_mean_squared_error: 0.0298
Epoch 2/10
[1m9112/9112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 9ms/step - loss: 0.0110 - y1_output_loss: 0.0020 - y1_output_root_mean_squared_error: 0.0448 - y2_output_loss: 0.0090 - y2_output_root_mean_squared_error: 0.0774 - val_loss: 0.0016 - val_y1_output_loss: 0.0011 - val_y1_output_root_mean_squared_error: 0.0325 - val_y2_output_loss: 5.7930e-04 - val_y2_output_root_mean_squared_error: 0.0241
Epoch 3/10
[1m9112/9112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 10ms/step - loss: 0.0093 - y1_output_loss: 0.0016 - y1_output_root_mean_squar