### Model training

The filtered, calculated and encoded features can now be used to train appropriate models.

The following approaches are considered:
- Multi-class classification problem with clustered data
- Multi-output regression problem with two outputs (longitude/latitude)

In terms of the data we have the following approaches:
- variable sequence length - can take all points in POLYLINE in consideration, mask sequence if necessary
- fixed sequence length - take 10 points from beginning of POLYLINE and 10 points from end of polyline 


Algorithms:
- Long short-term memory (LSTM) NN (multi-class classification and regression)
    - able to handle variable sequence length, therefore the total trip POLYLINE can be used
- Random forest(regression and classification) 
    - can handle outliers well as dataset still contains outliers
    - runs efficiently on large data set
    
Metrics:
- Classification of clusters: AUC + Avg distance of last point to cluster center
- Regression: MAPE + Avg distance of last point to cluster center


In [1]:
%pip install fastparquet tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow
from keras import Sequential
from keras.layers import LSTM, Softmax, Dense, Dropout, Flatten, Embedding, Input,Concatenate
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

2023-11-25 18:25:56.432449: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-25 18:25:56.432490: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-25 18:25:56.433615: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-25 18:25:56.440715: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pandas.core.computation.check import N

In [3]:
import os,sys
import pandas as pd 
import numpy as np 
import json
import dask.array as da
from sklearn.ensemble import RandomForestClassifier

In [4]:
is_sagemaker_notebook = True

In [5]:
# Resolve the repository checkout for the current environment and make the
# project's helper modules importable from this notebook.
if is_sagemaker_notebook:
    prefix = "/home/ec2-user/SageMaker"
else:
    prefix = "/Users/Q619505/PycharmProjects"

# Build the paths from their components; the trailing '' makes each result
# end with a path separator.
src_root = os.path.join(prefix, 'ml-project-taxi-prediction', 'src')
utils_path = os.path.join(src_root, 'utils', '')
pp_path = os.path.join(src_root, 'preprocessing', '')

# Append each directory only once so re-running the cell does not grow sys.path.
for module_dir in (utils_path, pp_path):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [6]:
from data_cleaning import *
from geo_spatial import haversine_distance

In [7]:
n_cluster = 4000

In [8]:
# Load the feature-engineered train and test splits (in that order) for the
# configured number of clusters.
train_data, test_data = (
    pd.read_parquet(parquet_uri)
    for parquet_uri in (
        f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_train.parquet',
        f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_test.parquet',
    )
)

### 1a) Random Forest - Regression (longitude/latitude) - Fixed sequence

In [9]:
non_feature_columns_config_1_list = ['TRIP_ID', 'TOTAL_DISTANCE_KM', 'TOTAL_FLIGHT_TIME_MINUTES', 'CENTER_LON', 'CENTER_LAT', 'index',
'START_POINT_LON', 'START_POINT_LAT', 'DEST_POINT_LON', 'DEST_POINT_LAT', 'START_SEQUENCE', 'STOP_SEQUENCE', 'CLUSTER_LABEL', 'SEQUENCE']

In [10]:
train_data = create_fix_length_sequences(train_data, 10)
test_data = create_fix_length_sequences(test_data, 10)

In [11]:
# Every column that is not explicitly excluded is used as a model feature;
# the regression target is the destination coordinate pair.
features_config_1 = [
    name
    for name in train_data.columns
    if name not in non_feature_columns_config_1_list
]
label_config_1 = ['DEST_POINT_LON', 'DEST_POINT_LAT']

In [12]:
# Train on a random subset of at most 300000 trips. The seed makes the subset
# (and therefore the reported metrics) reproducible across runs, and the
# min() keeps the cell working when fewer rows are available.
train_data_reduced = train_data.sample(
    n=min(300000, len(train_data)), axis=0, random_state=0
)

In [13]:
X_train = train_data_reduced[features_config_1].to_numpy()
y_train = train_data_reduced[label_config_1]

In [14]:
X_test = test_data[features_config_1].to_numpy()
y_test = test_data[label_config_1]

In [15]:
def _sequence_matrix(sequence_column):
    """Expand a column of per-trip sequences into a 2-D array, one row per trip."""
    return pd.DataFrame(sequence_column.tolist()).to_numpy()


start_sequence_train = _sequence_matrix(train_data_reduced['START_SEQUENCE'])
stop_sequence_train = _sequence_matrix(train_data_reduced['STOP_SEQUENCE'])

start_sequence_test = _sequence_matrix(test_data['START_SEQUENCE'])
stop_sequence_test = _sequence_matrix(test_data['STOP_SEQUENCE'])

In [16]:
# Append the start/stop sequence columns to the tabular features and cast the
# combined matrix to float.
train_parts = (X_train, start_sequence_train, stop_sequence_train)
test_parts = (X_test, start_sequence_test, stop_sequence_test)
X_train = np.hstack(train_parts).astype(float)
X_test = np.hstack(test_parts).astype(float)

In [17]:
# Train and test matrices must expose the same number of feature columns,
# otherwise the fitted model cannot score the test set.
assert X_train.shape[1] == X_test.shape[1], (
    f"feature count mismatch: train has {X_train.shape[1]} columns, "
    f"test has {X_test.shape[1]}"
)

In [18]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

# Fit one forest on the two-column (lon, lat) target.
# n_jobs=-1 builds the trees on all available cores; with a fixed
# random_state the fitted model does not depend on the number of workers.
# NOTE(review): a float min_samples_leaf is read by scikit-learn as a
# fraction of the training samples, so 0.05 forces every leaf to hold at
# least 5% of the rows — confirm that this coarse a tree is intended.
regr = RandomForestRegressor(min_samples_leaf=0.05, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)

In [19]:
y_pred = regr.predict(X_test)

In [20]:
df_pred = pd.DataFrame(y_pred, columns = ['LON','LAT'])

In [21]:
# pd.concat(axis=1) pairs rows by index label, not by position. The
# predictions carry a fresh 0..n-1 index, so give the ground truth the same
# positional index; otherwise mismatched labels produce NaN-padded rows.
df_pred = pd.concat([y_test.reset_index(drop=True), df_pred], axis=1)

In [22]:
# Haversine distance between the true destination and the predicted point,
# computed trip by trip.
df_pred['h_distance'] = [
    haversine_distance(lat1=true_lat, lat2=pred_lat, lon1=true_lon, lon2=pred_lon)
    for true_lat, pred_lat, true_lon, pred_lon in zip(
        df_pred['DEST_POINT_LAT'],
        df_pred['LAT'],
        df_pred['DEST_POINT_LON'],
        df_pred['LON'],
    )
]

In [25]:
df_pred.h_distance.describe()

count    269.000000
mean       2.103287
std        7.648067
min        0.036385
25%        0.667274
50%        1.035733
75%        1.487928
max      114.168664
Name: h_distance, dtype: float64

### 1b) Random Forest - Multiclass - Variable sequence

###  2) NN with LSTM - full sequence only - no tabular data

In [24]:
import preprocessing

ModuleNotFoundError: No module named 'preprocessing'

In [None]:
features_config = ['SEQUENCE']
label_config = ['CLUSTER_LABEL']

In [None]:
# Take explicit copies: the SEQUENCE column of these frames is overwritten
# further down, and assigning into a slice of train_data/test_data would be
# a chained assignment (SettingWithCopyWarning, ambiguous write target).
X_train = train_data[features_config].copy()
y_train = train_data[label_config].copy()
X_test = test_data[features_config].copy()
y_test = test_data[label_config].copy()

In [None]:
# Longest trip (in coordinate points) across both splits; used as the padded
# sequence length.
max_sequence = max(
    test_data.N_COORDINATE_POINTS.max(),
    train_data.N_COORDINATE_POINTS.max(),
)

In [None]:
def create_lstm_shape(sequence_, max_sequence):
    """Zero-pad one coordinate sequence to a ``(1, max_sequence, 2)`` array.

    Parameters
    ----------
    sequence_ : array-like
        Coordinate values of one trip, either flat
        (``[v0, v1, v2, v3, ...]``) or nested as pairs
        (``[[v0, v1], [v2, v3], ...]``). Values are consumed in row-major
        order, two per time step.
    max_sequence : int
        Number of time steps in the padded output.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, max_sequence, 2)``; unused trailing time
        steps are filled with zeros.

    Raises
    ------
    ValueError
        If the sequence holds more than ``2 * max_sequence`` values.
    """
    n_steps = int(max_sequence)
    # Flatten first so the padding is computed from the number of scalar
    # values; len() of a nested (n, 2) input would only count the pairs.
    flat_values = np.asarray(sequence_, dtype=float).ravel()
    capacity = 2 * n_steps
    if flat_values.size > capacity:
        raise ValueError(
            f"sequence holds {flat_values.size} values but max_sequence="
            f"{n_steps} only allows {capacity}"
        )
    padded = np.zeros(capacity, dtype=float)
    padded[:flat_values.size] = flat_values
    return padded.reshape(1, n_steps, 2)

In [None]:
# Replace every trip's raw sequence with its zero-padded LSTM-shaped array.
X_test['SEQUENCE'] = X_test['SEQUENCE'].apply(create_lstm_shape, args=(max_sequence,))
X_train['SEQUENCE'] = X_train['SEQUENCE'].apply(create_lstm_shape, args=(max_sequence,))

In [None]:
# Sequence-only classifier: one LSTM over the padded (lon, lat) sequence,
# followed by a softmax over the destination clusters.
lstm_model = Sequential()
# Derive the input length and the number of classes from the notebook
# configuration instead of hard-coding them, so the model stays consistent
# when the data or n_cluster changes.
lstm_model.add(Input(shape=(int(max_sequence), 2)))
# The standard LSTM layer with its default arguments selects the cuDNN kernel
# on GPU and also runs on CPU, unlike the legacy compat.v1 CuDNNLSTM layer.
lstm_model.add(LSTM(200))
lstm_model.add(Dense(n_cluster, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
lstm_model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, AUC metric.
# NOTE(review): 'categorical_crossentropy' expects one-hot targets with one
# column per class, while the labels selected in this section are the single
# CLUSTER_LABEL column — confirm the labels are one-hot encoded before fit(),
# or switch to a sparse loss together with a metric that accepts sparse labels.
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])

In [None]:
# Iterating a DataFrame yields its column labels, not its rows, so stack the
# per-trip (1, max_sequence, 2) arrays stored in the SEQUENCE column into a
# single (n_trips, max_sequence, 2) tensor instead.
X_train = np.concatenate(X_train['SEQUENCE'].tolist(), axis=0)

In [None]:
X_train

In [None]:
history=lstm_model.fit(X_train,
                       y_train.astype(np.float32), 
                       validation_split=0.2, epochs=100, batch_size=448, verbose=2)

### 2b) NN - fixed sequence

In [None]:
def _padded_sequence_frame(sequence_column):
    """Spread per-trip sequences over columns and fill the gaps with 2000."""
    frame = pd.DataFrame(sequence_column.tolist())
    return frame.fillna(2000)


df_sequence_train_start = _padded_sequence_frame(train_data['START_SEQUENCE'])
df_sequence_train_stop = _padded_sequence_frame(train_data['STOP_SEQUENCE'])

df_sequence_test_start = _padded_sequence_frame(test_data['START_SEQUENCE'])
df_sequence_test_stop = _padded_sequence_frame(test_data['STOP_SEQUENCE'])

In [None]:
df_sequence_train_start.shape

In [None]:
#fill nas with arbitrary large number to mask later
#df_sequence_train = pd.DataFrame(train_data.SEQUENCE.tolist()).fillna(2000).to_numpy()
#df_sequence_test = pd.DataFrame(test_data.SEQUENCE.tolist()).fillna(2000).to_numpy()

In [None]:
features_config_2 = ['N_COORDINATE_POINTS','TOTAL_DISTANCE_KM','2013_10',
       '2013_11', '2013_12', '2013_7', '2013_8', '2013_9', '2014_1', '2014_2',
       '2014_3', '2014_4', '2014_5', '2014_6', '2014_7', '10.0', '12.0',
       '13.0', '14.0', '15.0', '18.0', '20.0', '21.0', '23.0', '25.0', '26.0',
       '27.0', '28.0', '33.0', '34.0', '35.0', '36.0', '38.0', '40.0', '42.0',
       '52.0', '53.0', '54.0', '56.0', '57.0', '58.0', '6.0', '60.0', '61.0',
       '63.0', '7.0', '9.0', 'OTHER', 'Cloudy', 'Foggy', 'Rainy', 'Sunny',
       'Windy', 'A', 'B', 'C', '16.0', '2014_10', '2014_11', '2014_12',
       '2014_8', '2014_9', '47.0', '49.0']
label_config_2 = ['CLUSTER_LABEL']

In [None]:
X_train = train_data[features_config_2].astype(float)
X_test = test_data[features_config_2].astype(float)

In [None]:
# pd.concat(axis=1) pairs rows by index label. The sequence frames were built
# from plain lists and therefore carry a 0..n-1 index, so reset the tabular
# features to the same positional index before joining them column-wise.
X_train = pd.concat(
    [df_sequence_train_start, df_sequence_train_stop, X_train.reset_index(drop=True)],
    axis=1,
)
X_test = pd.concat(
    [df_sequence_test_start, df_sequence_test_stop, X_test.reset_index(drop=True)],
    axis=1,
)

In [None]:
y_train = train_data[label_config_2]
y_test =  test_data[label_config_2]

In [None]:
# Fixed-length variant: every input column is fed to the LSTM as one time
# step carrying a single value, and steps equal to the fill value 2000 are
# masked out.
model = Sequential()
# LSTM layers need a (timesteps, features) input shape, and `shape` must be a
# tuple — `(n)` without a trailing comma is just the integer n.
# NOTE(review): the training matrix must be reshaped to
# (n_samples, n_columns, 1) before it is passed to fit().
model.add(Input(shape=(X_train.shape[1], 1)))
model.add(tensorflow.keras.layers.Masking(mask_value=2000))
model.add(LSTM(200, activation='relu'))
# One output unit per destination cluster, taken from the notebook configuration.
model.add(Dense(n_cluster, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])
          
#X_pred_train = model.fit(X_train)