### Model training

The filtered, calculated and encoded features can now be trained with appropriate models.

The following approaches are considered:
- Multi classification problem with clustered data
- Mutli regression problem with two outputs (longitude/latitude)

In terms of the data we have the following approaches:
- variable sequence length - can take all points in POLYLINE in consideration, mask sequence if necessary
- fixed sequence length - take 10 points from beginning of POLYLINE and 10 points from end of polyline 


Algorithms:
- Long term short term NN (multi-class classification and regression)
    - able to handle variable sequence length, therefore the total trip POLYLINE can be used
- Random forest(regression and classification) 
    - can handle outliers well as dataset still contains outliers
    - runs efficiently on large data set
    
Metrics:
- Classification of clusters: AUC + Avg distance of last point to cluster center
- Regression: MAPE + Avg distance of last point to cluster center


In [2]:
import os,sys
import pandas as pd 
import numpy as np 
import json
from sklearn.ensemble import RandomForestClassifier

In [3]:
is_sagemaker_notebook = False

In [4]:
if is_sagemaker_notebook:
    prefix = "/home/ec2-user/SageMaker"
else:
        prefix = "/Users/Q619505/PycharmProjects/personal-projects/taxi-destination-prediction"

utils_path = os.path.join(f'{prefix}/src/utils/')
pp_path = os.path.join(f'{prefix}/src/preprocessing/')
model_path = os.path.join(f'{prefix}/src/modelling/')

if utils_path not in sys.path:
    sys.path.append(utils_path)

if pp_path not in sys.path:
    sys.path.append(pp_path)

if model_path not in sys.path:
    sys.path.append(model_path)

In [5]:
from data_cleaning import *
from geo_spatial import *
from model import *

In [6]:
n_cluster = 4000

In [7]:
os.getcwd()

'/Users/Q619505/PycharmProjects/personal-projects/taxi-destination-prediction/notebooks'

In [8]:
if is_sagemaker_notebook:
    train_data = pd.read_parquet(f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_train.parquet')
    test_data = pd.read_parquet(f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_test.parquet')
else:
    train_data = pd.read_csv(f'{prefix}/data/processed/train_data_encoded.csv', header=0,index_col=False)
    test_data = pd.read_csv(f'{prefix}/data/processed/test_data_encoded.csv', header=0, index_col=False)
    

### 1a) Cluster Prediction - Multi class classifier Random forest

In [9]:
non_feature_columns_config_1_list = ['trip_id', 'total_distance_km', 'total_flight_time_minutes', 
'start_point_lon', 'start_point_lat', 'dest_point_lon', 'dest_point_lat', 'cluster_label', 'sequence', 'final_point', 'polyline',
                                    'center_lon', 'index', 'center_lat', 'day_type']

In [10]:
features_config_1 = [column_ for column_ in train_data.columns if not column_ in non_feature_columns_config_1_list]

In [11]:
label_config_1 = ['cluster_label']

In [12]:
train_data_reduced = train_data.sample(20000, axis=0)

In [13]:
#train_data = create_fix_length_sequences(train_data, 10)
#test_data = create_fix_length_sequences(test_data, 10)

In [14]:
X_train = train_data_reduced[features_config_1].to_numpy()
y_train = train_data_reduced[label_config_1]

In [15]:
X_test = test_data[features_config_1].to_numpy()
y_test = test_data[label_config_1]

In [16]:
#start_sequence_train = pd.DataFrame(train_data_reduced.START_SEQUENCE.tolist()).to_numpy()
#stop_sequence_train = pd.DataFrame(train_data_reduced.STOP_SEQUENCE.tolist()).to_numpy()

#start_sequence_test = pd.DataFrame(test_data.START_SEQUENCE.tolist()).to_numpy()
#stop_sequence_test = pd.DataFrame(test_data.STOP_SEQUENCE.tolist()).to_numpy()

In [17]:
#X_train = np.concatenate((X_train, start_sequence_train, stop_sequence_train), axis=1).astype(float)
#X_test = np.concatenate((X_test, start_sequence_test, stop_sequence_test), axis=1).astype(float)

In [18]:
assert(X_train.shape[1] == X_test.shape[1])

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(max_depth=20, random_state=0)
clf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [18]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(min_samples_leaf=0.05, random_state=0)
regr.fit(X_train, y_train)

In [19]:
y_pred = regr.predict(X_test)

In [20]:
df_pred = pd.DataFrame(y_pred, columns = ['LON','LAT'])

In [21]:
df_pred = pd.concat([y_test, df_pred],axis=1)

In [22]:
df_pred['h_distance'] = df_pred.apply(lambda row: haversine_distance(lat1=row.DEST_POINT_LAT,
                                                                     lat2=row.LAT,
                                                                     lon1=row.DEST_POINT_LON,
                                                                     lon2=row.LON), axis=1)

In [25]:
df_pred.h_distance.describe()

count    269.000000
mean       2.103287
std        7.648067
min        0.036385
25%        0.667274
50%        1.035733
75%        1.487928
max      114.168664
Name: h_distance, dtype: float64

### Multioutput NN model - 10 point sequence

In [27]:
train_data = train_data[['trip_id', 'sequence', 'final_point', 'polyline', 'n_coordinate_points', 'total_distance_km','total_flight_time_minutes']]
test_data = test_data[['trip_id', 'sequence', 'final_point','polyline',  'n_coordinate_points', 'total_distance_km','total_flight_time_minutes']]

In [28]:
train_data = convert_polyline_to_geojson_format(data=train_data, name_column='polyline')
test_data = convert_polyline_to_geojson_format(data=test_data, name_column='polyline')

In [29]:
train_data = create_fix_length_sequences(train_data, 10, start_sequence_column='start_point', stop_sequence_column='dest_point', sequence_column='sequence')
test_data = create_fix_length_sequences(test_data, 10,start_sequence_column='start_point', stop_sequence_column='dest_point', sequence_column='sequence')

In [30]:
def _normalize_coordinates(coordinates):
    return (coordinates[0] + 90)/ 180, (coordinates[1] + 180)/ 360

In [31]:
train_data['normalized_polyline'] = train_data['polyline']\
.apply(lambda trip: np.array(list(map(_normalize_coordinates,trip)))[0:10])

test_data['normalized_polyline'] = test_data['polyline']\
.apply(lambda trip: np.array(list(map(_normalize_coordinates,trip)))[0:10])

In [32]:
train_data['dest_point_lat_norm'] = train_data['final_point']\
.apply(lambda coordinates: _normalize_coordinates(convert_string_to_geojson(coordinates))[0]) 
train_data['dest_point_lon_norm'] = train_data['final_point']\
.apply(lambda coordinates: _normalize_coordinates(convert_string_to_geojson(coordinates))[1]) 

In [33]:
test_data['dest_point_lat_norm'] = test_data['final_point']\
.apply(lambda coordinates: _normalize_coordinates(convert_string_to_geojson(coordinates))[0]) 
test_data['dest_point_lon_norm'] = test_data['final_point']\
.apply(lambda coordinates: _normalize_coordinates(convert_string_to_geojson(coordinates))[1]) 

In [51]:
X_train.normalized_polyline

0          [[0.45211864999999996, 0.6142817], [0.45211945...
1          [[0.45200085, 0.61433285], [0.4519980500000000...
2          [[0.45215019999999995, 0.6142787749999999], [0...
3          [[0.45236289999999996, 0.614310975], [0.452362...
4          [[0.4519667, 0.61439025], [0.45196695, 0.61439...
                                 ...                        
1366757    [[0.45230165, 0.614301575], [0.45230145, 0.614...
1366758    [[0.45215295, 0.6142945], [0.45215285, 0.61429...
1366759    [[0.45216589999999995, 0.6142801250000001], [0...
1366760    [[0.4520516, 0.614319125], [0.4520515, 0.61431...
1366761    [[0.4521359, 0.6142795249999999], [0.45213655,...
Name: normalized_polyline, Length: 1366762, dtype: object

In [65]:
X_train.normalized_polyline = X_train.normalized_polyline.apply(lambda sequence : sequence.reshape((1,10,2)))
X_test.normalized_polyline = X_test.normalized_polyline.apply(lambda sequence : sequence.reshape((1,10,2)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [69]:
X_train.normalized_polyline.to_numpy().shape

(1366762,)

In [34]:
X_train = train_data[['normalized_polyline']]
X_test = test_data[['normalized_polyline']]
y_train = train_data[['dest_point_lat_norm', 'dest_point_lon_norm']]
y_test =  test_data[['dest_point_lat_norm', 'dest_point_lon_norm']]

In [36]:
X_train.to_csv(f'{prefix}/ml-project-taxi-prediction/data/processed/X_train.csv', header=True, index=False)
X_test.to_csv(f'{prefix}/ml-project-taxi-prediction/data/processed/X_test.csv', header=True, index=False)
y_train.to_csv(f'{prefix}/ml-project-taxi-prediction/data/processed/y_train.csv', header=True, index=False)
y_test.to_csv(f'{prefix}/ml-project-taxi-prediction/data/processed/y_test.csv', header=True, index=False)

### 1b) Random Forest - Multiclass - Variable sequence

###  2) NN with LSTM - full sequence only - no tabular data

In [None]:
features_config = ['SEQUENCE']
label_config = ['CLUSTER_LABEL']

In [None]:
X_train = train_data[features_config]
y_train = train_data[label_config]
X_test = test_data[features_config]
y_test = test_data[label_config]

In [None]:
#max length of sequence
if train_data.N_COORDINATE_POINTS.max() > test_data.N_COORDINATE_POINTS.max():
    max_sequence= train_data.N_COORDINATE_POINTS.max()
else:
    max_sequence = test_data.N_COORDINATE_POINTS.max()

In [None]:
def create_lstm_shape(sequence_,max_sequence):
    sequence_array = np.array(sequence_)
    zeros_ = np.zeros(2*max_sequence-len(sequence_array))
    return np.append(sequence_array,zeros_).reshape(1,max_sequence,2)

In [None]:
X_test['SEQUENCE'] = X_test.SEQUENCE.apply(lambda sequence_: create_lstm_shape(sequence_,max_sequence))
X_train['SEQUENCE'] = X_train.SEQUENCE.apply(lambda sequence_: create_lstm_shape(sequence_,max_sequence))

In [None]:
lstm_model = Sequential()
lstm_model.add(tensorflow.compat.v1.keras.layers.CuDNNLSTM(200, input_shape=(612,2)))
lstm_model.add(Dense(4000, activation='softmax'))
print(lstm_model.summary())

In [None]:
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])

In [None]:
X_train = np.array([np.array(val) for val in X_train])

In [None]:
X_train

In [None]:
history=lstm_model.fit(X_train,
                       y_train.astype(np.float32), 
                       validation_split=0.2, epochs=100, batch_size=448, verbose=2)

### 2b) NN - fixed sequence

In [None]:
df_sequence_train_start = pd.DataFrame(train_data.START_SEQUENCE.tolist()).fillna(2000)
df_sequence_train_stop = pd.DataFrame(train_data.STOP_SEQUENCE.tolist()).fillna(2000)

df_sequence_test_start = pd.DataFrame(test_data.START_SEQUENCE.tolist()).fillna(2000)
df_sequence_test_stop = pd.DataFrame(test_data.STOP_SEQUENCE.tolist()).fillna(2000)

In [None]:
df_sequence_train_start.shape

In [None]:
#fill nas with arbitrary large number to mask later
#df_sequence_train = pd.DataFrame(train_data.SEQUENCE.tolist()).fillna(2000).to_numpy()
#df_sequence_test = pd.DataFrame(test_data.SEQUENCE.tolist()).fillna(2000).to_numpy()

In [None]:
features_config_2 = ['N_COORDINATE_POINTS','TOTAL_DISTANCE_KM','2013_10',
       '2013_11', '2013_12', '2013_7', '2013_8', '2013_9', '2014_1', '2014_2',
       '2014_3', '2014_4', '2014_5', '2014_6', '2014_7', '10.0', '12.0',
       '13.0', '14.0', '15.0', '18.0', '20.0', '21.0', '23.0', '25.0', '26.0',
       '27.0', '28.0', '33.0', '34.0', '35.0', '36.0', '38.0', '40.0', '42.0',
       '52.0', '53.0', '54.0', '56.0', '57.0', '58.0', '6.0', '60.0', '61.0',
       '63.0', '7.0', '9.0', 'OTHER', 'Cloudy', 'Foggy', 'Rainy', 'Sunny',
       'Windy', 'A', 'B', 'C', '16.0', '2014_10', '2014_11', '2014_12',
       '2014_8', '2014_9', '47.0', '49.0']
label_config_2 = ['CLUSTER_LABEL']

In [None]:
X_train = train_data[features_config_2].astype(float)
X_test = test_data[features_config_2].astype(float)

In [None]:
X_train = pd.concat([df_sequence_train_start,df_sequence_train_stop, X_train], axis=1)
X_test = pd.concat([df_sequence_test_start,df_sequence_test_stop, X_test], axis=1)

In [None]:
y_train = train_data[label_config_2]
y_test =  test_data[label_config_2]

In [None]:
model = Sequential()
model.add(Input(shape=(X_train.shape[1])))
model.add(tensorflow.keras.layers.Masking(mask_value=2000))
model.add(LSTM(200, activation='relu'))
model.add(Dense(4000, activation='softmax'))
print(model.summary())

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])
          
#X_pred_train = model.fit(X_train)