In [22]:
import os, sys
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import time
import json
import sklearn
import awswrangler as wr

In [23]:
# Runtime switch, flipped by hand: True selects the SageMaker project root and
# S3/Parquet I/O in the cells below, False selects the local checkout and CSV I/O.
is_sagemaker_notebook = False

In [24]:
# Root directory that contains the project checkout for the current runtime.
prefix = (
    "/home/ec2-user/SageMaker"
    if is_sagemaker_notebook
    else "/Users/Q619505/PycharmProjects/personal-projects"
)

# Source directories of the project's helper code (utils and preprocessing).
utils_path = os.path.join(f'{prefix}/ml-project-taxi-prediction/src/utils/')
pp_path = os.path.join(f'{prefix}/ml-project-taxi-prediction/src/preprocessing')

# Put both directories on the import path; the membership check keeps sys.path
# free of duplicates when this cell is executed more than once.
for module_dir in (utils_path, pp_path):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [25]:
import utils

In [26]:
# S3 location of the preprocessed inputs; only used when the data is read from
# S3 (is_sagemaker_notebook is True).
prefix_path = 's3://think-tank-casestudy/preprocessed_data'

In [27]:
# Selects the `n_cluster_{n_cluster}` folder in the S3 read and write paths.
# The local CSV paths do not depend on it.
n_cluster = 4000

In [28]:
# Load the train/test frames from the source that matches the runtime.
if not is_sagemaker_notebook:
    # Local run: CSV files inside the project checkout; the first row is the
    # header and no column is used as the index.
    train_csv = f'{prefix}/ml-project-taxi-prediction/data/processed/train_data.csv'
    train_data = pd.read_csv(train_csv, header=0, index_col=False)
    test_csv = f'{prefix}/ml-project-taxi-prediction/data/processed/test_data.csv'
    test_data = pd.read_csv(test_csv, header=0, index_col=False)
else:
    # SageMaker run: clustered Parquet files for the configured n_cluster on S3.
    train_data = wr.s3.read_parquet(
        path=f'{prefix_path}/n_cluster_{n_cluster}/train_data_clustered.parquet'
    )
    test_data = wr.s3.read_parquet(
        path=f'{prefix_path}/n_cluster_{n_cluster}/test_data_clustered.parquet'
    )

In [29]:
# Print a summary of the freshly loaded training frame (index range, columns,
# non-null counts and dtypes).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366762 entries, 0 to 1366761
Data columns (total 15 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   trip_id                    1366762 non-null  int64  
 1   call_type                  1366762 non-null  object 
 2   origin_call                297184 non-null   float64
 3   origin_stand               687325 non-null   float64
 4   taxi_id                    1366762 non-null  int64  
 5   timestamp                  1366762 non-null  object 
 6   day_type                   1366762 non-null  object 
 7   polyline                   1366762 non-null  object 
 8   n_coordinate_points        1366762 non-null  int64  
 9   total_flight_time_minutes  1366762 non-null  float64
 10  start_point                1366762 non-null  object 
 11  dest_point                 1366762 non-null  object 
 12  final_point                1366762 non-null  object 
 13  total_distan

- CALL TYPE --> ONE-HOT ENCODING (no ordinal relationship)
- WEATHER --> ONE-HOT ENCODING (no ordinal relationship)
- ORIGIN STAND --> reduction of high cardinality + ONE-HOT ENCODING
- MONTH/WEEK per year --> ONE-HOT ENCODING or ORDINAL ENCODING

In [30]:
# Apply the project's timestamp helper to both splits using the same source
# column. Judging by the column listing printed further down, this presumably
# adds `timestamp_month`, `timestamp_year` and `year_month` — confirm in utils.
timestamp_column = 'timestamp'
train_data = utils.extend_timestamps(train_data, timestamp_column)
test_data = utils.extend_timestamps(test_data, timestamp_column)

In [31]:
# Run the project's high-cardinality reduction on `origin_stand` for both splits.
# Presumably this produces the `origin_stand_agg` column that shows up in the
# column listing afterwards — helper lives in utils, verify there.
# NOTE(review): each split is processed independently; confirm the helper yields
# the same aggregated categories for train and test.
train_data = utils.reduce_high_cardinality(train_data, ['origin_stand'])
test_data = utils.reduce_high_cardinality(test_data, ['origin_stand'])

In [32]:
# Categorical columns handed to utils.feature_encoding_oh for both splits.
categories_oh = [
    'call_type',
    'origin_stand_agg',
    'year_month',
]

In [33]:
# Display the training frame's column labels after the timestamp and
# high-cardinality helpers have run.
train_data.columns

Index(['trip_id', 'call_type', 'origin_call', 'origin_stand', 'taxi_id',
       'timestamp', 'day_type', 'polyline', 'n_coordinate_points',
       'total_flight_time_minutes', 'start_point', 'dest_point', 'final_point',
       'total_distance_km', 'sequence', 'timestamp_month', 'timestamp_year',
       'year_month', 'origin_stand_agg'],
      dtype='object')

In [34]:
# Encode the columns listed in categories_oh for the training split (one-hot,
# going by the helper's name) and append the encoded columns to the frame.
# pd.concat aligns on the row index, so the helper's output is assumed to share
# train_data's index — TODO confirm in utils.
df_fenc_oh = utils.feature_encoding_oh(train_data, categories_oh)
train_data = pd.concat(objs=[train_data, df_fenc_oh], axis='columns')

In [35]:
# Same encoding step for the test split; df_fenc_oh is overwritten with the
# test encoding. pd.concat aligns on the row index, so the helper's output is
# assumed to share test_data's index — TODO confirm in utils.
# NOTE(review): the encoding is derived from test_data alone, so its column set
# may differ from the training one — verify that a later step reconciles them.
df_fenc_oh = utils.feature_encoding_oh(test_data, categories_oh)
test_data = pd.concat(objs=[test_data, df_fenc_oh], axis='columns')

In [36]:
# NOTE(review): the call passes (train_data, test_data) but unpacks the result
# as (test_data, train_data). This is only correct if utils.add_binary_features
# returns the frames in (test, train) order — confirm against the helper, since
# a mismatch would silently swap the two splits.
test_data, train_data = utils.add_binary_features(train_data, test_data)

In [37]:
# Sanity check: both splits must have the same number of columns at this point.
# The condition is the original one; the message (built only on failure) reports
# the two counts and the labels that exist in just one of the frames, so a
# mismatch can be diagnosed without re-running earlier cells.
assert train_data.shape[1] == test_data.shape[1], (
    f"train/test column count mismatch: "
    f"{train_data.shape[1]} vs {test_data.shape[1]}; "
    f"labels present in only one frame: "
    f"{set(train_data.columns) ^ set(test_data.columns)}"
)

In [38]:
# Columns to exclude from the feature frames. The names are written in upper
# case and lower-cased here so they can be compared with the frames' lower-case
# column labels.
non_features = list(map(str.lower, [
    'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'START_POINT', 'DEST_POINT',
    'TIMESTAMP_MONTH', 'TIMESTAMP_DAY', 'TIMESTAMP_WEEK', 'TIMESTAMP_YEAR',
    'YEAR_MONTH', 'ORIGIN_STAND_agg',
    'MISSING_DATA', 'WEATHER', 'TAXI_ID', 'TIMESTAMP_DT', 'TIMESTAMP',
]))

In [39]:
# Keep only the training columns that are not listed in `non_features`.
kept_columns = [label for label in train_data.columns if label not in non_features]
train_data = train_data[kept_columns]
# The test frame is projected onto the training frame's surviving columns (same
# labels, same order), not onto its own; a label missing from test_data raises
# a KeyError here.
test_data = test_data[list(train_data.columns)]

In [40]:
#n_cluster = train_data.CLUSTER_LABEL.nunique()

In [41]:
# Print a summary of the training frame after encoding and column selection
# (index range, remaining columns, non-null counts and dtypes).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1366762 entries, 0 to 1366761
Data columns (total 65 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   trip_id                    1366762 non-null  int64  
 1   day_type                   1366762 non-null  object 
 2   polyline                   1366762 non-null  object 
 3   n_coordinate_points        1366762 non-null  int64  
 4   total_flight_time_minutes  1366762 non-null  float64
 5   final_point                1366762 non-null  object 
 6   total_distance_km          1366762 non-null  float64
 7   sequence                   1366762 non-null  object 
 8   2013_10                    1366762 non-null  float64
 9   2013_11                    1366762 non-null  float64
 10  2013_12                    1366762 non-null  float64
 11  2013_7                     1366762 non-null  float64
 12  2013_8                     1366762 non-null  float64
 13  2013_9      

In [42]:
# Persist the encoded train/test frames: Parquet on S3 when running on
# SageMaker, CSV files inside the local checkout otherwise.
if is_sagemaker_notebook:
    # awswrangler's `index` argument is a boolean flag (store the DataFrame
    # index in the file or not); the Index object itself must not be passed.
    # False is used because train_data's info() output shows a plain RangeIndex
    # and the CSV branch below drops the index as well.
    # NOTE(review): switch to index=True if a downstream reader relies on a
    # stored index column.
    wr.s3.to_parquet(
        df=train_data,
        index=False,
        path=f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_train.parquet',
    )
    wr.s3.to_parquet(
        df=test_data,
        index=False,
        path=f's3://think-tank-casestudy/features_engineered/n_cluster_{n_cluster}/feature_engineered_test.parquet',
    )
else:
    # Local run: write a header row and omit the index.
    train_data.to_csv(f'{prefix}/ml-project-taxi-prediction/data/processed/train_data_encoded.csv', header=True, index=False)
    test_data.to_csv(f'{prefix}/ml-project-taxi-prediction/data/processed/test_data_encoded.csv', header=True, index=False)
    