# Use orca

In [1]:
import pandas as pd
from utils import Timer

def cutomizedCoordinationFix(df):
    df = df.assign(rev=df.dropoff_latitude<df.dropoff_longitude)
    idx = (df['rev'] == 1)
    df.loc[idx,['dropoff_longitude','dropoff_latitude']] = df.loc[idx,['dropoff_latitude','dropoff_longitude']].values
    df.loc[idx,['pickup_longitude','pickup_latitude']] = df.loc[idx,['pickup_latitude','pickup_longitude']].values
    df = df.drop(columns=['rev'])
    return df

def clean_df(df):    
    #reverse incorrectly assigned longitude/latitude values
    df = cutomizedCoordinationFix(df)
    df = df[(df.fare_amount > 0)  & (df.fare_amount <= 500) &
          (df.passenger_count >= 0) & (df.passenger_count <= 8)  &
           ((df.pickup_longitude != 0) & (df.pickup_latitude != 0) & (df.dropoff_longitude != 0) & (df.dropoff_latitude != 0) )]
    
    return df

In [2]:
from featuretools.primitives import IsInGeoBox
from woodwork.logical_types import Ordinal
import featuretools as ft
print(f"featuretools version is {ft.__version__}")

from featuretools.primitives import TransformPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double, LatLong, Datetime, Boolean

import numpy as np

def produce_featuretools_entityset(es, df):
    trip_logical_types = {
        'passenger_count': Ordinal(order=list(range(0, 10))), 
        'pickup_latlong': 'LatLong',
        'dropoff_latlong': 'LatLong',
    }

    es.add_dataframe(dataframe_name="trips",
                     dataframe=df,
                     index="id",
                     time_index='pickup_datetime',
                     logical_types=trip_logical_types)

    return es


class Bearing(TransformPrimitive):
    name = "bearing"
    input_types = [ColumnSchema(logical_type=LatLong), ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    number_output_features = 1
    commutative=True
    def get_function(self):
        def bearing(latlong1, latlong2):
            lat1 = np.array([x[0] for x in latlong1])
            lon1 = np.array([x[1] for x in latlong1])
            lat2 = np.array([x[0] for x in latlong2])
            lon2 = np.array([x[1] for x in latlong2])
            delta_lon = np.radians(lon2 - lon1)
            lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
            x = np.cos(lat2) * np.sin(delta_lon)
            y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(delta_lon)
            return np.degrees(np.arctan2(x, y))
        return bearing
    
class DistanceToLocation(TransformPrimitive):
    name = "distance_to_location"
    input_types = [ColumnSchema(logical_type=LatLong)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={'numeric'})
    number_output_features = 1
    commutative=True
    def __init__(self, point=(0, 0)):
        self.point = point
        self.lat = point[0]
        self.lon = point[1]
        
    def get_function(self):
        def distance_to_location(latlong):
            lat = np.array([x[0] for x in latlong])
            lon = np.array([x[1] for x in latlong])
            tgt_lat = len(lat) * self.lat
            tgt_lon = len(lon) * self.lon
            return self.sphere_dist(tgt_lat, tgt_lon, lat, lon)
        return distance_to_location
    
    def sphere_dist(self, lat1, lon1, lat2, lon2):
        """
        Return distance along great radius between pickup and dropoff coordinates.
        """
        #Define earth radius (km)
        R_earth = 6371
        #Convert degrees to radians
        lat1, lon1, lat2, lon2 = map(np.radians,[lat1, lon1, lat2, lon2])
        #Compute distances along lat, lon dimensions
        dlat = lat2 - lat1
        dlon = lon2 - lon1

        #Compute haversine distance
        a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
        return 2 * R_earth * np.arcsin(np.sqrt(a))


def get_coordination(df):
    df["pickup_latlong"] = df[['pickup_latitude', 'pickup_longitude']].apply(tuple, axis=1)
    df["dropoff_latlong"] = df[['dropoff_latitude', 'dropoff_longitude']].apply(tuple, axis=1)
    df = df.drop(["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"], axis = 1)
    return df

def modelling_features(df, feature_list = None, features_only = False):
    df = get_coordination(df)
    print(df.dtypes)

    es = ft.EntitySet("nyc_taxi_fare")
    es = produce_featuretools_entityset(es, df)
    
    cutoff_time = es['trips'][['id', 'pickup_datetime']]
    
    if feature_list:
        df = ft.calculate_feature_matrix(feature_list, entityset=es, cutoff_time=cutoff_time, verbose=True)
        return df, es, feature_list
    
    # airport coordination
    coordination_dicts = {
        "jfk_coord": (40.639722, -73.778889),
        "ewr_coord": (40.6925, -74.168611),
        "lga_coord": (40.77725, -73.872611),
        "sol_coord": (40.6892,-74.0445), # Statue of Liberty
        "nyc_coord": (40.7141667,-74.0063889) 
    }
    
    trans_primitives = ["day", "year", "month", "weekday", "haversine", "hour", "is_weekend", "is_working_hours", "part_of_day"]
    trans_primitives += ["cityblock_distance", Bearing,
                         IsInGeoBox((40.62, -73.85), (40.70, -73.75)),
                         IsInGeoBox((40.70, -73.97), (40.77, -73.9))]
    trans_primitives += [DistanceToLocation(x) for n, x in coordination_dicts.items()]

    # calculate feature_matrix using deep feature synthesis
    
    ret = ft.dfs(entityset=es,
                      target_dataframe_name="trips",
                      trans_primitives=trans_primitives,
                      verbose=True,
                      cutoff_time=cutoff_time,
                      approximate='36d',
                      max_depth=3,
                      max_features=40, 
                      features_only = features_only)
    if features_only:
        features = ret
    else:
        features = ret[1]
        df = ret[0]
        #df_encoded, features_encoded = ft.encode_features(df, features)
    
    return df




featuretools version is 1.18.0


In [3]:
import os
import pandas as pd
from utils import Timer
from bigdl.orca import init_orca_context, stop_orca_context, OrcaContext
from bigdl.orca.data.pandas import read_csv
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

#start spark by orca
OrcaContext.log_output = True
init_orca_context(cluster_mode="local", cores=8, memory="40g", init_ray_on_spark=True)  # run in local mode

cols = [
    'fare_amount', 'pickup_datetime','pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count'
]
with Timer(f"Load train full"):
    train = read_csv("../data/train.csv", usecols=cols)

with Timer("Data Wrangling for train"):
    train = train.transform_shard(clean_df)
    
with Timer("Feature Engineering by featuretools"):
    train = train.transform_shard(modelling_features)

with Timer("Data Wrangling for train"):
    pdf = pd.concat(train.collect())

#stop spark by orca
stop_orca_context()

Initializing orca context
Current pyspark location is : /home/spark-3.2.1-bin-hadoop3.2/python/pyspark/__init__.py
Start to getOrCreate SparkContext
pyspark_submit_args is:  --driver-class-path /usr/local/lib/python3.8/dist-packages/bigdl/share/core/lib/all-2.1.0.jar:/usr/local/lib/python3.8/dist-packages/bigdl/share/dllib/lib/bigdl-dllib-spark_3.1.2-2.1.0-jar-with-dependencies.jar:/usr/local/lib/python3.8/dist-packages/bigdl/share/orca/lib/bigdl-orca-spark_3.1.2-2.1.0-jar-with-dependencies.jar pyspark-shell 


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2022-12-07 22:16:27 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-12-07 22:16:27 WARN  SparkConf:69 - Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
2022-12-07 22:16:29,864 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
2022-12-07 22:16:30,017 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
2022-12-07 22:16:30,018 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
2022-12-07 22:16:30,019 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
22-12-07 22:16:30 [Thread-4] INFO  Engine$:121 - Auto detect executor number and executor cores number
22-12-07 22:16:30 [Thread-4] INFO  Engine$:123 - Executor number is 1 and executor cores number is 8



User settings:

   KMP_AFFINITY=granularity=fine,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1
   OMP_NUM_THREADS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=384
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_HAND_THREAD=false
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_FORKJOIN_FRAMES=true
   KMP_FORKJOIN_FRAMES_MODE=3
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_INIT_WAIT=2048
   KMP_ITT_PREPARE_DELAY=0
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_

22-12-07 22:16:30 [Thread-4] INFO  ThreadPool$:95 - Set mkl threads to 1 on thread 29
2022-12-07 22:16:30 WARN  SparkContext:69 - Using an existing SparkContext; some configuration may not take effect.
22-12-07 22:16:30 [Thread-4] INFO  Engine$:456 - Find existing spark context. Checking the spark conf...
cls.getname: com.intel.analytics.bigdl.dllib.utils.python.api.Sample
BigDLBasePickler registering: bigdl.dllib.utils.common  Sample
cls.getname: com.intel.analytics.bigdl.dllib.utils.python.api.EvaluatedResult
BigDLBasePickler registering: bigdl.dllib.utils.common  EvaluatedResult
cls.getname: com.intel.analytics.bigdl.dllib.utils.python.api.JTensor
BigDLBasePickler registering: bigdl.dllib.utils.common  JTensor
cls.getname: com.intel.analytics.bigdl.dllib.utils.python.api.JActivity
BigDLBasePickler registering: bigdl.dllib.utils.common  JActivity
Successfully got a SparkContext


2022-12-07 22:16:33,947	INFO services.py:1338 -- View the Ray dashboard at [1m[32mhttp://10.0.2.14:8265[39m[22m


{'node_ip_address': '10.0.2.14', 'raylet_ip_address': '10.0.2.14', 'redis_address': '10.0.2.14:6379', 'object_store_address': '/tmp/ray/session_2022-12-07_22-16-31_205687_232277/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-12-07_22-16-31_205687_232277/sockets/raylet', 'webui_url': '10.0.2.14:8265', 'session_dir': '/tmp/ray/session_2022-12-07_22-16-31_205687_232277', 'metrics_export_port': 60044, 'node_id': '2819d3f8b36f1213f685f73dac03e91c658fdb20a4b145485bf09a60'}
2022-12-07 22:16:34,793 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
2022-12-07 22:16:34,795 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
2022-12-07 22:16:34,798 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
2022-12-07 22:16:34,800 Thread-4 WARN The bufferSize is set to 4000 but bufferedIo is false: false
22-12-07 22:16:34 [Thread-4] INFO  Engine$:121 - Auto detect executor number and executor cores number
22-12-07

[2m[33m(raylet)[0m   if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
                                                                                

Load train full took 60.8455228311941 sec


                                                                                

Data Wrangling for train took 13.091192059218884 sec


fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      object
dropoff_latlong     object
dtype: object
fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      object
dropoff_latlong     object
dtype: object
fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      object
dropoff_latlong     object
dtype: object
fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      object
dropoff_latlong     object
dtype: object
fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      object
dropoff_latlong     object
dtype: object
fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      object
dropoff_latlong     object
dtype: object
fare_amount        float64
pickup_datetime     object
passenger_count      int32
pickup_latlong      objec

Feature Engineering by featuretools took 436.3741386849433 sec


                                                                                

Data Wrangling for train took 59.80269160028547 sec
Stopping orca context


In [26]:
pdf

Unnamed: 0_level_0,fare_amount,passenger_count,"BEARING(dropoff_latlong, pickup_latlong)","CITYBLOCK_DISTANCE(dropoff_latlong, pickup_latlong)",DAY(pickup_datetime),"DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.639722, -73.778889))","DISTANCE_TO_LOCATION(pickup_latlong, point=(40.639722, -73.778889))","DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.6892, -74.0445))","DISTANCE_TO_LOCATION(pickup_latlong, point=(40.6892, -74.0445))","DISTANCE_TO_LOCATION(dropoff_latlong, point=(40.6925, -74.168611))",...,"IS_IN_GEOBOX(dropoff_latlong, point1=(40.62, -73.85), point2=(40.7, -73.75))","IS_IN_GEOBOX(pickup_latlong, point1=(40.62, -73.85), point2=(40.7, -73.75))","IS_IN_GEOBOX(dropoff_latlong, point1=(40.7, -73.97), point2=(40.77, -73.9))","IS_IN_GEOBOX(pickup_latlong, point1=(40.7, -73.97), point2=(40.77, -73.9))",IS_WEEKEND(pickup_datetime),IS_WORKING_HOURS(pickup_datetime),MONTH(pickup_datetime),PART_OF_DAY(pickup_datetime),WEEKDAY(pickup_datetime),YEAR(pickup_datetime)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862908,15.00,1,35.967488,4.439169,1,15226.024029,15221.264881,6192.926729,6188.639503,7854.988394,...,False,False,False,False,False,False,1,midnight,3,2009
647957,5.80,2,11.803491,0.938679,1,15229.439753,15228.547915,6198.147133,6196.885913,7859.840563,...,False,False,False,False,False,False,1,midnight,3,2009
741816,29.40,2,-129.884312,13.297964,1,15208.621355,15223.690776,6182.006263,6192.347656,7848.523401,...,False,False,False,False,False,False,1,midnight,3,2009
730952,7.40,1,159.865822,1.555632,1,15223.379600,15223.798394,6190.811736,6192.606946,7853.475891,...,False,False,False,False,False,False,1,midnight,3,2009
1274780,5.40,1,-120.636784,1.272766,1,15225.109147,15226.603479,6194.453510,6195.280426,7857.095719,...,False,False,True,False,False,False,1,midnight,3,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275668,39.54,1,78.391508,7.528177,30,11405.963569,11416.105616,6154.341751,6151.947603,17350.019168,...,False,False,False,False,False,False,6,midnight,1,2015
459809,29.00,1,48.111879,11.718457,30,11403.409720,11415.837980,6148.509489,6152.363636,17353.344301,...,False,False,False,False,False,False,6,midnight,1,2015
561697,15.00,3,-123.179943,1.798940,30,11405.809869,11403.769736,6152.995558,6152.702806,17351.019709,...,False,False,False,False,False,False,6,midnight,1,2015
326027,14.50,1,-34.356616,5.334359,30,11406.654428,11405.168338,6145.740580,6151.835722,17357.300474,...,False,False,False,False,False,False,6,midnight,1,2015


In [13]:
train.__dict__

{'rdd': PythonRDD[28] at RDD at PythonRDD.scala:53,
 'user_cached': False,
 'eager': True,
 'type': {}}

In [1]:
stop_orca_context()

NameError: name 'stop_orca_context' is not defined