In [1]:
!squeue -u aschade

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)


In [2]:
!scancel -u aschade

In [1]:
from logging import log
from os.path import exists
from os import makedirs
from glob import glob
from tqdm import tqdm
from retrying import retry
from time import time, sleep
import shutil

import matplotlib.pyplot as plt
import matplotlib.dates as matdates
import numpy as np
import pandas as pd
pd.options.display.max_rows = 150
pd.options.display.max_columns = 50
import seaborn as sns
sns.set_style("whitegrid")

import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
# dask.config.set({
#     'distributed.workers.memory.spill':          None, 
#     'distributed.scheduler.allowed-failures':    10,
# })

########################################################################################################################
########################################################################################################################
########################################################################################################################

def partialReadFile(file, start, end):
    with open(file) as f:
        for i, line in enumerate(f):
            if i in range(start, end):
                print(f'{i:10} {line.strip()}')
            if i > end: 
                break
                
def getCleanDDFfromCSV(path, dt, scheduler=None):
    
    ############################## ridding of superfluous cols ###############################
    ddf = dd.read_csv(path)
    for col in dt.copy().keys():
        if col not in ddf.columns:
            del dt[col]


    ###################################### converters ########################################
    def coerceToFloat(something):
        try: return float(something)
        except ValueError: return np.nan

    def coerceToInt(something):
        try: return int(float(something))           # tries to convert string to int in decimal number system, gets confused by having to parse the comma
        except ValueError: return -1

    def coerceToStr(something):
        try: return str(something)
        except ValueError: return 'nan'

    floatColumns = [column for column, dtype in dt.items() if dtype == 'float']
    intColumns = [column for column, dtype in dt.items() if dtype == 'int']
    strColumns = [column for column, dtype in dt.items() if dtype == 'str']

    converters = {}
    converters.update(dict.fromkeys(floatColumns, coerceToFloat))
    converters.update(dict.fromkeys(intColumns, coerceToInt))
    converters.update(dict.fromkeys(strColumns, coerceToStr))


    ##################################### point to data #########################################
    ddf = dd.read_csv(

        ################# actual data stuff #################
        path, 
        usecols=dt.keys(),
        converters=converters,
        keep_default_na=True,
        

        ################# auxiliary fixes #################
        encoding='utf8',                #'utf8', #'unicode_escape', 'latin-1'
        encoding_errors='replace',      
        on_bad_lines='skip',
        # low_memory=False,             # in pandas, tells it not to read in chunks for consistent dtypes
        # blocksize=None,                 # function may file if csv contains quoted strings within field -> None tells it not to split files into multiple partitions, at cost of reduced parallelism (in fact, it totally ruins any parallelisation, turning it back to pandas basically)
        )

    if 'date' in ddf.columns:
        ddf['time'] = ddf['date'].apply(lambda text: text[-8:], meta=('date', 'object'))
        ddf['date'] = ddf['date'].apply(lambda text: text[:10], meta=('date', 'object'))
        ddf['date'] = dd.to_datetime(
            ddf['date'], 
            errors='coerce', 
            exact=False,                    # allows match to be found anywhere in string
            infer_datetime_format=True,     # if no 'format' is given, infer it -> potential speed-up, we only look at one year at a time, when the format should be consistent
            )

    if scheduler:
        return scheduler.persist(ddf)
    else:
        return ddf
    


########################################################################################################################
########################################################################################################################
########################################################################################################################

cluster = SLURMCluster(
    cores=48,                          
#     memory='600GB',  
    memory='1400GB',
    
    local_directory='~/scratch',
    job_extra=[
        '--reservation=lab_rdurante_304',
        '--partition=haswell',    
        '--nodes=1',
        '--time=06:00:00',
        
        '--job-name=dask',
        '--output=dask.out', 
        '--error=dask.error', 
        '--mail-user=aaron.schade@upf.edu',
        '--mail-type=NONE', 
    ],    
    n_workers=1,                 # this is internal to one job? one node? 
    
    interface='ib0',               # workers, no diag: em1, em2, ib0,   # no workers: lo, em1.851, idrac, em3 & em4 (no ipv4)
    scheduler_options={
#         'interface': 'em1',      # it wont allow you specify both an interface AND a host address
        'host': '10.30.50.163',    # launch on this address, open dashboard on the other?
    },
)
cluster.scale(jobs=1)


scheduler = Client(cluster)
print(scheduler)
dashboardLink = scheduler.dashboard_link.replace('10.30.50.163', '10.60.110.163')
# dashboardLink = scheduler.dashboard_link
print(dashboardLink)
print(dashboardLink.replace('status', 'workers'))
print(dashboardLink.replace('status', 'graph'))

  from distributed.utils import tmpfile


<Client: 'tcp://10.30.50.163:39924' processes=0 threads=0, memory=0 B>
http://10.60.110.163:8787/status
http://10.60.110.163:8787/workers
http://10.60.110.163:8787/graph


In [4]:
sleep(20)

In [5]:
!squeue -u aschade

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          19065572   haswell     dask  aschade  R       0:19      1 mr-05-01


In [4]:
################################################################################################################################
############################################ original orderProducts data #######################################################
################################################################################################################################

In [11]:
orderProducts = dd.read_parquet('~/scratch/glovoBucket/spain_orders_products_2019_present').repartition(npartitions=100)
orderProducts.head()

Unnamed: 0,order_id,order_date_time,order_category,product_name,product_quantity,order_description,purchases_total_price,total_delivery_time_minutes,courier_total_distance_driven,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,city_name,user_registration_date,user_registration_city,store_signup_date,store_sub_category,store_id,store_address_id,customer_id,courier_id,delivery_revenue,order_total_delivery_revenue_without_tax_eur,order_service_revenue_without_tax_eur,order_weather_revenue_without_tax_eur,order_basket_revenue_without_tax_eur,order_total_partner_commission_eur,order_is_prime,order_bonus_coefficient,order_bonus_reason,order_base_cost_eur,order_bonus_cost_eur,order_courier_rain_bonus_eur,order_total_cost_eur
0,3e1acd54cfbdfe415bd834fca2de8b88ce3fb342,2019-02-01 10:14:02,QUIERO,,,Recoger un pedido de frutos secos de la tienda...,66.83,47.6833,7.179,41.386247,2.184471,41.389679,2.153811,Barcelona,2016-04-22 11:03:45,Pamplona,NaT,,,,f05c7412bc3260c21d1b21bdd667f6aa76dbd8de,066d354c4a7dfed8e0873e4fdae6bc8a1ccda6bb,5.70248,5.70248,0.0,0.0,0.0,0.0,False,130,BAD_WEATHER,5.2912,1.58,1.58,6.8712
1,3a3af7adcb3a5177d8843040dafbd520fe835e44,2019-02-01 10:14:18,SNACKS,Caja grande dulce (32 uds),1.0,1 x Caja grande dulce (32 uds) - Classic x 1...,40.8,93.8167,11.01,40.431601,-3.674932,40.474609,-3.687591,Madrid,2018-10-25 07:40:16,,2018-02-13 12:40:50,Snacks,775bc5c30e27f0e562115d136e7f7edbd3cead89,982fd8b711279888a3b54f5af24f185041d22ee6,a4722747fa818c0d65fe997c0d4839c363bacb25,f1331461a192cfed6392777243cfa63e01ea054f,3.22314,3.22314,0.0,0.0,0.0,8.16,False,100,,6.53673,0.0,0.0,6.53673
2,3a3af7adcb3a5177d8843040dafbd520fe835e44,2019-02-01 10:14:18,SNACKS,Caja grande dulce (32 uds),1.0,1 x Caja grande dulce (32 uds) - Classic x 1...,40.8,93.8167,11.01,40.431601,-3.674932,40.474609,-3.687591,Madrid,2018-10-25 07:40:16,,2018-02-13 12:40:50,Snacks,775bc5c30e27f0e562115d136e7f7edbd3cead89,982fd8b711279888a3b54f5af24f185041d22ee6,a4722747fa818c0d65fe997c0d4839c363bacb25,f1331461a192cfed6392777243cfa63e01ea054f,3.22314,3.22314,0.0,0.0,0.0,8.16,False,100,,6.53673,0.0,0.0,6.53673
3,cf4278f834d0a5659f97ba163f79a93ad2e9852f,2019-02-02 09:43:03,QUIERO,,,2 pastelitos de queso latino\n3 cachitos de qu...,16.8,56.0333,10.601,40.445481,-3.703089,40.48615,-3.65089,Madrid,2016-09-07 10:55:23,Madrid,NaT,,,,359507ce832abc3490740980cf3b1b7bfcd3ff75,ead74f34b8a34bb7e9371765c5233535e906d074,7.35537,7.35537,0.0,0.0,0.0,0.0,False,100,,6.13601,0.0,0.0,6.13601
4,482976c8fb9b07d8414b49cf5a007da35cc0ca7c,2019-02-01 10:14:18,SNACKS,Churros (paquete de 6 uds),1.0,1 x Churros (paquete de 6 uds)\n2 x Porras (pa...,20.0,51.2667,6.959,40.416779,-3.706987,40.408479,-3.689028,Madrid,2016-03-07 08:27:17,,2016-03-19 21:50:45,Snacks,9a15f42d1c524c306eb91c3df1216db248a8f224,329dc1daf9fb9d5e75d687dd9e0740e1c72796c3,e3731c4e641c5ef934325166ed1b3b859abbd045,06fd0e2df097a0f9d669ab164a769106cfd2bf9b,1.57025,1.57025,0.0,0.0,0.0,5.0,False,100,,4.86606,0.0,0.0,4.86606


In [12]:
(
    orderProducts
    .set_index('order_date_time', compute=False)                # npartitions='auto' could be the culprit of the 'DTypeMeta has not attribute xxx'
    .to_parquet(
        '~/scratch/glovo/parquet/orderProducts/', 
        name_function=lambda x: f'orderProducts-{x:03}.parquet',
        engine='fastparquet',               
        compression='snappy', 
        write_index=True, 
        append=False, 
        write_metadata_file=True, 
        schema='infer', 
        )
)

[None]

In [13]:
test = dd.read_parquet('~/scratch/glovo/parquet/orderProducts/')
test.head()

Unnamed: 0_level_0,order_id,order_category,product_name,product_quantity,order_description,purchases_total_price,total_delivery_time_minutes,courier_total_distance_driven,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,city_name,user_registration_date,user_registration_city,store_signup_date,store_sub_category,store_id,store_address_id,customer_id,courier_id,delivery_revenue,order_total_delivery_revenue_without_tax_eur,order_service_revenue_without_tax_eur,order_weather_revenue_without_tax_eur,order_basket_revenue_without_tax_eur,order_total_partner_commission_eur,order_is_prime,order_bonus_coefficient,order_bonus_reason,order_base_cost_eur,order_bonus_cost_eur,order_courier_rain_bonus_eur,order_total_cost_eur
order_date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
2019-02-01 00:00:01,eafbafae869af4f2d96858fbf4f6ccd6fd49ad2e,RESTAURANT,VIPS Club,1.0,1 x Quesadilla de Jamón y Queso - Salsa de Qu...,35.4,37.3,1.134,40.439811,-3.682568,40.447795,-3.679396,Madrid,2016-03-09 19:57:43,Madrid,2017-01-22 11:29:14,Food,7a3673352434418a2371b63e704acf83e13e2ad5,cd5cd943f94aa185849b133cfa5bb3bd80e2c5f3,f9b385830d32774c261b9acd18753bb5cd8c7c0e,c016e429708593e92d15ea336dab6af7188e55d9,2.89256,4.13223,0.0,1.23967,0.0,7.08,False,130,BAD_WEATHER,3.2362,0.96,0.96,4.1962
2019-02-01 00:00:01,eafbafae869af4f2d96858fbf4f6ccd6fd49ad2e,RESTAURANT,Quesadilla de Jamón y Queso,1.0,1 x Quesadilla de Jamón y Queso - Salsa de Qu...,35.4,37.3,1.134,40.439811,-3.682568,40.447795,-3.679396,Madrid,2016-03-09 19:57:43,Madrid,2017-01-22 11:29:14,Food,7a3673352434418a2371b63e704acf83e13e2ad5,cd5cd943f94aa185849b133cfa5bb3bd80e2c5f3,f9b385830d32774c261b9acd18753bb5cd8c7c0e,c016e429708593e92d15ea336dab6af7188e55d9,2.89256,4.13223,0.0,1.23967,0.0,7.08,False,130,BAD_WEATHER,3.2362,0.96,0.96,4.1962
2019-02-01 00:00:01,eafbafae869af4f2d96858fbf4f6ccd6fd49ad2e,RESTAURANT,Chicken Fingers,1.0,1 x Quesadilla de Jamón y Queso - Salsa de Qu...,35.4,37.3,1.134,40.439811,-3.682568,40.447795,-3.679396,Madrid,2016-03-09 19:57:43,Madrid,2017-01-22 11:29:14,Food,7a3673352434418a2371b63e704acf83e13e2ad5,cd5cd943f94aa185849b133cfa5bb3bd80e2c5f3,f9b385830d32774c261b9acd18753bb5cd8c7c0e,c016e429708593e92d15ea336dab6af7188e55d9,2.89256,4.13223,0.0,1.23967,0.0,7.08,False,130,BAD_WEATHER,3.2362,0.96,0.96,4.1962
2019-02-01 00:00:02,793fbf5ab858c8efa87205b2914f72c8e9ebe42b,RESTAURANT,"Hamburguesa de ternera, lechuga, tomate y queso",1.0,1 x Ensalada Mediterránea \n1 x Hamburguesa de...,7.0,34.7333,6.037,39.491367,-0.376666,39.463103,-0.380344,Valencia,2018-12-25 21:27:48,Valencia,2016-11-26 21:45:09,Italian,c28dc1fe18f79267567f68702332aceb8a847953,06166b3e8f5fb777b195dba42555e2aac7c435a8,d1de53de0280ee550b77e94f077da85825dbffd7,11ff9058d397c0b7e1062db8e2e0fe0b7f383ef7,2.39669,2.39669,0.0,0.0,0.0,2.1,False,100,,4.44888,0.0,0.0,4.44888
2019-02-01 00:00:02,793fbf5ab858c8efa87205b2914f72c8e9ebe42b,RESTAURANT,Ensalada Mediterránea,1.0,1 x Ensalada Mediterránea \n1 x Hamburguesa de...,7.0,34.7333,6.037,39.491367,-0.376666,39.463103,-0.380344,Valencia,2018-12-25 21:27:48,Valencia,2016-11-26 21:45:09,Italian,c28dc1fe18f79267567f68702332aceb8a847953,06166b3e8f5fb777b195dba42555e2aac7c435a8,d1de53de0280ee550b77e94f077da85825dbffd7,11ff9058d397c0b7e1062db8e2e0fe0b7f383ef7,2.39669,2.39669,0.0,0.0,0.0,2.1,False,100,,4.44888,0.0,0.0,4.44888


In [None]:
################################################################################################################################
#################################################### derive orders #############################################################
################################################################################################################################

In [7]:
orders = orderProducts.reset_index().groupby(['order_id']).first().repartition(npartitions=100)
orders

Unnamed: 0_level_0,index,order_date_time,order_category,product_name,product_quantity,order_description,purchases_total_price,total_delivery_time_minutes,courier_total_distance_driven,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,city_name,user_registration_date,user_registration_city,store_signup_date,store_sub_category,store_id,store_address_id,customer_id,courier_id,delivery_revenue,order_total_delivery_revenue_without_tax_eur,order_service_revenue_without_tax_eur,order_weather_revenue_without_tax_eur,order_basket_revenue_without_tax_eur,order_total_partner_commission_eur,order_is_prime,order_bonus_coefficient,order_bonus_reason,order_base_cost_eur,order_bonus_cost_eur,order_courier_rain_bonus_eur,order_total_cost_eur
npartitions=400,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
,int64,datetime64[ns],object,object,float64,object,float64,float64,float64,float64,float64,float64,float64,object,datetime64[ns],object,datetime64[ns],object,object,object,object,object,float64,float64,float64,float64,float64,float64,bool,int64,object,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
(
    orders
    .set_index('order_date_time', compute=False)
    .to_csv(
        '~/scratch/glovo/parquet/ordersCSV/', 
        name_function=lambda x: f'orders-{x:03}.csv',
    )
)

In [17]:
ordersCSV = dd.read_csv('~/scratch/glovo/parquet/ordersCSV/*').repartition(npartitions=100)
ordersCSV

Unnamed: 0_level_0,order_date_time,index,order_category,product_name,product_quantity,order_description,purchases_total_price,total_delivery_time_minutes,courier_total_distance_driven,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,city_name,user_registration_date,user_registration_city,store_signup_date,store_sub_category,store_id,store_address_id,customer_id,courier_id,delivery_revenue,order_total_delivery_revenue_without_tax_eur,order_service_revenue_without_tax_eur,order_weather_revenue_without_tax_eur,order_basket_revenue_without_tax_eur,order_total_partner_commission_eur,order_is_prime,order_bonus_coefficient,order_bonus_reason,order_base_cost_eur,order_bonus_cost_eur,order_courier_rain_bonus_eur,order_total_cost_eur
npartitions=100,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
,object,int64,object,object,float64,object,float64,float64,float64,float64,float64,float64,float64,object,object,object,object,object,object,object,object,object,float64,float64,float64,float64,float64,float64,bool,float64,object,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [18]:
ordersCSV = ordersCSV.rename(columns={'index': 'indexNumber'})
ordersCSV = ordersCSV.drop(columns=['product_name', 'product_quantity'])
ordersCSV.order_date_time = dd.to_datetime(ordersCSV.order_date_time)
ordersCSV.dtypes

order_date_time                                 datetime64[ns]
indexNumber                                              int64
order_category                                          object
order_description                                       object
purchases_total_price                                  float64
total_delivery_time_minutes                            float64
courier_total_distance_driven                          float64
pickup_latitude                                        float64
pickup_longitude                                       float64
delivery_latitude                                      float64
delivery_longitude                                     float64
city_name                                               object
user_registration_date                                  object
user_registration_city                                  object
store_signup_date                                       object
store_sub_category                                     

In [19]:
ordersCSV

Unnamed: 0_level_0,order_date_time,indexNumber,order_category,order_description,purchases_total_price,total_delivery_time_minutes,courier_total_distance_driven,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,city_name,user_registration_date,user_registration_city,store_signup_date,store_sub_category,store_id,store_address_id,customer_id,courier_id,delivery_revenue,order_total_delivery_revenue_without_tax_eur,order_service_revenue_without_tax_eur,order_weather_revenue_without_tax_eur,order_basket_revenue_without_tax_eur,order_total_partner_commission_eur,order_is_prime,order_bonus_coefficient,order_bonus_reason,order_base_cost_eur,order_bonus_cost_eur,order_courier_rain_bonus_eur,order_total_cost_eur
npartitions=100,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
,datetime64[ns],int64,object,object,float64,float64,float64,float64,float64,float64,float64,object,object,object,object,object,object,object,object,object,float64,float64,float64,float64,float64,float64,bool,float64,object,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [20]:
(
    ordersCSV
    .set_index('order_date_time', compute=False, npartitions=100)                # npartitions='auto' could be the culprit of the 'DTypeMeta has not attribute xxx'
    .to_parquet(
        '~/scratch/glovo/parquet/orders/', 
        name_function=lambda x: f'orders-{x:03}.parquet',
        engine='fastparquet',               
        compression='snappy', 
        write_index=True, 
        append=False, 
        write_metadata_file=True, 
        schema='infer', 
        )
)
# datetime is missing

[None]

In [21]:
test = dd.read_parquet('~/scratch/glovo/parquet/orders/')
test.dtypes

indexNumber                                       int64
order_category                                   object
order_description                                object
purchases_total_price                           float64
total_delivery_time_minutes                     float64
courier_total_distance_driven                   float64
pickup_latitude                                 float64
pickup_longitude                                float64
delivery_latitude                               float64
delivery_longitude                              float64
city_name                                        object
user_registration_date                           object
user_registration_city                           object
store_signup_date                                object
store_sub_category                               object
store_id                                         object
store_address_id                                 object
customer_id                                     

In [23]:
test.index

Dask Index Structure:
npartitions=100
2019-02-01 00:00:01    datetime64[ns]
2019-02-07 21:20:45               ...
                            ...      
2020-12-27 13:43:57               ...
2021-01-17 16:01:03               ...
Name: order_date_time, dtype: datetime64[ns]
Dask Name: read-parquet, 200 tasks

In [22]:
test.head()

Unnamed: 0_level_0,indexNumber,order_category,order_description,purchases_total_price,total_delivery_time_minutes,courier_total_distance_driven,pickup_latitude,pickup_longitude,delivery_latitude,delivery_longitude,city_name,user_registration_date,user_registration_city,store_signup_date,store_sub_category,store_id,store_address_id,customer_id,courier_id,delivery_revenue,order_total_delivery_revenue_without_tax_eur,order_service_revenue_without_tax_eur,order_weather_revenue_without_tax_eur,order_basket_revenue_without_tax_eur,order_total_partner_commission_eur,order_is_prime,order_bonus_coefficient,order_bonus_reason,order_base_cost_eur,order_bonus_cost_eur,order_courier_rain_bonus_eur,order_total_cost_eur
order_date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
2019-02-01 00:00:01,116,RESTAURANT,1 x Quesadilla de Jamón y Queso - Salsa de Qu...,35.4,37.3,1.134,40.439811,-3.682568,40.447795,-3.679396,Madrid,2016-03-09 19:57:43,Madrid,2017-01-22 11:29:14,Food,7a3673352434418a2371b63e704acf83e13e2ad5,cd5cd943f94aa185849b133cfa5bb3bd80e2c5f3,f9b385830d32774c261b9acd18753bb5cd8c7c0e,c016e429708593e92d15ea336dab6af7188e55d9,2.89256,4.13223,0.0,1.23967,0.0,7.08,False,130.0,BAD_WEATHER,3.2362,0.96,0.96,4.1962
2019-02-01 00:00:02,119,RESTAURANT,1 x Ensalada Mediterránea \n1 x Hamburguesa de...,7.0,34.7333,6.037,39.491367,-0.376666,39.463103,-0.380344,Valencia,2018-12-25 21:27:48,Valencia,2016-11-26 21:45:09,Italian,c28dc1fe18f79267567f68702332aceb8a847953,06166b3e8f5fb777b195dba42555e2aac7c435a8,d1de53de0280ee550b77e94f077da85825dbffd7,11ff9058d397c0b7e1062db8e2e0fe0b7f383ef7,2.39669,2.39669,0.0,0.0,0.0,2.1,False,100.0,,4.44888,0.0,0.0,4.44888
2019-02-01 00:00:07,121,MCDONALDSES,1 x McMenú® Grand McExtreme™ de McDonald's Bac...,16.5,37.0333,4.093,41.403863,2.17347,41.412427,2.163945,Barcelona,2017-07-16 16:46:12,Barcelona,2018-03-05 18:07:21,Burger,9e28ad3523c0b923c3c78bc6feb63dd6bbd8018a,111c0fbe4fd260a6629482ca0b84c368558d9804,652beecd98b7e606b4fdae8356ae1906ae532b5d,f3c984afde79065340b9e2c4eccd57ae2a3f5fd2,0.0,0.0,0.0,0.0,0.0,0.0,False,100.0,,4.44623,0.0,0.0,4.44623
2019-02-01 00:00:13,123,RESTAURANT,2 x TGB Burger\n2 x Patatas Bastón Medianas - ...,16.8,42.0,6.539,40.429598,-3.714873,40.406064,-3.679678,Madrid,2017-06-06 20:31:35,Madrid,2016-05-20 14:46:18,Burger,0fdf6a63ef21d8a712f68633ace5ce08cd1914ac,abb2e1a13ed77d8474c3b10faa63c9332574605f,da51e1606c83913855818dc86779edb62c5d3c51,cc30ec188515740daa39ad8b9bfa3d6c20644159,2.39669,3.78512,1.38843,0.0,0.0,5.04,False,130.0,BAD_WEATHER,4.72326,1.41,1.41,6.13326
2019-02-01 00:00:19,125,RESTAURANT,1 x Deluxe Mix (12 uds.)\n1 x Maki Mix (24 uds.),22.8,56.75,14.671,40.431684,-3.709129,40.381163,-3.743104,Madrid,2018-07-22 19:01:28,,2018-02-22 21:10:23,Japanese,71b609464d8da42a233ab998e58ad9a734b6aae6,c5350ecc830d88e2a7f4934547c002d7eed67df3,6b3d81d47a5728495b34c7ebd6fa6136204b3f20,94d810236080fddaba45a060cad3e680c5682102,2.89256,4.77686,1.8843,0.0,0.0,9.12,False,130.0,BAD_WEATHER,7.50981,2.25,2.25,9.75981


In [None]:
################################################################################################################################
###################################################### stores ##################################################################
################################################################################################################################

In [4]:
cols = ['store_id','store_address_id','store_creation_time','store_name','city_code','store_type','store_tag','is_food','store_segment','store_description','first_order_created','delivered_orders']
len(cols)

12

In [5]:
len(stores.columns)

12

In [2]:
stores = dd.read_csv('~/scratch/glovoBucket/stores/stores.csv') # no header
stores.head()


Unnamed: 0,910f300caeed28799b8cc0784d05d28626e50da7,466af01ae4bf46aaa90e24b69c3fc3e47939fe66,2018-08-02 19:16:22.000000,Wendy's,BUE,RESTAURANT,Burger,true,Big Chain,Calidad es nuestra receta,2018-08-09 10:16:51.000000,11614
0,de05f4b90df6f84ad178a52596b6e181f48a1b26,072c655ca3c54ef0ce418747987cdb1f6398754e,2018-04-19 14:38:44.000000,McDonald's®,LPA,RESTAURANT,,True,Big Chain,,2018-04-23 10:55:32.000000,120854
1,d5e48b4af6184975f41786c351231e5a18ec5d71,4eacc045077ccc25517e485791afe2a801a4e484,2018-01-11 14:12:24.000000,Wendy's,BUE,RESTAURANT,Burger,True,Big Chain,Calidad es nuestra receta,2018-01-23 17:32:41.000000,446
2,96e9dfaaf7ec7ec3416cc4998d96646ef3dc762b,4d5ae3632565990e36b811a3ea82bd2922fe65be,2017-11-23 20:25:36.000000,Mis Costillitas,LIM,RESTAURANT,American,True,Local Hero,Muchos platos y todos muy buenos,2018-03-06 17:32:52.000000,3367
3,5069f8be7f8daa6310a09e278e86315edaaffcbc,4a8e62743fa9073d3d0eacbcca2813966f87b1e5,2017-03-17 18:41:48.000000,Pho 9,PAR,RESTAURANT,International,True,Other,Pho ● €,2017-03-23 21:59:16.000000,311
4,77eb85941882d36ef6788201a0d650d34daaec49,af631a9c3f4b117ca5a297d0e0a2f5f82f825a26,2017-10-04 17:56:19.000000,Mercados,LIS,GROCERIESPT,,False,Other,,2017-10-17 11:19:14.000000,12


In [7]:
(
    stores
    .set_index('date', compute=False)                # npartitions='auto' could be the culprit of the 'DTypeMeta has not attribute xxx'
    .to_parquet(
        '~/scratch/glovo/parquet/stores/', 
        name_function=lambda x: f'stores-{x:03}.parquet',
        engine='fastparquet',               
        compression='snappy', 
        write_index=True, 
        append=False, 
        write_metadata_file=False, # True 
        schema='infer', 
        )
)

KeyError: 'date'

In [None]:
###############################################################################################################
###############################################################################################################
###############################################################################################################

In [5]:
orders = dd.read_parquet('~/scratch/glovo/parquet/orders').reset_index()
orders.head()

Unnamed: 0,date,id,city,duration,total_distance,store_address_id,category,delivery_postal_code,hexagon_h3_hash_8,country_code,courier_id,time
0,2016-01-01,22154,BCN,47.0,0.0,,QUIERO,,,ES,X-27861-T,22:26:38
1,2016-01-01,22073,BCN,27.0,0.0,,QUIERO,,,ES,X-20890-T,20:11:02
2,2016-01-01,22123,MAD,50.0,0.0,,QUIERO,,,ES,X-7500-T,21:31:01
3,2016-01-01,22044,VAL,68.0,0.0,,QUIERO,,8839540a89fffff,ES,X-13621-T,15:22:10
4,2016-01-01,22153,MAD,30.0,0.0,,QUIERO,,88390cb19dfffff,ES,X-32616-T,22:26:09


In [6]:
stores = dd.read_parquet('~/scratch/glovo/parquet/stores')
stores.head()

Unnamed: 0_level_0,store_address_id,store_creation_time,store_name,city_code,store_type,store_tag,is_food,store_segment,store_description,first_order_created,delivered_orders
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,,2016-01-21 13:49:32.000000,Pizza Market,BCN,RESTAURANT,,True,Local Hero,good pizza,,0.0
2,,2016-01-21 13:54:23.000000,McDonalds,BCN,RESTAURANT,,True,Big Chain,good burgers?,,0.0
3,,2016-01-21 13:54:23.000000,Sushi Place,BCN,RESTAURANT,,True,Other,raw fish,,0.0
4,,2016-01-21 13:54:23.000000,Chinese,BCN,RESTAURANT,,True,Other,noodles and other things,,0.0
5,167934.0,2016-01-21 13:54:23.000000,Pedidos Especiales Parafarmacia,BCN,PHARMACYIBERIA,,False,Other,,2016-02-14 16:53:33.000000,0.0


In [9]:
# merging orders and stores

orders = orders.dropna(subset='store_address_id')
stores = stores.dropna(subset='store_address_id')

ordersWithStores = orders.merge(stores, how='inner', on='store_address_id')
ordersWithStores.head()

Unnamed: 0,date,id,city,duration,total_distance,store_address_id,category,delivery_postal_code,hexagon_h3_hash_8,country_code,courier_id,time,store_creation_time,store_name,city_code,store_type,store_tag,is_food,store_segment,store_description,first_order_created,delivered_orders
0,2016-02-11,31963,BCN,40.0,0.0,72.0,,,88394460c7fffff,ES,X-29941-T,10:03:18,2016-02-10 21:27:20.000000,Greenshots,BCN,SNACKS,,True,Other,Zumos naturales sin aditivos,2016-02-11 10:03:18.000000,162.0
1,2016-02-16,33899,BCN,54.0,0.0,72.0,,,88394460c7fffff,ES,X-47259-T,17:09:13,2016-02-10 21:27:20.000000,Greenshots,BCN,SNACKS,,True,Other,Zumos naturales sin aditivos,2016-02-11 10:03:18.000000,162.0
2,2016-02-17,34112,BCN,52.0,0.0,72.0,,,88394460c7fffff,ES,X-37559-T,10:48:45,2016-02-10 21:27:20.000000,Greenshots,BCN,SNACKS,,True,Other,Zumos naturales sin aditivos,2016-02-11 10:03:18.000000,162.0
3,2016-02-18,34696,BCN,66.0,0.0,72.0,SNACKS,8018.0,88394460c7fffff,ES,X-21471-T,11:12:18,2016-02-10 21:27:20.000000,Greenshots,BCN,SNACKS,,True,Other,Zumos naturales sin aditivos,2016-02-11 10:03:18.000000,162.0
4,2016-02-19,35502,BCN,100.0,0.0,72.0,,,88394460c7fffff,ES,X-35589-T,17:18:31,2016-02-10 21:27:20.000000,Greenshots,BCN,SNACKS,,True,Other,Zumos naturales sin aditivos,2016-02-11 10:03:18.000000,162.0


In [10]:
if exists('parquet/ordersWithStores'): shutil.rmtree('parquet/ordersWithStores')

(
    ordersWithStores
    .set_index('date', compute=False)                # npartitions='auto' could be the culprit of the 'DTypeMeta has not attribute xxx'
    .to_parquet(
        '~/scratch/glovo/parquet/ordersWithStores/', 
        name_function=lambda x: f'ordersWithStores-{x:03}.parquet',
        engine='fastparquet',               
        compression='snappy', 
        write_index=True, 
        append=False, 
        write_metadata_file=False, # True 
        schema='infer', 
        )
)

[None]

In [12]:
dd.read_parquet('~/scratch/glovo/parquet/ordersWithStores/').head()

Unnamed: 0_level_0,id,city,duration,total_distance,store_address_id,category,delivery_postal_code,hexagon_h3_hash_8,country_code,courier_id,time,store_creation_time,store_name,city_code,store_type,store_tag,is_food,store_segment,store_description,first_order_created,delivered_orders
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-02-11,31963,BCN,40.0,0.0,72.0,,,88394460c7fffff,ES,X-29941-T,10:03:18,2016-02-10 21:27:20.000000,Greenshots,BCN,SNACKS,,True,Other,Zumos naturales sin aditivos,2016-02-11 10:03:18.000000,162.0
2016-02-11,32069,BCN,87.0,0.0,35.0,,,88394460c7fffff,ES,X-27861-T,13:16:33,2016-02-10 10:15:49.000000,Chivuo's,BCN,RESTAURANT,Burger,True,Other,Un tributo al Street Food,2016-02-11 13:16:33.000000,13437.0
2016-02-11,32003,BCN,57.05,,75.0,SNACKS,8018.0,88394460c7fffff,ES,X-35589-T,11:02:45,2016-02-11 09:27:12.000000,Chok Barcelona,BCN,SNACKS,,True,Other,,2016-02-11 09:36:48.000000,2.0
2016-02-11,31954,BCN,56.0,0.0,75.0,,,88394460c7fffff,ES,X-35589-T,09:36:48,2016-02-11 09:27:12.000000,Chok Barcelona,BCN,SNACKS,,True,Other,,2016-02-11 09:36:48.000000,2.0
2016-02-12,32627,BCN,68.0,0.0,82.0,,,88394460c7fffff,ES,X-29941-T,22:31:05,2016-02-11 11:41:12.000000,Petit Bangkok,BCN,RESTAURANT,Thai,True,Local Hero,El sabor de Thailandia,2016-02-12 22:31:05.000000,14024.0


In [None]:
###############################################################################################################

In [5]:
dtOrders = {
    'id':                      'int', 
    'city':                    'str',
    'date':                    'str', 
    'duration':                'float', 
    'total_distance':          'float', 
    'store_address_id':        'float', 
    'category':                'str', 
    'delivery_postal_code':    'float', 
    'hexagon_h3_hash_8':       'str', 
    'country_code':            'str', 
    'courier_id':              'str', 
}
orders = getCleanDDFfromCSV('~/scratch/glovoBucket/orders*.csv', dtOrders)
orders.head()

Unnamed: 0,id,city,date,duration,total_distance,store_address_id,category,delivery_postal_code,hexagon_h3_hash_8,country_code,courier_id,time
0,50487,MIL,2016-03-22,38.0,0.0,,QUIERO,,881f99cdd5fffff,IT,X-3231-T,11:19:45
1,51505,MIL,2016-03-24,900.0,0.0,,COURIER,,881f99cdd7fffff,IT,X-76218-T,20:22:16
2,51492,MIL,2016-03-24,40.5167,0.0,,COURIER,20121.0,881f99cdddfffff,IT,X-76230-T,20:11:13
3,51461,MIL,2016-03-24,66.0,0.0,,COURIER,,881f99cdd1fffff,IT,X-76265-T,19:35:53
4,51545,MIL,2016-03-24,48.0,0.0,,COURIER,,881f99cddbfffff,IT,X-76225-T,20:56:48


In [6]:
@retry(stop_max_attempt_number=20, retry_on_exception=lambda exception: isinstance(exception, FileNotFoundError))
def convertGlovo():
    global scheduler 
    
    startTime = time()
    try:
        (
            orders
            .set_index('date', npartitions='auto', compute=False)
            .to_parquet(
                '~/scratch/glovo/parquet/orders/', 
                name_function=lambda x: f'orders-{x:03}.parquet',
                engine='fastparquet',               # 'pyarrow' for more robust encoding? (20-30% slower) -- you MUST use the same engine for reading and writing
                compression='snappy', 
                write_index=True, 
                append=False, 
                write_metadata_file=True, 
                schema='infer', 
                )
        )
    
    except FileNotFoundError:
        errorTime = time()
        print(f'time till FileNotFoundError: {round((errorTime - startTime)/60, 1):>8}min')
        scheduler.restart()
        raise(FileNotFoundError)

convertGlovo()


In [7]:
orders = dd.read_parquet('~/scratch/glovo/parquet/orders/')
orders.head()

Unnamed: 0_level_0,id,city,duration,total_distance,store_address_id,category,delivery_postal_code,hexagon_h3_hash_8,country_code,courier_id,time
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-01,22154,BCN,47.0,0.0,,QUIERO,,,ES,X-27861-T,22:26:38
2016-01-01,22073,BCN,27.0,0.0,,QUIERO,,,ES,X-20890-T,20:11:02
2016-01-01,22123,MAD,50.0,0.0,,QUIERO,,,ES,X-7500-T,21:31:01
2016-01-01,22044,VAL,68.0,0.0,,QUIERO,,8839540a89fffff,ES,X-13621-T,15:22:10
2016-01-01,22153,MAD,30.0,0.0,,QUIERO,,88390cb19dfffff,ES,X-32616-T,22:26:09


In [8]:
len(orders)

288094514

In [None]:
###############################################################################################################3

In [None]:
dtStores = {
    'store_id':                 'int', 
    'store_adress_id':          'float', 
    'store_creation_time':      'str', 
    'store_name':               'str',
    'city_code':                'str', 
    'store_type':               'str', 
    'store_tag':                'str', 
    'is_food':                  'bool', 
    'store_segment':            'str',
    'store_description':        'str', 
    'first_order_created':      'str', 
    'delivered_orders':         'float', 
    }

stores = getCleanDDFfromCSV('~/scratch/glovoBucket/stores*.csv', dtStores).rename(columns={'store_adress_id': 'store_address_id'})
stores.head()

Task exception was never retrieved
future: <Task finished name='Task-304' coro=<Client._gather.<locals>.wait() done, defined at /homes/users/aschade/.local/lib/python3.8/site-packages/distributed/client.py:1978> exception=AllExit()>
Traceback (most recent call last):
  File "/homes/users/aschade/.local/lib/python3.8/site-packages/distributed/client.py", line 1983, in wait
    raise AllExit()
distributed.client.AllExit


In [None]:
(
    stores
    .set_index('store_id', npartitions='auto', compute=False)
    .to_parquet(
        '~/scratch/glovo/parquet/stores/', 
        name_function=lambda x: f'stores-{x:03}.parquet',
        engine='fastparquet',               # 'pyarrow' for more robust encoding? (20-30% slower) -- you MUST use the same engine for reading and writing
        compression='snappy', 
        write_index=True, 
        append=False, 
        write_metadata_file=True, 
        schema='infer', 
        )
)

In [None]:
###############################################################################################################

In [None]:
dtCouriers = {
    'courier_id':                       'int', 
    'city':                             'str', 
    'rating':                           'float', 
    'efficiency':                       'float', 
    'first_order_date':                 'str',
    'last_order_date':                  'str', 
    'transport':                        'str', 
    'is_electric':                      'bool', 
    'first_order_id':                   'int',
    'last_order_id':                    'int', 
    'total_distance_driven':            'float', 
    'number_of_delivered_orders':       'float',
}
couriers = getCleanDDFfromCSV('~/scratch/glovoBucket/courier*.csv', dtCouriers)
couriers.head()

Task exception was never retrieved
future: <Task finished name='Task-306' coro=<Client._gather.<locals>.wait() done, defined at /homes/users/aschade/.local/lib/python3.8/site-packages/distributed/client.py:1978> exception=AllExit()>
Traceback (most recent call last):
  File "/homes/users/aschade/.local/lib/python3.8/site-packages/distributed/client.py", line 1983, in wait
    raise AllExit()
distributed.client.AllExit


In [None]:
(
    couriers
    .set_index('courier_id', npartitions='auto', compute=False)
    .to_parquet(
        '~/scratch/glovo/parquet/couriers/', 
        name_function=lambda x: f'couriers-{x:03}.parquet',
        engine='fastparquet',               # 'pyarrow' for more robust encoding? (20-30% slower) -- you MUST use the same engine for reading and writing
        compression='snappy', 
        write_index=True, 
        append=False, 
        write_metadata_file=True, 
        schema='infer', 
        )
)