In [1]:
import pandas as pd
from multiprocessing import  Pool
import multiprocessing
import datetime

from functools import partial
import numpy as np

# Switch between the sampled and the full dataset files.
USE_SAMPLE = False

# (input file read by this notebook, output file written by it)
INPUT_DATASET, OUTPUT_DATASET = (
    ("raw-dataset-sampled.parquet", "dataset-sampled.parquet")
    if USE_SAMPLE
    else ("raw-dataset.parquet", "dataset.parquet")
)

In [2]:
# Measure groups: for each group, the regions it covers per cloud provider and
# the timezone that is assigned to rows from those regions.
measure_groups = {
    'South America': {
        'providers': {
            'AWS': ['sa-east-1'],
            'AZURE': ['brazilsouth'],
            'GCP': ['southamerica-east1'],
        },
        'timezone': 'America/Sao_Paulo',
    },
    'Canada': {
        'providers': {
            'AWS': ['ca-central-1'],
            'AZURE': [],
            'GCP': ['northamerica-northeast1'],
        },
        'timezone': 'America/Montreal',
    },
    'East US': {
        'providers': {
            'AWS': ['us-east-1'],
            'AZURE': ['eastus'],
            'GCP': ['us-east4'],
        },
        'timezone': 'America/New_York',
    },
    'West US': {
        'providers': {
            'AWS': ['us-west-1', 'us-west-2'],
            'AZURE': ['westus', 'westus2'],
            'GCP': ['us-west2', 'us-west4'],
        },
        'timezone': 'America/Los_Angeles',
    },
    'United Kingdom': {
        'providers': {
            'AWS': ['eu-west-2'],
            'AZURE': ['uksouth'],
            'GCP': ['europe-west2'],
        },
        'timezone': 'Europe/London',
    },
    'Germany': {
        'providers': {
            'AWS': ['eu-central-1'],
            'AZURE': ['germanywestcentral'],
            'GCP': ['europe-west3'],
        },
        'timezone': 'Europe/Berlin',
    },
    'India': {
        'providers': {
            'AWS': ['ap-south-1'],
            'AZURE': ['centralindia'],
            'GCP': ['asia-south1'],
        },
        'timezone': 'Asia/Kolkata',
    },
    'Japan': {
        'providers': {
            'AWS': ['ap-northeast-1'],
            'AZURE': ['japaneast'],
            'GCP': ['asia-northeast1'],
        },
        'timezone': 'Asia/Tokyo',
    },
    'Australia': {
        'providers': {
            'AWS': ['ap-southeast-2'],
            'AZURE': ['australiaeast'],
            'GCP': ['australia-southeast1'],
        },
        'timezone': 'Australia/Sydney',
    },
}

In [3]:
# Load the raw measurements and convert the columns with `convert_dtypes()`.
dataset = (
    pd.read_parquet(INPUT_DATASET)
    .convert_dtypes()
)

In [4]:
# Parse both invocation columns into datetimes; the raw values follow the
# format '%Y%m%dT%H%M%S%f'.
for timestamp_column in ('driver_invocation', 'workload_invocation'):
    dataset[timestamp_column] = pd.to_datetime(
        dataset[timestamp_column], format='%Y%m%dT%H%M%S%f')

In [5]:
# Cut off data: keep rows whose driver invocation lies in [data_from, data_to).
data_from = datetime.datetime(year=2021, month=8, day=1)
data_to = datetime.datetime(year=2021, month=10, day=1)

dataset = dataset[
    (dataset['driver_invocation'] >= data_from)
    & (dataset['driver_invocation'] < data_to)
]

In [6]:
# Preprocessing based on UTC: weekday name and 'HHMM' time of day taken
# directly from the driver invocation timestamp.
dataset['dow_utc'], dataset['tod_utc'] = (
    dataset['driver_invocation'].dt.day_name(),
    dataset['driver_invocation'].dt.strftime('%H%M'),
)

In [7]:
# Tag every row with the timezone and the measure group of its region.
# Rows whose region is not listed in any measure group are not assigned here.
for mg, group in measure_groups.items():
    # Flatten the per-provider region lists of this measure group.
    regions = [
        region
        for provider_regions in group['providers'].values()
        for region in provider_regions
    ]

    # Build the row mask once and reuse it for both column assignments
    # instead of scanning the 'region' column twice per group.
    in_group = dataset['region'].isin(regions)
    dataset.loc[in_group, 'timezone'] = group['timezone']
    dataset.loc[in_group, 'measure group'] = mg

In [8]:
def get_local_dow_of_the_week(df):
    """Return the local weekday name of one driver invocation.

    Args:
        df: A single row (despite the name, not a whole frame), indexable by
            'driver_invocation' (a timezone-naive timestamp that is localized
            as UTC here) and 'timezone' (the timezone to convert to).

    Returns:
        The day name of the invocation after conversion to the row's timezone.
    """
    invocation, zone = df['driver_invocation'], df['timezone']
    utc_invocation = invocation.tz_localize('utc')
    local_invocation = utc_invocation.tz_convert(zone)
    return local_invocation.day_name()

def get_local_tod_of_the_week(df):
    """Return the local time of day of one driver invocation as 'HHMM'.

    Args:
        df: A single row (despite the name, not a whole frame), indexable by
            'driver_invocation' (a timezone-naive timestamp that is localized
            as UTC here) and 'timezone' (the timezone to convert to).

    Returns:
        The invocation time formatted with '%H%M' after conversion to the
        row's timezone.
    """
    invocation, zone = df['driver_invocation'], df['timezone']
    utc_invocation = invocation.tz_localize('utc')
    local_invocation = utc_invocation.tz_convert(zone)
    return local_invocation.strftime('%H%M')

def parallelize(data, func, num_of_processes=8):
    """Apply ``func`` to row-wise chunks of ``data`` in a process pool.

    Args:
        data: Object to split along its rows; pandas objects are sliced
            positionally via ``.iloc``, other sequences by plain slicing.
        func: Picklable callable that receives one chunk and returns a pandas
            object. It runs in worker processes.
        num_of_processes: Number of chunks to split ``data`` into and upper
            bound on the number of worker processes.

    Returns:
        The per-chunk results combined with ``pd.concat`` in chunk order.

    Raises:
        ValueError: If ``num_of_processes`` is not positive.
    """
    if num_of_processes <= 0:
        raise ValueError("num_of_processes must be larger than 0.")

    # Chunk sizes follow np.array_split: the first `extra` chunks hold one row
    # more than the rest. Slicing positionally keeps pandas objects intact
    # without relying on np.array_split's handling of DataFrames.
    base, extra = divmod(len(data), num_of_processes)
    sizes = [base + 1] * extra + [base] * (num_of_processes - extra)
    rows = getattr(data, 'iloc', data)

    data_split = []
    start = 0
    for size in sizes:
        # Skip empty chunks (fewer rows than processes): they add no result.
        if size:
            data_split.append(rows[start:start + size])
        start += size
    if not data_split:
        # Completely empty input: still run func once so the result keeps its shape.
        data_split = [data]

    # The context manager tears the pool down even when func raises in a
    # worker, so no worker processes are left behind.
    with Pool(len(data_split)) as pool:
        results = pool.map(func, data_split)
    return pd.concat(results)

def run_on_subset(func, data_subset):
    """Apply ``func`` to every row of ``data_subset``.

    Args:
        func: Callable invoked once per row.
        data_subset: Frame whose rows are passed to ``func`` one at a time.

    Returns:
        The result of ``data_subset.apply(func, axis=1)``.
    """
    row_results = data_subset.apply(func, axis=1)
    return row_results

def parallelize_on_rows(data, func):
    """Apply ``func`` to every row of ``data`` using one process per CPU.

    Args:
        data: Frame whose rows are processed.
        func: Callable invoked once per row; it is shipped to worker
            processes through ``parallelize``.

    Returns:
        Whatever ``parallelize`` returns for the row-wise application.
    """
    apply_to_rows = partial(run_on_subset, func)
    worker_count = multiprocessing.cpu_count()
    return parallelize(data, apply_to_rows, worker_count)

In [9]:
# Derive the local weekday and local time of day for every row by applying
# the row-level helpers across a process pool.
for target, extractor in (
    ('local_dow', get_local_dow_of_the_week),
    ('local_tod', get_local_tod_of_the_week),
):
    dataset[target] = parallelize_on_rows(
        dataset[['driver_invocation', 'timezone']], extractor)

In [10]:
# Store the local weekday / time-of-day columns as categoricals.
local_columns = ['local_dow', 'local_tod']
dataset[local_columns] = dataset[local_columns].astype('category')

In [11]:
# Re-run dtype conversion now that the derived columns have been added.
dataset = dataset.convert_dtypes()

In [12]:
# Explicit dtype per column, applied on top of the automatic conversion.
# Each column is listed exactly once.
custom_dtypes = {
    'SAAFMemoryDeltaError': 'category',
    'SAAFMemoryError': 'category',
    'vendorId': 'category',
    'platform': 'category',
    'payload': 'category',
    'functionRegion': 'category',
    'linuxVersion': 'category',
    'lang': 'category',
    'functionName': 'category',
    'cpuType': 'category',
    'provider': 'category',
    'timezone': 'category',
    'region': 'category',
    'dow_utc': 'category',
    'tod_utc': 'category',
    'measure group': 'category',
    'cpuModel': 'category',
    '2_thread_id': 'category',
    '1_run_id': 'category',
    'newcontainer': 'bool',
    'version': 'category',
    'containerID': 'category',
}

In [13]:
# Cast one column at a time to the dtype requested in `custom_dtypes`.
for column in custom_dtypes:
    dtype = custom_dtypes[column]
    dataset[column] = dataset[column].astype(dtype)

In [14]:
# Per-column report: dtype, summary statistics and memory usage.
for col in dataset.columns:
    print(f"--- {col} ---")
    column_data = dataset[col]
    print(column_data.dtypes)
    print(column_data.describe())
    print(column_data.memory_usage())

--- driver_invocation ---
datetime64[ns]


  print(dataset[col].describe())


count                       46972870
unique                          2926
top       2021-09-09 00:00:03.546000
freq                           16800
first     2021-08-01 00:00:04.087000
last      2021-09-30 23:30:03.438000
Name: driver_invocation, dtype: object
751565920
--- workload_invocation ---
datetime64[ns]
count                       46972870
unique                         78743
top       2021-09-12 12:00:23.049000
freq                            1200
first     2021-08-01 00:00:23.083000
last      2021-09-30 23:31:18.122000
Name: workload_invocation, dtype: object
751565920
--- provider ---
category
count     46972870
unique           3
top            GCP
freq      17549938
Name: provider, dtype: object
422755962
--- region ---
category
count                 46972870
unique                      27
top       australia-southeast1
freq                   1755446
Name: region, dtype: object
422757118
--- dirty_measurement ---
boolean
count     46972870
unique           1
top          

count         46972870
unique               1
top       Hello World!
freq          46972870
Name: message, dtype: object
751565920
--- newcontainer ---
bool
count     46972870
unique           2
top          False
freq      46346249
Name: newcontainer, dtype: object
422755830
--- payload ---
category
count     46972870
unique           1
top             {}
freq      46972870
Name: payload, dtype: object
422755946
--- platform ---
category
count                   46972870
unique                         3
top       Google Cloud Functions
freq                    17549938
Name: platform, dtype: object
422755962
--- roundTripTime ---
Float64
count    4.697287e+07
mean     4.998260e+02
std      8.192320e+02
min      5.549000e+01
25%      1.102800e+02
50%      2.653400e+02
75%      5.749100e+02
max      1.765552e+05
Name: roundTripTime, dtype: float64
798538790
--- runtime ---
Int64
count    4.697287e+07
mean     2.209961e+02
std      2.095756e+02
min      2.400000e+01
25%      6.300000e+01
5

In [15]:
# Drop Columns
drop_cols = ['message', 'payload', 'platform', 'lang', 'cpuNiceDelta', 'cpuIrq', 'version', 'vendorId', 'functionRegion', ]
dataset = dataset.drop(drop_cols, axis = 1)

In [18]:
# Total of the per-column values reported by `memory_usage()` (default arguments).
dataset.memory_usage().sum()

17717306714

In [19]:
# Write the processed dataset to OUTPUT_DATASET; the frame index is not stored.
dataset.to_parquet(OUTPUT_DATASET, index=False)