In [None]:
# Specify the target url and if you want to do a delta load
from cloud_upload_config import *

# Turn the textual switches provided by cloud_upload_config into booleans.
as_string = (as_string == 'all string')          # True only for the literal 'all string'
delta = not (delta == 'full')                    # everything except 'full' selects a delta load
exclude_loaded = not (exclude_loaded != 'skip')  # True only for the literal 'skip'

# Local file location; when it is set it also becomes the value of path_to_folder.
local_file_path = '/home/jovyan/localfiles/'
look_for_sap_files_globally = False
path_to_folder = local_file_path if local_file_path is not None else ''
continue_from_last_time = False
# This determines how detailed the log is; INFO is the standard. The list below is ordered from most detailed (DEBUG) to least detailed (CRITICAL).
# logging.DEBUG
# logging.INFO
# logging.WARNING
# logging.ERROR
# logging.CRITICAL
#log_level = logging.DEBUG

In [None]:
# Module-level lookup tables.
# The former `global` statements were dropped: at module (cell) level `global`
# has no effect, the assignments below already create module globals.

# Suffixes that mark a file as compressed.
compressed = ['.tar', '.gz', '.zip', '.7z']

# Suffixes of generic (non-SAP) files.
generic_file_type = ['.csv', '.xlsx', '.xls']

# Pattern searched in a file name to recognise an SAP file:
# "<anything>_<8 digits>_<6 digits>" followed by one more character.
sap_file_type = '(.*)_[0-9]{8}_[0-9]{6}.'

# Suffixes of encrypted files (presumably consumed by the file classification — TODO confirm).
encrypted = ['.gpg', '.pgp']


In [None]:
import logging
from datetime import datetime

# `logname` is not assigned in this cell (the assignment below is commented out), so it must
# already exist when the cell runs — presumably via `from cloud_upload_config import *`; TODO confirm.
#logname = f'IBC_Loader_log_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log'
FORMAT = '%(asctime)s %(levelname)s %(message)s'
formatter = logging.Formatter(FORMAT)  # NOTE(review): not referenced again in the visible code
logging.basicConfig(format=FORMAT, filename=logname, level=logging.INFO)
print(logname)
logging.info('logging initialized')

# Import the remaining dependencies inside one guard: a missing package is written to the
# log file and the session is ended with quit() instead of surfacing a traceback.
try:
    import re
    import subprocess
    import json
    import requests
    import py7zr
    import tarfile
    from io import BytesIO
    import zipfile
    import gzip
    import pandas as pd
    import numpy as np
    from multiprocessing import Pool
    from pycelonis import get_celonis
    from chardet.universaldetector import UniversalDetector
    import pycelonis
    import fastparquet as fp
    import pyarrow as pa
    import time
    import itertools
    from os import listdir
    from os.path import isfile, join
    from pathlib import Path
    from itertools import product
    import copy
    import sys
    #from lib.cloud_module import cloud
    #from lib.upload_module import import_sap_header
except ModuleNotFoundError as e:
    logging.error(e)
    logging.error('please install missing packages to use this program.')
    logging.error('shutting down')
    quit()

# `agreed` is not assigned in this cell either (presumably another cloud_upload_config value —
# TODO confirm); anything other than the literal 'yes' ends the session.
if agreed != 'yes':
    logging.error('you need to read and accept the terms listed in disclaimer.md')
    quit()

In [None]:
class cloud:
    """Small client for the Celonis data-push REST endpoints of one team.

    All requests are authenticated with an ``AppKey`` authorization header
    built from ``api_key``.
    """

    def __init__(self, tenant, realm, api_key):
        """Store the team (``tenant``), ``realm`` and ``api_key`` used for every call."""
        self.tenant = tenant
        self.realm = realm
        self.api_key = api_key

    def get_api(self, path):
        """Return the absolute URL for ``path`` on this team's host."""
        return f"https://{self.tenant}.{self.realm}.celonis.cloud/{path}"

    def get_jobs_api(self, pool_id):
        """Return the jobs collection URL of ``pool_id`` (ends with a slash)."""
        return self.get_api(f"integration/api/v1/data-push/{pool_id}/jobs/")

    def _get_job_api(self, pool_id, job_id, suffix=""):
        """Return the URL of a single job, optionally extended by ``suffix``.

        The collection URL already ends with '/', so it is stripped before the
        job id is appended; plain concatenation produced 'jobs//<id>'.
        """
        return self.get_jobs_api(pool_id).rstrip("/") + f"/{job_id}{suffix}"

    def get_auth(self):
        """Return the authorization header dictionary."""
        return {'authorization': f"AppKey {self.api_key}"}

    def list_jobs(self, pool_id):
        """Return the decoded JSON answer of the jobs listing for ``pool_id``."""
        api = self.get_jobs_api(pool_id)
        return requests.get(api, headers=self.get_auth()).json()

    def delete_job(self, pool_id, job_id):
        """Send a DELETE for ``job_id`` and return the raw response."""
        api = self._get_job_api(pool_id, job_id)
        return requests.delete(api, headers=self.get_auth())

    def create_job(self, pool_id, targetName, data_connection_id,
                   upsert=False):
        """Create a push job and return the decoded JSON answer.

        The job type is "DELTA" when ``upsert`` is true, otherwise "REPLACE".
        ``connectionId`` is only sent when ``data_connection_id`` is truthy.
        """
        api = self.get_jobs_api(pool_id)
        payload = {'targetName': targetName,
                   'type': "DELTA" if upsert else "REPLACE",
                   'dataPoolId': pool_id}
        if data_connection_id:
            payload['connectionId'] = data_connection_id
        r = requests.post(api, headers=self.get_auth(), json=payload)
        logging.debug(f'created job with {r}')
        return r.json()

    def push_new_dir(self, pool_id, job_id, dir_path):
        """Upload every ``*.parquet`` file found directly inside ``dir_path``."""
        files = [join(dir_path, f) for f in listdir(dir_path)
                 if isfile(join(dir_path, f))]
        for parquet_file in files:
            if not parquet_file.endswith(".parquet"):
                continue
            logging.debug(f"Uploading chunk {parquet_file}")
            self.push_new_chunk(pool_id, job_id, parquet_file)

    def push_new_chunk(self, pool_id, job_id, file_path):
        """Upload one chunk to ``job_id`` and return the raw response.

        ``file_path`` may be the chunk content itself (bytes or a file-like
        object) or the path of an existing file. A path is read from disk
        first; previously the path text itself was sent as the file content,
        which is what ``push_new_dir`` ran into.
        """
        api = self._get_job_api(pool_id, job_id, "/chunks/upserted")
        payload = file_path
        is_path_like = isinstance(file_path, str) or hasattr(file_path, "__fspath__")
        if is_path_like and isfile(file_path):
            with open(file_path, "rb") as chunk:
                payload = chunk.read()
        r = requests.post(api, files={"file": payload}, headers=self.get_auth())
        logging.debug(f'pushed new chunk with {r}')
        return r

    def submit_job(self, pool_id, job_id):
        """Send the POST that submits ``job_id`` and return the raw response."""
        api = self._get_job_api(pool_id, job_id)
        r = requests.post(api, headers=self.get_auth())
        logging.debug(f'submitted job {r}')
        return r

In [None]:
def determine_tables_loaded(ibc_team):
    """Return the names of the tables found for the target pool / connection.

    The table list is written into a temporary table by a temporary data job,
    loaded through a temporary data model and downloaded as a parquet file.
    The temporary objects are removed again after every attempt. Up to four
    attempts are made.

    Args:
        ibc_team: object providing ``poolid`` and ``connectionid``.

    Returns:
        list: the table names without the temporary table itself; an empty
        list when every attempt failed.
    """
    celonis = get_celonis()
    logging.info('checking for tables that have already been loaded.')
    random_name = f'zzz___TEMP___{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'

    if ibc_team.connectionid is None:
        table_schema = f'{ibc_team.poolid}'
    else:
        table_schema = f'{ibc_team.poolid}_{ibc_team.connectionid}'
    create_table_from_query_statement = (
        f'CREATE TABLE IF NOT EXISTS "{random_name}" AS '
        f'(SELECT table_name FROM tables WHERE table_schema = \'{table_schema}\');'
    )

    # Create data job and run table creation script
    p = celonis.pools.find(ibc_team.poolid)
    loaded_tables = []
    for counter in range(1, 5):
        # Reset per attempt so a later attempt never reuses stale results.
        data = None
        path = None
        try:
            dj = p.create_data_job(random_name)
            transf = dj.create_transformation(random_name, create_table_from_query_statement)
            transf.execute()

            # Create temporary data model in pool and add recently created table, then reload
            dm = p.create_datamodel(random_name)
            try:
                dm.add_tables_from_pool(random_name)
                dm.reload(from_cache=False, wait_for_reload=True)
                time.sleep(3)

                # Find table object in data model and download
                t = dm.tables.find(random_name)

                path = t._get_data_file(Path('.') / random_name)
                data = pd.read_parquet(path)
            except Exception as e:
                logging.error(f'determining what tables have been loaded failed with: {e}')
            finally:
                # Deleting temporary objects
                dm.delete()
                transf.statement = f'DROP TABLE IF EXISTS "{random_name}";'
                transf.execute()
                dj.delete()
                # `path` stays None when the download never started; the old code
                # hit an unbound name here and masked the original error.
                if path is not None:
                    try:
                        path.unlink()
                    except FileNotFoundError:
                        pass
            if data is None:
                raise RuntimeError('the table list could not be downloaded')
            loaded_tables = pd.Series(data['table_name']).tolist()
            if random_name in loaded_tables:
                loaded_tables.remove(random_name)
            logging.info(f'these tables are already in the data pool: {loaded_tables}')
            break
        except Exception as e:
            # Exception instead of a bare except: Ctrl-C is no longer swallowed
            # and the cause of the failed attempt ends up in the log.
            loaded_tables = []
            logging.warning(f'attempt {counter} failed with: {e!r}')
            logging.warning(f'determine_tables_loaded failed for {counter}. time. Retrying.')
            time.sleep(1)
    return loaded_tables

In [None]:
def determine_line_count_of_loaded_tables(ibc_team):
    """Return a table with the row count of every table already loaded.

    The table names come from ``determine_tables_loaded``. One INSERT per
    table writes ``COUNT(1)`` into a temporary table, which is then loaded
    through a temporary data model and downloaded as parquet. The temporary
    objects are removed after every attempt. Up to four attempts are made;
    unlike before, an attempt whose download failed is always retried.

    Args:
        ibc_team: object providing ``poolid`` and ``connectionid``.

    Returns:
        The downloaded data (columns "TABLE" and "COUNT" as created by the
        statement), or None when every attempt failed.
    """
    celonis = get_celonis()
    logging.info('counting lines of tables that have been loaded.')

    random_name = f'zzz___TEMP_LC___{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}'
    tables = determine_tables_loaded(ibc_team)

    # With a data connection the source tables are addressed through the JDBC placeholder.
    source_prefix = '' if ibc_team.connectionid is None else '<%=DATASOURCE:JDBC%>.'
    statements = [f'CREATE TABLE IF NOT EXISTS "{random_name}" ("TABLE" VARCHAR(80), "COUNT" INTEGER);\n']
    statements.extend(
        f'INSERT INTO "{random_name}" ("TABLE" ,"COUNT") SELECT \'{t}\', COUNT(1) FROM {source_prefix}"{t}";\n'
        for t in tables
    )
    add_line_counts_statement = ''.join(statements)

    # Create data job and run table creation script
    p = celonis.pools.find(ibc_team.poolid)
    data = None
    for counter in range(1, 5):
        data = None
        path = None
        try:
            dj = p.create_data_job(random_name)
            transf = dj.create_transformation(random_name, add_line_counts_statement)
            transf.execute(wait_for_execution=True)
            # Create temporary data model in pool and add recently created table, then reload
            dm = p.create_datamodel(random_name)
            try:
                dm.add_tables_from_pool(random_name)
                dm.reload(from_cache=False, wait_for_reload=True)
                time.sleep(3)

                # Find table object in data model and download
                t = dm.tables.find(random_name)

                path = t._get_data_file(Path('.') / random_name)
                data = pd.read_parquet(path)
            except Exception as e:
                logging.error(f'determining line count per table failed with: {e}')
            finally:
                # Deleting temporary objects
                dm.delete()
                transf.statement = f'DROP TABLE IF EXISTS "{random_name}";'
                transf.execute()
                dj.delete()
                # `path` stays None when the download never started.
                if path is not None:
                    try:
                        path.unlink()
                    except FileNotFoundError:
                        pass
            if data is None:
                raise RuntimeError('the line counts could not be downloaded')
            break
        except Exception as e:
            # The old message named determine_tables_loaded, which made the log misleading.
            logging.warning(f'attempt {counter} failed with: {e!r}')
            logging.warning(f'determine_line_count_of_loaded_tables failed for {counter}. time. Retrying.')
            time.sleep(1)
    return data

In [None]:
def clean_table_name(name):
    """Return ``name`` with '.' turned into '/' and '|', ' ' and ',' turned into '_'."""
    substitutions = str.maketrans({'.': '/', '|': '_', ' ': '_', ',': '_'})
    return name.translate(substitutions)

In [None]:
def const(n):
    """Yield ``n`` forever.

    Implemented as a loop: the former recursive ``yield from const(n)`` stacked
    one more generator for every item produced, so consuming roughly a
    thousand items ended in a RecursionError.
    """
    while True:
        yield n
    
def decider(indicator, value):
    """Cast ``value`` with the converter selected by ``indicator``.

    'float' -> flt, 'int' -> inti, 'time' -> time_casting; every other
    indicator is handled by dt.
    """
    converters = {'float': flt, 'int': inti, 'time': time_casting}
    return converters.get(indicator, dt)(value)

def time_casting(df):
    """Prefix ``df`` with the date '19700101' and hand the result to ``dt``.

    Any exception is logged and None is returned implicitly.
    """
    try:
        df = '19700101' + df
    except Exception as e:
        logging.error(f'casting {df} to time failed with {e}')
        return None
    try:
        return dt(df)
    except Exception as e:
        logging.error(f'casting {df} to time failed with {e}')

def dt(df):
    """Convert ``df`` to nanosecond datetimes; unparsable values become NaT.

    The explicit unit matters: pandas >= 2.0 rejects the unit-less
    'datetime64' dtype, which made every call log an error and return None.

    Returns:
        The converted object, or None (after logging) when the conversion
        raised.
    """
    try:
        parsed = pd.to_datetime(df, errors='coerce')
        return parsed.astype('datetime64[ns]')
    except Exception as e:
        logging.error(f'casting {df} to datetime failed with {e}')

def _sap_signed_to_numeric(df, downcast):
    """Parse strings whose sign may be written as a leading or trailing '-'.

    Only a '-' at the start or end of the stripped text is treated as the
    sign. The previous implementation removed every '-', which turned an
    exponent such as '1e-3' into '-1e3'. Values that cannot be parsed become
    NaN (``errors='coerce'``).
    """
    text = df.str.strip()
    negative = text.str.startswith('-', na=False) | text.str.endswith('-', na=False)
    magnitude = text.str.strip('-').str.strip()
    numbers = pd.to_numeric(magnitude, downcast=downcast, errors='coerce')
    return numbers.mask(negative, -numbers)

def flt(df):
    """Cast a string Series to floats (downcast='float'); None after a logged error."""
    try:
        return _sap_signed_to_numeric(df, downcast='float')
    except Exception as e:
        logging.error(f'casting {df} to float failed with {e}')

def inti(df):
    """Cast a string Series to integers (downcast='signed'); None after a logged error."""
    try:
        return _sap_signed_to_numeric(df, downcast='signed')
    except Exception as e:
        logging.error(f'casting {df} to int failed with {e}')

# Lookup from the type code found in a header file to the cast applied later.
_TYPE_CODE_TO_KIND = {
    'CURR': 'float', 'QUAN': 'float', 'DEC': 'float', 'FLTP': 'float',
    'INT1': 'int', 'INT2': 'int', 'INT4': 'int', 'PREC': 'int',
    'DATS': 'date',
    'TIMS': 'time',
}

def type_determination(x):
    """Return 'float', 'int', 'date' or 'time' for a known type code, else 'str'."""
    return _TYPE_CODE_TO_KIND.get(x, 'str')

def _open_sap_payload(source, suffix, pwd=None):
    """Return something ``pd.read_csv`` can read for one SAP data file.

    ``source`` is either an in-memory binary buffer (downloaded file) or a
    local path; ``suffix`` is the text after the last '.' of the file name.
    For archives the first member is used.
    """
    in_memory = hasattr(source, 'read')
    if suffix == 'zip':
        archive = zipfile.ZipFile(source)
        return archive.open(archive.infolist()[0])
    if suffix == '7z':
        archive = py7zr.SevenZipFile(source, password=pwd)
        member = archive.getnames()[0]
        extracted = archive.read(member)[member]
        logging.info(extracted)
        return extracted
    if suffix == 'gz':
        if in_memory:
            return gzip.GzipFile(fileobj=source, mode='rb')
        return gzip.GzipFile(source, mode='rb')
    return source if in_memory else str(source)

def sap_load(lst, pwd=None):
    """Convert one SAP data file to parquet and push it as a chunk.

    Args:
        lst: sequence ``(file, pre_url, header_json, header, job_id, df,
            type_dict)``. ``file.file`` names the data file, ``pre_url`` is
            the download prefix (None for local files), ``header_json`` the
            request headers, ``header.poolid`` the target pool,
            ``df['names']`` the column names and ``type_dict`` maps a cast
            kind to the columns that receive it.
        pwd: password for 7z archives.

    Returns:
        None. Files whose name contains 'HEADER' are skipped — now before any
        download is started.

    Raises:
        Exception: every error is logged and re-raised.

    Note:
        The upload goes through ``uppie``, which is read from the module
        globals and therefore has to exist before this function is called.
    """
    file, pre_url, header_json, header, job_id, df, type_dict = lst[:7]
    encoding = 'utf-8'

    try:
        logging.info(f'uploading chunk: {file.file}')
        file_name = str(file.file)
        if 'HEADER' in file_name:
            return
        suffix = file_name.split('.')[-1]
        csv_options = dict(header=None, dtype='string', sep=';', names=list(df['names']),
                           quotechar='"', encoding=encoding, escapechar='\\')
        if pre_url is not None:
            url = pre_url + file.file
            with requests.get(url, headers=header_json, stream=False) as r:
                r.raise_for_status()
                fh = _open_sap_payload(BytesIO(r.content), suffix, pwd)
                df_up = pd.read_csv(fh, **csv_options)
        else:
            fh = _open_sap_payload(file.file, suffix, pwd)
            df_up = pd.read_csv(fh, **csv_options)
        for kind, columns in type_dict.items():
            # Every listed column is handed to decider as a whole.
            df_up[columns] = df_up[columns].apply(lambda column, kind=kind: decider(kind, column), axis=0)
            logging.debug(f'conversion to {kind} resulted in {df_up[columns].dtypes.value_counts()}')
        buffer = BytesIO()
        df_up.to_parquet(buffer, index=False, compression='snappy', use_deprecated_int96_timestamps=True, version='2.0')
        uppie.push_new_chunk(pool_id=header.poolid, job_id=job_id, file_path=buffer.getvalue())
    except Exception as e:
        logging.error(f'{file.file} failed with error: {e}')
        raise

def import_sap_header(header, files, jobstatus, uppie, data, location_indicator='local', delta=False, pwd=None):
    """Create a push job for one SAP header file and load all matching data files.

    The header file lists the columns (name, type, length, decimal length,
    separated by blanks). The data files that belong to it are found through
    the name parts before and after 'HEADER' and are converted by
    ``sap_load`` in a process pool. Finally the job is submitted.

    Args:
        header: file object of the header (``file``, ``poolid``,
            ``connectionid``, ``bucket_id``, ``team``, ``realm``, ``appkey``).
        files: candidate file objects (``file`` attribute).
        jobstatus: dict; the new job id is registered with the value False.
        uppie: client providing ``create_job`` and ``submit_job``.
        data: container of table names that must be skipped.
        location_indicator: 'local' splits the complete path of the header,
            'global' only its file name.
        delta: forwarded as ``upsert`` when the job is created.
        pwd: password for 7z archives; it is now forwarded to ``sap_load``
            (previously it was accepted but never used).

    Returns:
        None.

    Raises:
        ValueError: for an unknown ``location_indicator``.
        Exception: every other error is logged and re-raised.
    """
    from functools import partial

    try:
        encoding = 'utf-8'
        if location_indicator == 'local':
            ref = str(header.file).split('HEADER')
        elif location_indicator == 'global':
            ref = Path(header.file).name.split('HEADER')
        else:
            logging.error(f'location indicator {location_indicator} is invalid')
            raise ValueError(f'location indicator {location_indicator} is invalid')
        ref[1] = ref[1].replace('.csv', '')
        # ref[0] ends with the separator in front of 'HEADER'; drop it for the table name.
        targetname = clean_table_name(Path(ref[0][:-1]).name)
        if targetname in data:
            logging.warning(f'skipping {header.file} as table with {targetname} is already present in target pool.')
            return None
        jobhandle = uppie.create_job(pool_id=header.poolid,
                                     data_connection_id=header.connectionid,
                                     targetName=targetname,
                                     upsert=delta)
        logging.info(f'starting to upload {targetname}')
        logging.debug(jobhandle)
        job_id = jobhandle['id']
        jobstatus[job_id] = False

        header_columns = ['names', 'type', 'length', 'declength']
        if header.bucket_id is not None:
            pre_url = f'https://{header.team}.{header.realm}.celonis.cloud/storage-manager/api/buckets/{header.bucket_id}/files?path=/'
            header_json = {'Authorization': 'AppKey {}'.format(header.appkey), 'Accept': 'application/octet-stream'}
            with requests.get(pre_url + header.file, headers=header_json, stream=False) as r:
                r.raise_for_status()
                df = pd.read_csv(BytesIO(r.content), header=None, dtype=str, sep=' ', names=header_columns, encoding=encoding)
        else:
            pre_url = None
            header_json = None
            df = pd.read_csv(str(header.file), header=None, dtype=str, sep=' ', names=header_columns, encoding=encoding)

        df['type'] = df['type'].apply(type_determination)
        type_dict = {}
        for kind in ('float', 'int', 'date', 'time'):
            columns = list(df.loc[df['type'] == kind, 'names'])
            if columns:
                type_dict[kind] = columns

        # Header files are skipped by sap_load anyway; leaving them out avoids sending them to a worker.
        relevant_files = tuple(f for f in files
                               if ref[0] in str(f.file) and ref[1] in str(f.file)
                               and 'HEADER' not in str(f.file))
        # One explicit work item per file instead of zipping with endless generators.
        work_items = [(f, pre_url, header_json, header, job_id, df, type_dict) for f in relevant_files]
        with Pool() as pool:
            pool.map(partial(sap_load, pwd=pwd), work_items)
        uppie.submit_job(pool_id=header.poolid, job_id=job_id)
    except Exception as e:
        logging.error(f'importing sap file failed with {e}')
        raise

In [None]:
def _download_non_sap_file(non_sap_file):
    """Download ``non_sap_file.file`` from its bucket and return the raw bytes."""
    url = f'https://{non_sap_file.team}.{non_sap_file.realm}.celonis.cloud/storage-manager/api/buckets/{non_sap_file.bucket_id}/files?path=/' + non_sap_file.file
    header_json = {'Authorization': 'AppKey {}'.format(non_sap_file.appkey), 'Accept': 'application/octet-stream'}
    with requests.get(url, headers=header_json, stream=True) as r:
        r.raise_for_status()
        return r.content

def _open_non_sap_payload(content, file_name, pwd=None):
    """Return a binary handle on the data stored in a downloaded file.

    The container is chosen from the file name: zip (first member whose name
    contains '.csv', else the first member), 7z (first member), tar archives
    (first regular file), gz, or the content itself.

    The tar branch used to return the ``TarFile`` object, which neither the
    encoding detection nor pandas can read; it now returns the extracted
    member. The suffix test is case-insensitive and no longer fails on names
    without a '.'.
    """
    parts = str(file_name).lower().split('.')
    extension = parts[-1]
    inner_extension = parts[-2] if len(parts) > 1 else ''
    raw = BytesIO(content)
    if extension == 'zip':
        archive = zipfile.ZipFile(raw)
        members = archive.infolist()
        chosen = next((m for m in members if '.csv' in m.filename), members[0])
        logging.debug(chosen)
        return archive.open(chosen)
    if extension == '7z':
        archive = py7zr.SevenZipFile(raw, password=pwd)
        member = archive.getnames()[0]
        extracted = archive.read(member)[member]
        logging.debug(extracted)
        return extracted
    if inner_extension == 'tar':
        # 'r:*' lets tarfile detect the compression instead of assuming gzip.
        archive = tarfile.open(fileobj=raw, mode='r:*')
        for member in archive:
            if member.isfile():
                return archive.extractfile(member)
        raise ValueError(f'{file_name} does not contain a regular file')
    if extension == 'gz':
        return gzip.GzipFile(fileobj=raw, mode='rb')
    return raw

def _detect_encoding_from_content(content, file_name, pwd=None):
    """Return the lower-cased encoding detected for ``content``; 'utf-8' on any failure."""
    logging.info(f'starting encoding determination')
    detector = UniversalDetector()
    detector.reset()
    try:
        fh = _open_non_sap_payload(content, file_name, pwd)
        for counter, line in enumerate(fh):
            detector.feed(line)
            # Stop once the detector is confident or after a bounded number of lines.
            if detector.done or counter > 50000:
                break
        detector.close()
        enc = detector.result['encoding'].lower()
        logging.info(f'{file_name} has encoding: {detector.result}')
    except Exception as e:
        logging.error(f'encoding detection failed with: {e}\nreverting to utf-8 as standard')
        enc = 'utf-8'
    return enc

def detect_encoding(non_sap_file, pwd):
    """Download ``non_sap_file`` and return its detected encoding.

    Args:
        non_sap_file: file object (``team``, ``realm``, ``bucket_id``,
            ``appkey``, ``file``).
        pwd: password for 7z archives.

    Returns:
        str: lower-cased encoding name; 'utf-8' when the download or the
        detection failed (the error is logged).
    """
    try:
        content = _download_non_sap_file(non_sap_file)
    except Exception as e:
        logging.error(f'encoding detection failed with: {e}\nreverting to utf-8 as standard')
        return 'utf-8'
    return _detect_encoding_from_content(content, non_sap_file.file, pwd)

def import_non_sap_file(non_sap_file, jobstatus, uppie, data, pwd=None, delta=False, as_string=True):
    """Create a push job for one csv/Excel file, upload its content and submit the job.

    The file is downloaded once; the encoding is only detected for csv files
    because the Excel reader does not use it. csv files are uploaded in
    chunks of 100000 rows. Excel workbooks are read sheet by sheet; the
    sheets are combined when all of them have the same columns, otherwise the
    import of the file fails with a logged error.

    Args:
        non_sap_file: file object (``file``, ``file_type``, ``poolid``,
            ``connectionid``, ``team``, ``realm``, ``bucket_id``, ``appkey``).
        jobstatus: dict; the new job id is registered with the value False.
        uppie: client providing ``create_job``, ``push_new_chunk`` and
            ``submit_job``.
        data: container of table names that must be skipped.
        pwd: password for 7z archives.
        delta: forwarded as ``upsert`` when the job is created.
        as_string: read every Excel cell as text when True.

    Returns:
        None.

    Note:
        Errors while reading or uploading are logged, followed by a pause of
        ten seconds; the job is submitted nevertheless, as before.
    """
    targetname = clean_table_name(Path(non_sap_file.file).name.split('.')[0])
    if targetname in data:
        logging.warning(f'skipping {non_sap_file.file} as table with {targetname} is already present in target pool.')
        return None
    jobhandle = uppie.create_job(pool_id=non_sap_file.poolid,
                                 data_connection_id=non_sap_file.connectionid,
                                 targetName=targetname,
                                 upsert=delta)
    logging.info(f'starting to upload {targetname}')
    logging.debug(jobhandle)
    jobstatus[jobhandle['id']] = False
    try:
        content = _download_non_sap_file(non_sap_file)
        if non_sap_file.file_type == '.csv':
            encoding = _detect_encoding_from_content(content, non_sap_file.file, pwd)
            fh = _open_non_sap_payload(content, non_sap_file.file, pwd)
            chunks = pd.read_csv(fh, dtype='string', sep=',', quotechar='"', encoding=encoding, chunksize=100000)
            for chunk in chunks:
                logging.debug(chunk.head())
                buffer = BytesIO()
                chunk.to_parquet(buffer, index=False, compression='snappy')
                uppie.push_new_chunk(pool_id=non_sap_file.poolid, job_id=jobhandle['id'], file_path=buffer.getvalue())
        else:
            fh = _open_non_sap_payload(content, non_sap_file.file, pwd)
            pd_config = {
                'io': fh,
                'sheet_name': None,
                'keep_default_na': False,
            }
            if as_string is True:
                pd_config['dtype'] = str
            # sheet_name=None yields a dict: sheet name -> DataFrame.
            sheets = pd.read_excel(**pd_config)
            # Group the sheets by column layout (pairwise comparison, keyed by the printed columns).
            matches = {}
            for a, b in product(sheets, sheets):
                col_a = sheets[a].columns
                col_b = sheets[b].columns
                if len(col_a) == len(col_b) and all(i == j for i, j in zip(col_a, col_b)):
                    matches[str(col_b)] = matches.get(str(col_b), [a, b]) + [a, b]
            matches = {layout: set(names) for layout, names in matches.items()}
            single_layout = (len(matches) == 1
                             and len(next(iter(matches.values()))) == len(sheets))
            if not single_layout:
                # Previously surfaced as "'dict' object has no attribute 'head'".
                raise ValueError(f'the sheets of {non_sap_file.file} do not share one column layout')
            if len(sheets) == 1:
                df = next(iter(sheets.values()))
            else:
                df = pd.concat(list(sheets.values()), ignore_index=True)
            logging.debug(df.head())
            buffer = BytesIO()
            df.to_parquet(buffer, index=False, compression='snappy')
            uppie.push_new_chunk(pool_id=non_sap_file.poolid, job_id=jobhandle['id'], file_path=buffer.getvalue())
    except Exception as e:
        logging.error(f'{non_sap_file} failed with error: {e}')
        time.sleep(10)
    uppie.submit_job(pool_id=non_sap_file.poolid, job_id=jobhandle['id'])

In [None]:
def ibc_files_to_json(lst, name):
    """Write the ``to_dict()`` result of every item in ``lst`` to ``name`` as an indented JSON list."""
    records = [item.to_dict() for item in lst]
    with open(name, 'w') as out:
        out.write(json.dumps(records, indent=4))

In [None]:
def json_to_ibc_files(jsn, url):
    """Read the JSON list stored in ``jsn`` and build one ``ibc_file`` per entry.

    Every entry is passed as keyword arguments; its 'url' value is set to
    ``url`` (replacing a stored one).
    """
    with open(jsn, 'r') as inp:
        records = json.loads(inp.read())
    return [ibc_file(**{**record, 'url': url}) for record in records]

In [None]:
class ibc_team():
    """Connection details of one Celonis team, derived from a data pool URL.

    Attributes set by ``__init__``: ``url``, ``team``, ``realm``, ``poolid``,
    ``connectionid`` (None when the URL names no data connection),
    ``appkey`` and ``apikey`` (same value).
    """

    def parse_url(self, url):
        """Extract team, realm, pool id and (optionally) connection id from ``url``.

        Returns:
            tuple: ``(parts, connectionflag)``. ``parts`` holds the values
            found, in the order team, realm, pool id, connection id;
            ``connectionflag`` is 0 when the URL has no data connection part.
            For a URL that lacks one of the first three parts an error is
            logged and ``parts`` is incomplete.
        """
        parts = []
        connectionflag = 1
        try:
            # Raw strings: same patterns as before, without invalid escape sequences.
            parts.append(re.search(r'https://([a-z0-9-]+)\.', url).groups()[0])
            parts.append(re.search(r'\.([a-z0-9-]+)\.celonis', url).groups()[0])
            parts.append(re.search(r'ui/pools/([a-z0-9-]+)', url).groups()[0])
            try:
                parts.append(re.search(r'data-connections/[a-z-]+/([a-z0-9-]+)', url)
                             .groups()[0])
            except AttributeError:
                connectionflag = 0
        except AttributeError:
            logging.error(f'{url} this is an unvalid url.')
        logging.debug(f'url has the following parts: {parts} and connectionflag: {connectionflag}')
        return parts, connectionflag

    def determine_appkey(self, cmd='printenv | grep CELONIS_API_TOKEN'):
        """Run ``cmd`` in a shell and return the value of the KEY=VALUE line it prints.

        Only the first '=' separates key and value, so tokens that contain
        '=' themselves (for example base64 padding) are returned completely.
        When several lines are printed, the line whose key is exactly
        CELONIS_API_TOKEN wins, otherwise the first KEY=VALUE line.

        Raises:
            IndexError: when the output contains no KEY=VALUE line (same
                exception type as before, now with a message).
        """
        completed = subprocess.run(cmd, shell=True, capture_output=True)
        pairs = []
        for line in completed.stdout.decode('utf-8').splitlines():
            key, separator, value = line.partition('=')
            if separator:
                pairs.append((key.strip(), value.strip()))
        for key, value in pairs:
            if key == 'CELONIS_API_TOKEN':
                return value
        if pairs:
            return pairs[0][1]
        raise IndexError(f'no KEY=VALUE line in the output of: {cmd}')

    def __str__(self):
        # NOTE(review): the output contains appkey/apikey in clear text — avoid logging it.
        return f'''team: {self.team}
                    realm: {self.realm}
                    poolid: {self.poolid}
                    connectionid: {self.connectionid}
                    appkey: {self.appkey}
                    apikey: {self.apikey}
                    url: {self.url}'''.replace('                    ', '')

    def __repr__(self):
        return f'ibc-team {self.team} in {self.realm}.'

    def __init__(self, url):
        """Parse ``url`` and determine the application key (runs a shell command)."""
        parts, connectionflag = self.parse_url(url)
        self.url = url
        self.team = parts[0]
        self.realm = parts[1]
        self.poolid = parts[2]
        if connectionflag == 1:
            self.connectionid = parts[3]
        else:
            self.connectionid = None
        self.appkey = self.determine_appkey()
        self.apikey = self.appkey

    def get_values(self):
        """Return the attributes as a dictionary."""
        return {'team': self.team,
                'realm': self.realm,
                'poolid': self.poolid,
                'connectionid': self.connectionid,
                'appkey': self.appkey,
                'apikey': self.apikey,
                'url': self.url,
               }

    def _list_sftp_buckets(self):
        """Return the decoded bucket listing (list of dicts with 'id' and 'name')."""
        url = f'https://{self.team}.{self.realm}.celonis.cloud/storage-manager/api/buckets?feature=SFTP'
        header_json = {'Authorization': f'AppKey {self.appkey}', 'Accept': 'application/json'}
        listing = requests.get(url, headers=header_json).json()
        logging.debug(str(listing))
        return listing

    def find_buckets(self, name=None, id=None, local_file_path=None):
        """Return ``bucket`` objects for a local folder or for the team's buckets.

        With ``local_file_path`` a single local bucket is returned. Otherwise
        the buckets are listed and filtered: by ``id`` when it is given, else
        by ``name`` when it is given (the name used to be ignored, so a
        name-only lookup always came back empty), else not at all.
        """
        if local_file_path is not None:
            return [bucket(self.url, local_file_path=local_file_path)]
        listing = self._list_sftp_buckets()
        if id is not None:
            listing = [entry for entry in listing if entry['id'] == id]
        elif name is not None:
            listing = [entry for entry in listing if entry['name'] == name]
        return [bucket(self.url, entry['id'], entry['name']) for entry in listing]

    def validate_bucket(self, name=None, id=None):
        """Check that exactly one bucket has the given ``id`` or ``name``.

        Raises:
            ValueError: when neither argument is given or several buckets match.
            NameError: when no bucket matches.
        """
        if name is None and id is None:
            raise ValueError("either name or bucket id need to be specified.")
        result = [{'bucket_id': i['id'], 'bucket_name': i['name']}
                  for i in self._list_sftp_buckets()
                  if (i['id'] == id or i['name'] == name)]
        if len(result) == 0:
            raise NameError('invalid bucket id.')
        if len(result) > 1:
            raise ValueError('provided bucket identification is not unique.')

In [None]:
class bucket(ibc_team):
    """A file source of a team: either a storage bucket or a local folder."""

    def __init__(self, url, bucket_id=None, bucket_name=None, local_file_path=None):
        """Initialise the team part and validate the bucket id or the local folder.

        Raises:
            NameError: when neither ``bucket_id`` nor ``local_file_path`` is given.
        """
        super().__init__(url)
        if bucket_id is None and local_file_path is None:
            raise NameError('error initializing bucket as both bucket_id and local_file_path are None')
        if bucket_id is not None:
            # A bucket id takes precedence over a local path.
            super().validate_bucket(id=bucket_id)
        else:
            self.validate_folder(local_file_path)
        self.bucket_id = bucket_id
        self.bucket_name = bucket_name
        self.local_file_path = local_file_path

    def __str__(self):
        team_part = super().__str__()
        return f'{team_part}\nbucket_id: {self.bucket_id}\nlocal_file_path: {self.local_file_path}'

    def __repr__(self):
        team_part = super().__repr__()
        return f'bucket {self.bucket_id} in {team_part}'

    def validate_folder(self, path):
        """Raise NameError when ``path`` is no directory and ValueError when it is empty."""
        directory = Path(path)
        if not directory.is_dir():
            raise NameError('The folder you are refering to does not exist.')
        has_entries = any(True for _ in directory.glob('*'))
        if not has_entries:
            raise ValueError('The folder you specified is empty.')

    def find_folders(self, path_to_folder=''):
        """Collect a ``folder`` object for every (sub)folder that contains files.

        Sub-folders are visited first, so their results precede the entry of
        ``path_to_folder`` itself. For a bucket the content is requested from
        the storage manager, otherwise the local file system is listed.

        Returns:
            list of ``folder`` objects; None when an exception occurred (it is
            logged). A failing sub-folder therefore also makes the enclosing
            call return None.
        """
        files, folders, return_folders = [], [], []
        try:
            if self.bucket_id is None:
                for entry in Path(path_to_folder).glob('*'):
                    if entry.is_file():
                        files.append({'size': '', 'file': entry})
                    elif entry.is_dir():
                        folders.append(entry)
            else:
                url = f'https://{self.team}.{self.realm}.celonis.cloud/storage-manager/api/buckets/{self.bucket_id}/files?path=/' + path_to_folder
                header_json = {'Authorization': f'AppKey {self.appkey}', 'Accept': 'application/json'}
                file_response = requests.get(url, headers=header_json)
                children = file_response.json()['children']
                logging.debug(children)
                for child in children:
                    kind = child['type']
                    if kind == 'FILE':
                        files.append({'size': child['size'], 'file': (path_to_folder + child['filename'])})
                    elif kind == 'DIRECTORY':
                        folders.append(path_to_folder + child['filename'] + '/')
            logging.debug(f'{files}\n\n{folders}\n')
            for sub_folder in folders:
                return_folders.extend(self.find_folders(sub_folder))
            if files:
                return_folders.append(folder(url=self.url, bucket_id=self.bucket_id, folder=path_to_folder, files=files, local_file_path=path_to_folder))
            return return_folders
        except Exception as e:
            logging.error(f'{e}')

In [None]:
class folder(bucket):
    """A folder of a bucket (or of the local file system) together with the files listed in it."""

    def __init__(self, url, folder, files, bucket_id=None, local_file_path=None):
        """Initialise the parent ``bucket`` from either ``bucket_id`` or ``local_file_path``.

        Args:
            url: Passed through to ``bucket.__init__``.
            folder: Path of this folder.
            files: Entries with the keys ``'file'`` and ``'size'`` (the keys read
                by ``classify_files``).
            bucket_id: Takes precedence over ``local_file_path`` when not ``None``.
            local_file_path: Used when ``bucket_id`` is ``None``.

        Raises:
            NameError: Both ``bucket_id`` and ``local_file_path`` are ``None``.
        """
        if bucket_id is not None:
            super().__init__(url, bucket_id=bucket_id)
        elif local_file_path is not None:
            super().__init__(url, local_file_path=local_file_path)
        else:
            raise NameError('error initializing folder as both bucket_id and local_file_path are None')
        self.folder = folder
        self.files = files

    def __str__(self):
        return f'{super().__str__()}\nfolder: {self.folder}'

    def __repr__(self):
        return f'{self.folder} in {self.bucket_id}'

    def classify_files(self):
        """Turn every entry of ``self.files`` into an ``ibc_file`` and sort it into head or body.

        A file whose name matches ``sap_file_type`` is typed ``'sap'`` and is marked
        as header when its name contains ``'_HEADER_'``. For every other file the
        suffixes are compared (lower-cased) against ``generic_file_type``,
        ``compressed`` and ``encrypted``; several compression or encryption
        suffixes are concatenated in order of appearance.

        Files with neither a file type nor a compression are reported with a
        warning and skipped.

        Returns:
            tuple: ``(head, body)``. ``head`` holds all non-SAP files and the SAP
            header files, ``body`` holds the remaining SAP files. Both lists are
            empty when the folder has no files.
        """
        if not self.files:
            # ``files`` may be None (default of ibc_file) or empty; indexing
            # files[0] for the debug message below would raise in both cases.
            logging.debug(f'{self.folder} has no files to classify.')
            return [], []
        logging.debug(f'{self.files[0]} is the first file from {len(self.files)} to be classified.')
        head, body = [], []
        for file in self.files:
            file_characteristics = {
                'url': self.url,
                'bucket_id': self.bucket_id,
                'local_file_path': self.local_file_path,
                'folder': self.folder,
                'file': file['file'],
                'size': file['size'],
                'file_type': None,
                'header': False,
                'encryption': None,
                'compression': None,
            }
            p = Path(file['file'])
            logging.debug(f'file {p} has suffixes {p.suffixes}')

            if len(re.findall(sap_file_type, p.name)):
                file_characteristics['file_type'] = 'sap'
                if '_HEADER_' in p.name:
                    file_characteristics['header'] = True
            else:
                for s in p.suffixes:
                    suffix = s.lower()
                    if suffix in generic_file_type:
                        file_characteristics['file_type'] = suffix
                    elif suffix in compressed:
                        # e.g. '.tar' followed by '.gz' becomes '.tar.gz'
                        file_characteristics['compression'] = (file_characteristics['compression'] or '') + suffix
                    elif suffix in encrypted:
                        file_characteristics['encryption'] = (file_characteristics['encryption'] or '') + suffix
            logging.debug(f'file {file["file"]} has the following traits: {file_characteristics}')
            if file_characteristics['file_type'] is None and file_characteristics['compression'] is None:
                logging.warning(f'{file["file"]} with traits: {file_characteristics} is of wrong file type.')
                continue
            file_tmp = ibc_file(**file_characteristics)
            logging.debug(file_tmp)
            # Stand-alone files and SAP headers go into head; SAP sub files go into body.
            if file_characteristics['file_type'] != 'sap' or file_characteristics['header'] is True:
                head.append(file_tmp)
            else:
                body.append(file_tmp)
        return head, body

In [None]:
class ibc_file(folder):
    """One classified file inside a ``folder``, carrying the traits found for it."""
    # ABAP, Header, csv, gz, zip, 7z, pgp, gpg

    def __init__(self, url, folder, file, file_type, encryption, header, compression, size=None, files=None, bucket_id=None, local_file_path=None):
        """Initialise the parent ``folder`` and store the traits of this file."""
        super().__init__(url=url, folder=folder, files=files, bucket_id=bucket_id, local_file_path=local_file_path)
        # traits describing the file itself
        self.file, self.size = file, size
        self.file_type, self.header = file_type, header
        self.encryption, self.compression = encryption, compression
        # location attributes are set again here, exactly as they were passed in
        self.bucket_id, self.folder, self.local_file_path = bucket_id, folder, local_file_path

    def __str__(self):
        parent_text = super().__str__()
        return f'{parent_text}\nfile: {self.file}'

    def __repr__(self):
        return '{} of {} with header being {}'.format(self.file, self.file_type, self.header)

    def to_dict(self):
        """Return the traits as a plain dict; file, folder and local_file_path are converted with ``str``."""
        ordered_keys = ('file', 'file_type', 'encryption', 'header', 'compression',
                        'bucket_id', 'folder', 'size', 'local_file_path')
        as_text = ('file', 'folder', 'local_file_path')
        snapshot = {}
        for key in ordered_keys:
            value = getattr(self, key)
            snapshot[key] = str(value) if key in as_text else value
        return snapshot

In [None]:
# Team-level handle built from the target url. The cells below use it to find
# buckets (c.find_buckets) and read c.team, c.realm, c.apikey and c.poolid.
# `url` is presumably provided by cloud_upload_config — TODO confirm.
c = ibc_team(url)

In [None]:
# `data` is passed to import_sap_header / import_non_sap_file further down;
# presumably it lists tables that are already loaded and should be skipped — TODO confirm.
# NOTE(review): the lookup via determine_tables_loaded(c) is commented out, so both
# branches produce an empty list and `exclude_loaded` currently has no effect.
if exclude_loaded is True:
    data = [] #determine_tables_loaded(c)
else:
    data = []

In [None]:
# Build the `head` and `body` lists of ibc_file objects, either from the JSON
# files a previous run wrote (resume) or by scanning every bucket again.
if continue_from_last_time is True and Path('./head.json').is_file():
    logging.info('getting ibc_files from ML Workbench')
    if Path('./body.json').is_file():
        body = json_to_ibc_files('body.json', url)
    else:
        body = []
    head = json_to_ibc_files('head.json', url)
else:
    logging.info('getting ibc_files from SFTP')
    head, body = [], []
    buckets = c.find_buckets(local_file_path=local_file_path)
    for current_bucket in buckets:
        logging.info(f'started finding folders at: {datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}')
        bucket_folders = current_bucket.find_folders(path_to_folder)
        logging.info(f'finished finding folders at: {datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}')
        logging.info(f'started classifying files at: {datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}')
        # A failed listing may come back as None; treat it as "no folders".
        for current_folder in bucket_folders or []:
            # Guard each folder on its own so one broken folder does not abort the
            # rest of the bucket, and so the handler only references bound names.
            try:
                head_instance, body_instance = current_folder.classify_files()
            except Exception as e:
                logging.error(f'encounterd error {e} while processing {current_folder} in {bucket_folders}')
                continue
            head.extend(head_instance)
            body.extend(body_instance)
        logging.info(f'finished classifying files at: {datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}')
        # Persist after every bucket so a later run can resume from these files.
        ibc_files_to_json(head, 'head.json')
        ibc_files_to_json(body, 'body.json')

In [None]:
"""
lenght_before = len(head)
head = list(head)
for date in data:
    for h in head:
        if date == Path(h.file).name.split('_HEADER_')[0]:
            print(date, Path(h.file).name.split('_HEADER_')[0])
            head.remove(h)
head = tuple(head)
body = tuple(body)
"""
logging.info(f'finished classifying files. {len(head)} header and {len(body)} sub files were found.') # {lenght_before - len(head)} files were skipped.')

In [None]:
# Passed to import_sap_header as `location_indicator`; derived from the
# look_for_sap_files_globally switch of the configuration cell.
location_indicator = 'global' if look_for_sap_files_globally is True else 'local'

# Shared with the import functions and read by the polling cell afterwards;
# presumably they register one entry per started job — TODO confirm.
jobstatus = {}
uppie = cloud(tenant=c.team, realm=c.realm, api_key=c.apikey)

# NOTE(review): delta=False is hard-coded in both calls, so the `delta` switch
# computed in the configuration cell is not honoured here — confirm whether
# that is intended before wiring it through.
for header in head:
    if header.file_type == 'sap':
        import_sap_header(header, body, jobstatus, uppie, data, delta=False, location_indicator=location_indicator)
    else:
        # The original had a `continue` directly in front of this call, which made it
        # unreachable: every non-SAP entry of `head` was silently dropped.
        import_non_sap_file(header, jobstatus, uppie, data, delta=False)

In [None]:
logging.info('upload done.')
# Poll the job list until every job recorded in `jobstatus` has left the
# QUEUED/RUNNING states, then log the final outcome of each job.
# NOTE(review): a job id that never shows up in list_jobs() keeps this loop
# running forever — consider a timeout.
running = True
while running:
    jobs = uppie.list_jobs(c.poolid)
    for job_id in jobstatus:
        if jobstatus[job_id] is True:
            # already seen in a terminal state during an earlier poll
            continue
        for job in jobs:
            try:
                if job['id'] != job_id:
                    continue
                # DONE and every other status that is neither QUEUED nor RUNNING
                # (e.g. a failure) mean the job will not change any more.
                if job['status'] not in ('QUEUED', 'RUNNING'):
                    jobstatus[job_id] = True
                break
            except (KeyboardInterrupt, SystemExit):
                logging.error('terminating program\n')
                quit()
            except (KeyError, TypeError) as e:
                # Malformed job entry: skip it, but leave a trace instead of hiding it.
                logging.debug(f'skipping unreadable job entry {job!r}: {e!r}')
    if all(status is True for status in jobstatus.values()):
        running = False
        for job in jobs:
            if job['id'] in jobstatus:
                if job['status'] == 'DONE':
                    logging.info(f"{job['targetName']} was successfully installed in the database")
                else:
                    logging.error(f"{job['targetName']} failed with: {job}")
    else:
        # wait before asking the API again
        time.sleep(15)