In [None]:
import logging
import time
import zipfile
from datetime import datetime
from io import BytesIO
from os import listdir
from os.path import isfile, join

import fastparquet as fp
import pandas as pd
import pyarrow as pa
import requests
from pycelonis import get_celonis

class cloud:
    """Thin HTTP client for the data push job endpoints of one Celonis team.

    Every request is sent with a ``Bearer`` authorization header built from
    ``api_key``. No method checks the HTTP status of its response; callers
    that need that must inspect the returned value themselves.
    """

    def __init__(self, tenant, realm, api_key):
        """Store the team (``tenant``), ``realm`` and API key used for all calls."""
        self.tenant = tenant
        self.realm = realm
        self.api_key = api_key

    def get_api(self, path):
        """Return the absolute URL of ``path`` on this team's host."""
        return f"https://{self.tenant}.{self.realm}.celonis.cloud/{path}"

    def get_jobs_api(self, pool_id):
        """Return the jobs collection URL of ``pool_id`` (ends with ``/``)."""
        return self.get_api(f"integration/api/v1/data-push/{pool_id}/jobs/")

    def get_auth(self):
        """Return the authorization header dict sent with every request."""
        return {'authorization': f"Bearer {self.api_key}"}

    def list_jobs(self, pool_id):
        """GET the jobs of ``pool_id`` and return the decoded JSON body."""
        api = self.get_jobs_api(pool_id)
        return requests.get(api, headers=self.get_auth()).json()

    def delete_job(self, pool_id, job_id):
        """DELETE job ``job_id`` of ``pool_id`` and return the raw response."""
        # NOTE(review): get_jobs_api() already ends with "/", so this URL
        # contains "//" before the job id. Kept unchanged on purpose; confirm
        # against the API before normalising it.
        api = self.get_jobs_api(pool_id) + f"/{job_id}"
        return requests.delete(api, headers=self.get_auth())

    def create_job(self, pool_id, targetName, data_connection_id,
                   upsert=False):
        """POST a new job and return the decoded JSON body.

        Args:
            pool_id: Data pool the job belongs to.
            targetName: Sent as ``targetName`` in the request payload.
            data_connection_id: Sent as ``connectionId``; the key is left out
                of the payload when this value is falsy.
            upsert: ``True`` requests job type ``"DELTA"``, ``False``
                (default) requests ``"REPLACE"``.
        """
        payload = {
            'targetName': targetName,
            'type': "DELTA" if upsert else "REPLACE",
            'dataPoolId': pool_id,
        }
        if data_connection_id:
            payload['connectionId'] = data_connection_id
        api = self.get_jobs_api(pool_id)
        return requests.post(api, headers=self.get_auth(), json=payload).json()

    def push_new_dir(self, pool_id, job_id, dir_path):
        """Upload every ``*.parquet`` file directly inside ``dir_path``.

        Sub-directories are not descended into. Each file is opened in binary
        mode and its open handle is passed to ``push_new_chunk`` so that the
        file's contents (not its path string) are uploaded.
        """
        for name in listdir(dir_path):
            parquet_file = join(dir_path, name)
            if not isfile(parquet_file) or not parquet_file.endswith(".parquet"):
                continue
            logging.debug("Uploading chunk %s", parquet_file)
            with open(parquet_file, "rb") as chunk:
                self.push_new_chunk(pool_id, job_id, chunk)

    def push_new_chunk(self, pool_id, job_id, file_path):
        """POST one chunk to job ``job_id`` and return the raw response.

        Args:
            pool_id: Data pool the job belongs to.
            job_id: Job that receives the chunk.
            file_path: Despite its name, this value is handed to ``requests``
                as the *content* of the ``file`` form field. Pass ``bytes`` or
                an open binary file object; a path string would be sent as
                literal text rather than being opened.
        """
        api = self.get_jobs_api(pool_id) + f"/{job_id}/chunks/upserted"
        upload_file = {"file": file_path}
        return requests.post(api, files=upload_file, headers=self.get_auth())

    def submit_job(self, pool_id, job_id):
        """POST to the job's own URL (no body) and return the raw response."""
        api = self.get_jobs_api(pool_id) + f"/{job_id}/"
        return requests.post(api, headers=self.get_auth())

In [None]:
# Connection settings: replace every |placeholder| with the value for your team.
team = '|team|'
realm = '|realm|'
poolid = '|poolid|'
# appkey authenticates the storage-manager (file bucket) requests,
# apikey authenticates the data push job requests.
appkey = '|appkey|'
apikey = '|apikey|'
# Optional data connection id for the created jobs; None leaves it out.
connectionid = None
# True creates the jobs as upserts (DELTA), False as full replacements (REPLACE).
delta = False

# Print the id of every storage bucket that has the SFTP feature.
url = f'https://{team}.{realm}.celonis.cloud/storage-manager/api/buckets?feature=SFTP'
header_json = {'Authorization': f'AppKey {appkey}', 'Accept': 'application/json'}
file_response = requests.get(url, headers=header_json)
# Stop on an HTTP error status instead of failing later on an error payload.
file_response.raise_for_status()
buckets = [i['id'] for i in file_response.json()]
for bucket_id in buckets:
    print(bucket_id)

In [None]:
# Bucket to read from. Leave this empty to use the last bucket id that was
# listed above.
bucket_id = ''

if bucket_id == '' and buckets:
    bucket_id = buckets[-1]


# Folder inside the bucket to scan (an empty string means the root folder).
path_to_file = ''
# Passed as `encoding` to every pandas.read_csv call further down.
encoding = None

In [None]:
# Collect the names of all entries of type FILE found under `path_to_file`
# in the selected bucket.
url = f'https://{team}.{realm}.celonis.cloud/storage-manager/api/buckets/{bucket_id}/files?path=/' + path_to_file
header_json = {'Authorization': f'AppKey {appkey}', 'Accept': 'application/json'}
file_response = requests.get(url, headers=header_json)
# Stop on an HTTP error status instead of failing on a missing 'children' key.
file_response.raise_for_status()
files = [
    entry['filename']
    for entry in file_response.json()['children']
    if entry['type'] == 'FILE'
]

In [None]:
# Keep one header file per table prefix (the text before 'HEADER' in the name).
# Names are sorted in descending order, so the first header seen for a prefix
# is the one that sorts last -- presumably the most recent one if the names
# carry a timestamp (TODO confirm the naming scheme).
header = sorted([header for header in files if ('_HEADER' in header)], reverse=True)
# Start from an empty list so that a folder without any header file simply
# yields no uploads instead of raising IndexError.
header2 = []
for i in header:
    if all(i.split('HEADER')[0] not in h for h in header2):
        header2.append(i)
header = header2

# Maps the id of every job created below to False; the polling loop later
# flips the value to True once the job has finished.
jobstatus = {}
uppie = cloud(tenant=team, realm=realm, api_key=apikey)

# Headers for downloading raw file contents; identical for every request.
header_json = {'Authorization': 'AppKey {}'.format(appkey), 'Accept': 'application/octet-stream'}

# One data push job per header file: the header describes the columns, the
# matching data files are converted to parquet and pushed as chunks.
for i in header:
    print(i)
    # ref[0] is the part of the name before 'HEADER', ref[1] the part after
    # it without the '.csv' extension.
    ref = i.split('HEADER')
    ref[1] = ref[1].replace('.csv', '')
    # The target table name is ref[0] without its last character.
    targetname = ref[0][:-1]
    jobhandle = uppie.create_job(pool_id=poolid,
                                 data_connection_id=connectionid,
                                 targetName=targetname,
                                 upsert=delta)
    jobstatus[jobhandle['id']] = False

    # Move every file whose name contains both parts of the header name out
    # of `files`, so it cannot be matched again by a later header. Popping
    # from the back keeps the remaining indices valid.
    indices = [n for n, name in enumerate(files) if ref[0] in name and ref[1] in name]
    table_files = []
    for m in reversed(indices):
        table_files.append(files.pop(m))

    # The header file is a space separated list with one row per column.
    url = f'https://{team}.{realm}.celonis.cloud/storage-manager/api/buckets/{bucket_id}/files?path=/' + i
    with requests.get(url, headers=header_json, stream=False) as r:
        r.raise_for_status()
        df = pd.read_csv(BytesIO(r.content), header=None, dtype=str, sep=' ', names=['names', 'type', 'length', 'declength'], encoding=encoding)
    column_names = list(df['names'])

    for file in table_files:
        # Header files carry no data; skip them before downloading anything.
        if 'HEADER' in file:
            continue
        try:
            url = f'https://{team}.{realm}.celonis.cloud/storage-manager/api/buckets/{bucket_id}/files?path=/' + file
            with requests.get(url, headers=header_json, stream=False) as r:
                r.raise_for_status()
                content = r.content
            if file.split('.')[-1] == 'zip':
                # Only the first member of the archive is read.
                with zipfile.ZipFile(BytesIO(content)) as z:
                    fh = BytesIO(z.read(z.infolist()[0]))
            else:
                fh = BytesIO(content)
            df_up = pd.read_csv(fh, header=None, dtype='string', sep=';', names=column_names, quotechar='"', encoding=encoding)
            buffer = BytesIO()
            df_up.to_parquet(buffer, index=False, compression='snappy')
            response = uppie.push_new_chunk(pool_id=poolid, job_id=jobhandle['id'], file_path=buffer.getvalue())
            # Report a rejected chunk through the except branch below
            # instead of ignoring it.
            response.raise_for_status()
        except Exception as e:
            # Best effort: a broken file is reported and the others continue.
            print(f'{file} failed with error: {e}')
    uppie.submit_job(pool_id=poolid, job_id=jobhandle['id'])
print('upload done.')
# Poll the job list every 15 seconds until every job created above has been
# seen with a status other than QUEUED or RUNNING, then print one result line
# per job. A KeyboardInterrupt is not caught and stops the loop.
running = True
while running:
    jobs = uppie.list_jobs(poolid)
    if not isinstance(jobs, list):
        # Anything but a list (e.g. an error object) can never complete the
        # loop, so fail loudly instead of polling forever.
        raise RuntimeError(f'unexpected response while listing jobs: {jobs}')
    for job in jobs:
        if not isinstance(job, dict) or 'id' not in job or 'status' not in job:
            continue
        # Once a job is marked finished it stays finished.
        if job['id'] in jobstatus and job['status'] not in ('QUEUED', 'RUNNING'):
            jobstatus[job['id']] = True
    if all(status is True for status in jobstatus.values()):
        running = False
        for job in jobs:
            if job['id'] in jobstatus:
                if job['status'] == 'DONE':
                    print(f"{job['targetName']} was successfully installed in the database")
                else:
                    print(f"{job['targetName']} failed with: {job}")
    else:
        time.sleep(15)

In [None]:
# Print the outcome of every job created by this notebook again, based on the
# most recently fetched job list.
for i in jobs:
    if i['id'] not in jobstatus:
        continue
    if i['status'] != 'DONE':
        print(f"{i['targetName']} failed with: {i}")
        continue
    print(f"{i['targetName']} was successfully installed in the database")