| @@ -1,8 +1,7 @@ | ||
| pytest | ||
| numpy>=1.7.0 | ||
| pandas>=0.12.0 | ||
| impyla>=0.10.0 | ||
| psutil==0.6.1 | ||
| hdfs==1.4.3 | ||
| six |
| @@ -0,0 +1,379 @@ | ||
| #! /usr/bin/env python | ||
| # Copyright 2015 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import os | ||
| import shutil | ||
| import tempfile | ||
| import os.path as osp | ||
| from os.path import join as pjoin | ||
| from subprocess import check_call | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| import pandas.util.testing as tm | ||
| from click import group, option | ||
|
|
||
| import ibis | ||
| from ibis.compat import BytesIO | ||
| from ibis.common import IbisError | ||
| from ibis.tests.util import IbisTestEnv | ||
| from ibis.util import guid | ||
|
|
||
|
|
||
| ENV = IbisTestEnv() | ||
| IBIS_TEST_DATA_S3_BUCKET = 'ibis-test-resources' | ||
| IBIS_TEST_DATA_LOCAL_DIR = 'ibis-testing-data' | ||
| IBIS_TEST_DATA_TARBALL = 'ibis-testing-data.tar.gz' | ||
|
|
||
|
|
||
| def make_ibis_client(): | ||
| ic = ibis.impala.connect(host=ENV.impala_host, port=ENV.impala_port, | ||
| protocol=ENV.impala_protocol, | ||
| use_kerberos=ENV.use_kerberos) | ||
| if ENV.use_kerberos: | ||
| print("Warning: ignoring invalid Certificate Authority errors") | ||
| hc = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port, | ||
| use_kerberos=ENV.use_kerberos, | ||
| verify=(not ENV.use_kerberos)) | ||
| return ibis.make_client(ic, hdfs_client=hc) | ||
|
|
||
|
|
||
| def can_write_to_hdfs(con): | ||
| test_path = pjoin(ENV.test_data_dir, ibis.util.guid()) | ||
| test_file = BytesIO(ibis.util.guid()) | ||
| try: | ||
| con.hdfs.put(test_path, test_file) | ||
| con.hdfs.rm(test_path) | ||
| return True | ||
| except Exception: | ||
| return False | ||
|
|
||
|
|
||
| def can_build_udfs(): | ||
| try: | ||
| check_call('which cmake', shell=True) | ||
| check_call('which make', shell=True) | ||
| check_call('which clang++', shell=True) | ||
| return True | ||
| except Exception: | ||
| return False | ||
|
|
||
|
|
||
| def is_data_loaded(con): | ||
| if not con.hdfs.exists(ENV.test_data_dir): | ||
| return False | ||
| if not con.exists_database(ENV.test_data_db): | ||
| return False | ||
| return True | ||
|
|
||
|
|
||
| def is_udf_loaded(con): | ||
| bitcode_dir = pjoin(ENV.test_data_dir, 'udf') | ||
| if con.hdfs.exists(bitcode_dir): | ||
| return True | ||
| return False | ||
|
|
||
|
|
||
| def dnload_ibis_test_data_from_s3(local_path): | ||
| url = 'https://{0}.s3.amazonaws.com/{1}'.format( | ||
| IBIS_TEST_DATA_S3_BUCKET, IBIS_TEST_DATA_TARBALL) | ||
| cmd = 'cd {0} && wget -q {1} && tar -xzf {2}'.format( | ||
| local_path, url, IBIS_TEST_DATA_TARBALL) | ||
| check_call(cmd, shell=True) | ||
| data_dir = pjoin(local_path, IBIS_TEST_DATA_LOCAL_DIR) | ||
| print('Downloaded {0} and unpacked it to {1}'.format(url, data_dir)) | ||
| return data_dir | ||
|
|
||
|
|
||
| def upload_ibis_test_data_to_hdfs(con, data_path): | ||
| hdfs = con.hdfs | ||
| if hdfs.exists(ENV.test_data_dir): | ||
| hdfs.rmdir(ENV.test_data_dir) | ||
| hdfs.put(ENV.test_data_dir, data_path, verbose=True) | ||
|
|
||
|
|
||
| def create_test_database(con): | ||
| if con.exists_database(ENV.test_data_db): | ||
| con.drop_database(ENV.test_data_db, force=True) | ||
| con.create_database(ENV.test_data_db) | ||
| print('Created database {0}'.format(ENV.test_data_db)) | ||
|
|
||
|
|
||
| def create_parquet_tables(con): | ||
| parquet_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'parquet')) | ||
| schemas = { | ||
| 'functional_alltypes': ibis.schema( | ||
| [('id', 'int32'), | ||
| ('bool_col', 'boolean'), | ||
| ('tinyint_col', 'int8'), | ||
| ('smallint_col', 'int16'), | ||
| ('int_col', 'int32'), | ||
| ('bigint_col', 'int64'), | ||
| ('float_col', 'float'), | ||
| ('double_col', 'double'), | ||
| ('date_string_col', 'string'), | ||
| ('string_col', 'string'), | ||
| ('timestamp_col', 'timestamp'), | ||
| ('year', 'int32'), | ||
| ('month', 'int32')]), | ||
| 'tpch_region': ibis.schema( | ||
| [('r_regionkey', 'int16'), | ||
| ('r_name', 'string'), | ||
| ('r_comment', 'string')])} | ||
| tables = [] | ||
| for path in parquet_files: | ||
| head, table_name = osp.split(path) | ||
| print('Creating {0}'.format(table_name)) | ||
| # fall back to Impala's schema inference when no schema is given | ||
| schema = schemas.get(table_name) | ||
| table = con.parquet_file(path, schema=schema, name=table_name, | ||
| database=ENV.test_data_db, persist=True) | ||
| tables.append(table) | ||
| return tables | ||
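
After the tables are created, a quick interactive check can confirm they are queryable. A minimal sketch using the same ibis client (a hypothetical follow-up, not part of the load script):

```python
# Hypothetical sanity check after create_parquet_tables(con) has run:
# count the rows of one freshly created table via the same ibis client.
table = con.table('functional_alltypes', database=ENV.test_data_db)
print(table.count().execute())
```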
|
|
||
|
|
||
| def create_avro_tables(con): | ||
| avro_files = con.hdfs.ls(pjoin(ENV.test_data_dir, 'avro')) | ||
| schemas = { | ||
| 'tpch_region_avro': { | ||
| 'type': 'record', | ||
| 'name': 'a', | ||
| 'fields': [ | ||
| {'name': 'R_REGIONKEY', 'type': ['null', 'int']}, | ||
| {'name': 'R_NAME', 'type': ['null', 'string']}, | ||
| {'name': 'R_COMMENT', 'type': ['null', 'string']}]}} | ||
| tables = [] | ||
| for path in avro_files: | ||
| head, table_name = osp.split(path) | ||
| print('Creating {0}'.format(table_name)) | ||
| schema = schemas[table_name] | ||
| table = con.avro_file(path, schema, name=table_name, | ||
| database=ENV.test_data_db, persist=True) | ||
| tables.append(table) | ||
| return tables | ||
|
|
||
|
|
||
| def build_udfs(): | ||
| print('Building UDFs') | ||
| ibis_home_dir = osp.dirname(osp.dirname(osp.abspath(__file__))) | ||
| udf_dir = pjoin(ibis_home_dir, 'testing', 'udf') | ||
| check_call('cmake . && make', shell=True, cwd=udf_dir) | ||
|
|
||
|
|
||
| def upload_udfs(con): | ||
| ibis_home_dir = osp.dirname(osp.dirname(osp.abspath(__file__))) | ||
| build_dir = pjoin(ibis_home_dir, 'testing', 'udf', 'build') | ||
| bitcode_dir = pjoin(ENV.test_data_dir, 'udf') | ||
| print('Uploading UDFs to {0}'.format(bitcode_dir)) | ||
| if con.hdfs.exists(bitcode_dir): | ||
| con.hdfs.rmdir(bitcode_dir) | ||
| con.hdfs.put(bitcode_dir, build_dir, verbose=True) | ||
|
|
||
|
|
||
| def scrape_parquet_files(tmp_db, con): | ||
| to_scrape = [('tpch', x) for x in con.list_tables(database='tpch')] | ||
| to_scrape.append(('functional', 'alltypes')) | ||
| for db, tname in to_scrape: | ||
| table = con.table(tname, database=db) | ||
| new_name = '{0}_{1}'.format(db, tname) | ||
| print('Creating {0}'.format(new_name)) | ||
| con.create_table(new_name, table, database=tmp_db) | ||
|
|
||
|
|
||
| def download_parquet_files(con, tmp_db_hdfs_path): | ||
| parquet_path = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'parquet') | ||
| print("Downloading {0}".format(parquet_path)) | ||
| con.hdfs.get(tmp_db_hdfs_path, parquet_path) | ||
|
|
||
|
|
||
| def download_avro_files(con): | ||
| avro_hdfs_path = '/test-warehouse/tpch.region_avro' | ||
| avro_local_path = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'avro') | ||
| os.mkdir(avro_local_path) | ||
| print("Downloading {0}".format(avro_hdfs_path)) | ||
| con.hdfs.get(avro_hdfs_path, pjoin(avro_local_path, 'tpch_region_avro')) | ||
|
|
||
|
|
||
| def generate_csv_files(): | ||
| N = 10 | ||
| nfiles = 10 | ||
| df = pd.DataFrame({'foo': [tm.rands(10) for _ in xrange(N)], | ||
| 'bar': np.random.randn(N), | ||
| 'baz': np.random.randint(0, 100, size=N)}, | ||
| columns=['foo', 'bar', 'baz']) | ||
| csv_base = pjoin(IBIS_TEST_DATA_LOCAL_DIR, 'csv') | ||
| os.mkdir(csv_base) | ||
| for i in xrange(nfiles): | ||
| csv_path = pjoin(csv_base, '{0}.csv'.format(i)) | ||
| print('Writing {0}'.format(csv_path)) | ||
| df.to_csv(csv_path, index=False, header=False) | ||
|
|
||
|
|
||
| def copy_tarball_to_versioned_backup(bucket): | ||
| key = bucket.get_key(IBIS_TEST_DATA_TARBALL) | ||
| if key: | ||
| names = [k.name for k in bucket.list(prefix=IBIS_TEST_DATA_TARBALL)] | ||
| names.remove(IBIS_TEST_DATA_TARBALL) | ||
| # get the highest number for this key name | ||
| last = max([int(name.split('.')[-1]) for name in names] or [0]) | ||
| next_key = '{0}.{1}'.format(IBIS_TEST_DATA_TARBALL, last + 1) | ||
| key.copy(IBIS_TEST_DATA_S3_BUCKET, next_key) | ||
| key.delete() | ||
| assert bucket.get_key(IBIS_TEST_DATA_TARBALL) is None | ||
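
The backup scheme above copies the live tarball to the next free numeric suffix before it is replaced. A small illustration of the suffix arithmetic (example key names only, using the fixed iteration over `name` rather than `names`):

```python
# Existing versioned backups carry numeric suffixes after the tarball name.
names = ['ibis-testing-data.tar.gz.1', 'ibis-testing-data.tar.gz.2']
last = max(int(name.split('.')[-1]) for name in names)          # 2
next_key = 'ibis-testing-data.tar.gz.{0}'.format(last + 1)      # '...tar.gz.3'
```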
|
|
||
|
|
||
| # ========================================== | ||
|
|
||
|
|
||
| @group(context_settings={'help_option_names': ['-h', '--help']}) | ||
| def main(): | ||
| """Manage test data for Ibis""" | ||
| pass | ||
|
|
||
|
|
||
| @main.command() | ||
| def printenv(): | ||
| """Print current IbisTestEnv""" | ||
| print(str(ENV)) | ||
|
|
||
|
|
||
| @main.command() | ||
| @option('--create-tarball', is_flag=True, | ||
| help="Create a gzipped tarball") | ||
| @option('--push-to-s3', is_flag=True, | ||
| help="Also push the tarball to s3://ibis-test-resources") | ||
| def create(create_tarball, push_to_s3): | ||
| """Create Ibis test data""" | ||
| print(str(ENV)) | ||
|
|
||
| con = make_ibis_client() | ||
|
|
||
| # verify some assumptions before proceeding | ||
| if push_to_s3 and not create_tarball: | ||
| raise IbisError( | ||
| "Must specify --create-tarball if specifying --push-to-s3") | ||
| if osp.exists(IBIS_TEST_DATA_LOCAL_DIR): | ||
| raise IbisError( | ||
| 'Local dir {0} already exists; please remove it first'.format( | ||
| IBIS_TEST_DATA_LOCAL_DIR)) | ||
| if not con.exists_database('tpch'): | ||
| raise IbisError('`tpch` database does not exist') | ||
| if not con.hdfs.exists('/test-warehouse/tpch.region_avro'): | ||
| raise IbisError( | ||
| 'HDFS dir /test-warehouse/tpch.region_avro does not exist') | ||
|
|
||
| # generate tmp identifiers | ||
| tmp_db_hdfs_path = pjoin(ENV.tmp_dir, guid()) | ||
| tmp_db = guid() | ||
| os.mkdir(IBIS_TEST_DATA_LOCAL_DIR) | ||
| try: | ||
| # create the tmp data locally | ||
| con.create_database(tmp_db, path=tmp_db_hdfs_path) | ||
| print('Created database {0} at {1}'.format(tmp_db, tmp_db_hdfs_path)) | ||
|
|
||
| # create the local data set | ||
| scrape_parquet_files(tmp_db, con) | ||
| download_parquet_files(con, tmp_db_hdfs_path) | ||
| download_avro_files(con) | ||
| generate_csv_files() | ||
| finally: | ||
| con.drop_database(tmp_db, force=True) | ||
| assert not con.hdfs.exists(tmp_db_hdfs_path) | ||
|
|
||
| if create_tarball: | ||
| check_call('tar -czf {0} {1}'.format(IBIS_TEST_DATA_TARBALL, | ||
| IBIS_TEST_DATA_LOCAL_DIR), | ||
| shell=True) | ||
|
|
||
| if push_to_s3: | ||
| from boto.s3 import connect_to_region | ||
| s3_conn = connect_to_region('us-west-2') | ||
| bucket = s3_conn.get_bucket(IBIS_TEST_DATA_S3_BUCKET) | ||
| copy_tarball_to_versioned_backup(bucket) | ||
| key = bucket.new_key(IBIS_TEST_DATA_TARBALL) | ||
| print('Uploading tarball to S3') | ||
| key.set_contents_from_filename(IBIS_TEST_DATA_TARBALL, replace=False) | ||
|
|
||
|
|
||
| @main.command() | ||
| @option('--data/--no-data', default=True, help='Load (skip) ibis testing data') | ||
| @option('--udf/--no-udf', default=True, help='Build/upload (skip) test UDFs') | ||
| @option('--data-dir', | ||
| help='Path to testing data; downloads data from S3 if unset') | ||
| @option('--overwrite', is_flag=True, help='Forces overwriting of data/UDFs') | ||
| def load(data, udf, data_dir, overwrite): | ||
| """Load Ibis test data and build/upload UDFs""" | ||
| print(str(ENV)) | ||
|
|
||
| con = make_ibis_client() | ||
|
|
||
| # validate our environment before performing possibly expensive operations | ||
| if not can_write_to_hdfs(con): | ||
| raise IbisError('Failed to write to HDFS; check your settings') | ||
| if udf and not can_build_udfs(): | ||
| raise IbisError('Build environment does not support building UDFs') | ||
|
|
||
| # load the data files | ||
| if data and (overwrite or not is_data_loaded(con)): | ||
| tmp_dir = tempfile.mkdtemp(prefix='__ibis_tmp_') | ||
| try: | ||
| if not data_dir: | ||
| print('Did not specify a local dir with the test data, so ' | ||
| 'downloading it from S3') | ||
| data_dir = dnload_ibis_test_data_from_s3(tmp_dir) | ||
| upload_ibis_test_data_to_hdfs(con, data_dir) | ||
| create_test_database(con) | ||
| parquet_tables = create_parquet_tables(con) | ||
| avro_tables = create_avro_tables(con) | ||
| for table in parquet_tables + avro_tables: | ||
| print('Computing stats for {0}'.format(table.op().name)) | ||
| table.compute_stats() | ||
| finally: | ||
| shutil.rmtree(tmp_dir) | ||
|
|
||
| # build and upload the UDFs | ||
| if udf and (overwrite or not is_udf_loaded(con)): | ||
| build_udfs() | ||
| upload_udfs(con) | ||
|
|
||
|
|
||
| @main.command() | ||
| @option('--test-data', is_flag=True, | ||
| help='Cleanup Ibis test data, test database, and also the test UDFs ' | ||
| 'if they are stored in the test data directory/database') | ||
| @option('--udfs', is_flag=True, help='Cleanup Ibis test UDFs only') | ||
| @option('--tmp-data', is_flag=True, | ||
| help='Cleanup Ibis temporary HDFS directory') | ||
| @option('--tmp-db', is_flag=True, help='Cleanup Ibis temporary database') | ||
| def cleanup(test_data, udfs, tmp_data, tmp_db): | ||
| """Cleanup Ibis test data and UDFs""" | ||
| print(str(ENV)) | ||
|
|
||
| con = make_ibis_client() | ||
|
|
||
| if udfs: | ||
| # this comes before test_data because the latter clobbers it too | ||
| con.hdfs.rmdir(pjoin(ENV.test_data_dir, 'udf')) | ||
|
|
||
| if test_data: | ||
| con.drop_database(ENV.test_data_db, force=True) | ||
| con.hdfs.rmdir(ENV.test_data_dir) | ||
|
|
||
| if tmp_data: | ||
| con.hdfs.rmdir(ENV.tmp_dir) | ||
|
|
||
| if tmp_db: | ||
| con.drop_database(ENV.tmp_db, force=True) | ||
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| main() |
| @@ -0,0 +1,51 @@ | ||
| # Copyright 2012 Cloudera Inc. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| cmake_minimum_required(VERSION 2.6) | ||
|
|
||
| # where to put generated libraries | ||
| set(LIBRARY_OUTPUT_PATH "build") | ||
| # where to put generated binaries | ||
| set(EXECUTABLE_OUTPUT_PATH "build") | ||
|
|
||
| find_program(CLANG_EXECUTABLE clang++) | ||
|
|
||
| SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ggdb") | ||
|
|
||
| # Function to generate rule to cross compile a source file to an IR module. | ||
| # This should be called with the .cc src file and it will generate a | ||
| # src-file-ir target that can be built. | ||
| # e.g. COMPILE_TO_IR(test.cc) generates the "test-ir" make target. | ||
| set(IR_COMPILE_FLAGS "-emit-llvm" "-O3" "-c") | ||
| function(COMPILE_TO_IR SRC_FILE) | ||
| get_filename_component(BASE_NAME ${SRC_FILE} NAME_WE) | ||
| set(OUTPUT_FILE "build/${BASE_NAME}.ll") | ||
| add_custom_command( | ||
| OUTPUT ${OUTPUT_FILE} | ||
| COMMAND ${CLANG_EXECUTABLE} ${IR_COMPILE_FLAGS} ${SRC_FILE} -o ${OUTPUT_FILE} | ||
| DEPENDS ${SRC_FILE}) | ||
| add_custom_target(${BASE_NAME}-ir ALL DEPENDS ${OUTPUT_FILE}) | ||
| endfunction(COMPILE_TO_IR) | ||
|
|
||
| # Build the UDA/UDFs into a shared library. | ||
| add_library(udfsample SHARED udf-sample.cc) | ||
| add_library(udasample SHARED uda-sample.cc hyperloglog-uda.cc variance-uda.cc) | ||
|
|
||
| # Custom targets to cross compile UDA/UDF to IR | ||
| if (CLANG_EXECUTABLE) | ||
| COMPILE_TO_IR(udf-sample.cc) | ||
| COMPILE_TO_IR(uda-sample.cc) | ||
| endif(CLANG_EXECUTABLE) | ||
|
|
||
|
|
| @@ -0,0 +1,136 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include <assert.h> | ||
| #include <math.h> | ||
| #include <string.h> | ||
| #include <algorithm> | ||
| #include <sstream> | ||
| #include <iostream> | ||
| #include "lib/udf.h" | ||
|
|
||
| using namespace std; | ||
| using namespace impala_udf; | ||
|
|
||
| // This sample UDA implements the hyperloglog distinct estimate aggregate | ||
| // function. | ||
| // See these papers for more details. | ||
| // 1) Hyperloglog: The analysis of a near-optimal cardinality estimation algorithm (2007) | ||
| // 2) HyperLogLog in Practice | ||
|
|
||
| // Precision taken from the paper; values in [6,12] don't seem to change results much. | ||
| const int HLL_PRECISION = 10; | ||
|
|
||
| void HllInit(FunctionContext* ctx, StringVal* dst) { | ||
| int str_len = pow(2, HLL_PRECISION); | ||
| dst->is_null = false; | ||
| dst->ptr = ctx->Allocate(str_len); | ||
| dst->len = str_len; | ||
| memset(dst->ptr, 0, str_len); | ||
| } | ||
|
|
||
| static const uint64_t FNV64_PRIME = 1099511628211UL; | ||
| static const uint64_t FNV64_SEED = 14695981039346656037UL; | ||
|
|
||
| static uint64_t FnvHash(const void* data, int32_t bytes, uint64_t hash) { | ||
| const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data); | ||
| while (bytes--) { | ||
| hash = (*ptr ^ hash) * FNV64_PRIME; | ||
| ++ptr; | ||
| } | ||
| return hash; | ||
| } | ||
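
For cross-checking hash values outside of Impala, the same FNV loop is easy to mirror in Python; the only wrinkle is masking to 64 bits, which C gets for free from unsigned overflow. A sketch, not part of the UDA:

```python
FNV64_PRIME = 1099511628211
FNV64_SEED = 14695981039346656037
MASK64 = (1 << 64) - 1

def fnv_hash(data, h=FNV64_SEED):
    # XOR then multiply per byte, matching FnvHash() above.
    for byte in bytearray(data):
        h = ((byte ^ h) * FNV64_PRIME) & MASK64
    return h
```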
|
|
||
| static uint64_t Hash(const IntVal& v) { | ||
| return FnvHash(&v.val, sizeof(int32_t), FNV64_SEED); | ||
| } | ||
|
|
||
| void HllUpdate(FunctionContext* ctx, const IntVal& src, StringVal* dst) { | ||
| if (src.is_null) return; | ||
| assert(dst != NULL); | ||
| assert(!dst->is_null); | ||
| assert(dst->len == pow(2, HLL_PRECISION)); | ||
| uint64_t hash_value = Hash(src); | ||
| if (hash_value != 0) { | ||
| // Use the lower bits to index into the number of streams and then | ||
| // find the first 1 bit after the index bits. | ||
| int idx = hash_value % dst->len; | ||
| uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_PRECISION) + 1; | ||
| dst->ptr[idx] = ::max(dst->ptr[idx], first_one_bit); | ||
| } | ||
| } | ||
|
|
||
| void HllMerge(FunctionContext* ctx, const StringVal& src, StringVal* dst) { | ||
| assert(dst != NULL); | ||
| assert(!dst->is_null); | ||
| assert(!src.is_null); | ||
| assert(dst->len == pow(2, HLL_PRECISION)); | ||
| assert(src.len == pow(2, HLL_PRECISION)); | ||
| for (int i = 0; i < src.len; ++i) { | ||
| dst->ptr[i] = ::max(dst->ptr[i], src.ptr[i]); | ||
| } | ||
| } | ||
|
|
||
| const StringVal HllSerialize(FunctionContext* ctx, const StringVal& src) { | ||
| if (src.is_null) return src; | ||
| // Copy intermediate state into memory owned by Impala and free allocated memory | ||
| StringVal result(ctx, src.len); | ||
| memcpy(result.ptr, src.ptr, src.len); | ||
| ctx->Free(src.ptr); | ||
| return result; | ||
| } | ||
|
|
||
| StringVal HllFinalize(FunctionContext* ctx, const StringVal& src) { | ||
| assert(!src.is_null); | ||
| assert(src.len == pow(2, HLL_PRECISION)); | ||
|
|
||
| const int num_streams = pow(2, HLL_PRECISION); | ||
| // Empirical constants for the algorithm. | ||
| float alpha = 0; | ||
| if (num_streams == 16) { | ||
| alpha = 0.673f; | ||
| } else if (num_streams == 32) { | ||
| alpha = 0.697f; | ||
| } else if (num_streams == 64) { | ||
| alpha = 0.709f; | ||
| } else { | ||
| alpha = 0.7213f / (1 + 1.079f / num_streams); | ||
| } | ||
|
|
||
| float harmonic_mean = 0; | ||
| int num_zero_registers = 0; | ||
| for (int i = 0; i < src.len; ++i) { | ||
| harmonic_mean += powf(2.0f, -src.ptr[i]); | ||
| if (src.ptr[i] == 0) ++num_zero_registers; | ||
| } | ||
| harmonic_mean = 1.0f / harmonic_mean; | ||
| int64_t estimate = alpha * num_streams * num_streams * harmonic_mean; | ||
|
|
||
| if (num_zero_registers != 0) { | ||
| // Estimated cardinality is too low. Hll is too inaccurate here, instead use | ||
| // linear counting. | ||
| estimate = num_streams * log(static_cast<float>(num_streams) / num_zero_registers); | ||
| } | ||
|
|
||
| // Free allocated memory | ||
| ctx->Free(src.ptr); | ||
|
|
||
| // Output the estimate as ascii string | ||
| stringstream out; | ||
| out << estimate; | ||
| string out_str = out.str(); | ||
| StringVal result_str(ctx, out_str.size()); | ||
| memcpy(result_str.ptr, out_str.c_str(), result_str.len); | ||
| return result_str; | ||
| } | ||
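
To see the register mechanics end to end, here is a rough Python model of the update and finalize steps above (a sketch for intuition only; it assumes the same precision of 10 and glosses over the ctz edge case when the shifted hash is zero):

```python
import math

P = 10          # mirrors HLL_PRECISION
M = 2 ** P      # number of registers

def hll_update(registers, h):
    # Low P bits pick a register; rank = trailing zeros of the rest, plus one.
    if h == 0:
        return
    idx = h % M
    rest = h >> P
    rank = (rest & -rest).bit_length()   # position of lowest set bit
    registers[idx] = max(registers[idx], rank)

def hll_estimate(registers):
    alpha = 0.7213 / (1 + 1.079 / M)
    estimate = alpha * M * M / sum(2.0 ** -r for r in registers)
    zeros = registers.count(0)
    if zeros:
        # Linear counting for low cardinalities, as in HllFinalize().
        estimate = M * math.log(float(M) / zeros)
    return int(estimate)
```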
|
|
| @@ -0,0 +1,43 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
|
|
||
| #ifndef IMPALA_UDF_UDF_DEBUG_H | ||
| #define IMPALA_UDF_UDF_DEBUG_H | ||
|
|
||
| #include "udf.h" | ||
|
|
||
| #include <string> | ||
| #include <sstream> | ||
|
|
||
| namespace impala_udf { | ||
|
|
||
| template<typename T> | ||
| inline std::string DebugString(const T& val) { | ||
| if (val.is_null) return "NULL"; | ||
| std::stringstream ss; | ||
| ss << val.val; | ||
| return ss.str(); | ||
| } | ||
|
|
||
| template<> | ||
| inline std::string DebugString(const StringVal& val) { | ||
| if (val.is_null) return "NULL"; | ||
| return std::string(reinterpret_cast<const char*>(val.ptr), val.len); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| #endif | ||
|
|
| @@ -0,0 +1,180 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include "uda-sample.h" | ||
| #include <assert.h> | ||
| #include <sstream> | ||
|
|
||
| using namespace impala_udf; | ||
| using namespace std; | ||
|
|
||
| template <typename T> | ||
| StringVal ToStringVal(FunctionContext* context, const T& val) { | ||
| stringstream ss; | ||
| ss << val; | ||
| string str = ss.str(); | ||
| StringVal string_val(context, str.size()); | ||
| memcpy(string_val.ptr, str.c_str(), str.size()); | ||
| return string_val; | ||
| } | ||
|
|
||
| template <> | ||
| StringVal ToStringVal<DoubleVal>(FunctionContext* context, const DoubleVal& val) { | ||
| if (val.is_null) return StringVal::null(); | ||
| return ToStringVal(context, val.val); | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // This is a sample of implementing a COUNT aggregate function. | ||
| // --------------------------------------------------------------------------- | ||
| void CountInit(FunctionContext* context, BigIntVal* val) { | ||
| val->is_null = false; | ||
| val->val = 0; | ||
| } | ||
|
|
||
| void CountUpdate(FunctionContext* context, const IntVal& input, BigIntVal* val) { | ||
| if (input.is_null) return; | ||
| ++val->val; | ||
| } | ||
|
|
||
| void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) { | ||
| dst->val += src.val; | ||
| } | ||
|
|
||
| BigIntVal CountFinalize(FunctionContext* context, const BigIntVal& val) { | ||
| return val; | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // This is a sample of implementing an AVG aggregate function. | ||
| // --------------------------------------------------------------------------- | ||
| struct AvgStruct { | ||
| double sum; | ||
| int64_t count; | ||
| }; | ||
|
|
||
| // Initialize the StringVal intermediate to a zero'd AvgStruct | ||
| void AvgInit(FunctionContext* context, StringVal* val) { | ||
| val->is_null = false; | ||
| val->len = sizeof(AvgStruct); | ||
| val->ptr = context->Allocate(val->len); | ||
| memset(val->ptr, 0, val->len); | ||
| } | ||
|
|
||
| void AvgUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val) { | ||
| if (input.is_null) return; | ||
| assert(!val->is_null); | ||
| assert(val->len == sizeof(AvgStruct)); | ||
| AvgStruct* avg = reinterpret_cast<AvgStruct*>(val->ptr); | ||
| avg->sum += input.val; | ||
| ++avg->count; | ||
| } | ||
|
|
||
| void AvgMerge(FunctionContext* context, const StringVal& src, StringVal* dst) { | ||
| if (src.is_null) return; | ||
| const AvgStruct* src_avg = reinterpret_cast<const AvgStruct*>(src.ptr); | ||
| AvgStruct* dst_avg = reinterpret_cast<AvgStruct*>(dst->ptr); | ||
| dst_avg->sum += src_avg->sum; | ||
| dst_avg->count += src_avg->count; | ||
| } | ||
|
|
||
| // A serialize function is necessary to free the intermediate state allocation. We use the | ||
| // StringVal constructor to allocate memory owned by Impala, copy the intermediate state, | ||
| // and free the original allocation. Note that memory allocated by the StringVal ctor is | ||
| // not necessarily persisted across UDA function calls, which is why we don't use it in | ||
| // AvgInit(). | ||
| const StringVal AvgSerialize(FunctionContext* context, const StringVal& val) { | ||
| assert(!val.is_null); | ||
| StringVal result(context, val.len); | ||
| memcpy(result.ptr, val.ptr, val.len); | ||
| context->Free(val.ptr); | ||
| return result; | ||
| } | ||
|
|
||
| StringVal AvgFinalize(FunctionContext* context, const StringVal& val) { | ||
| assert(!val.is_null); | ||
| assert(val.len == sizeof(AvgStruct)); | ||
| AvgStruct* avg = reinterpret_cast<AvgStruct*>(val.ptr); | ||
| StringVal result; | ||
| if (avg->count == 0) { | ||
| result = StringVal::null(); | ||
| } else { | ||
| // Copies the result to memory owned by Impala | ||
| result = ToStringVal(context, avg->sum / avg->count); | ||
| } | ||
| context->Free(val.ptr); | ||
| return result; | ||
| } | ||
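
The UDA lifecycle is easier to see outside of C++. Below is a hedged Python model of how a query engine might drive this AVG aggregate across two nodes; it mimics the call order (init, per-row update, cross-node merge, finalize), not Impala's actual execution:

```python
class AvgState(object):
    def __init__(self):              # AvgInit: zeroed intermediate
        self.sum, self.count = 0.0, 0

def avg_update(state, value):        # AvgUpdate: fold in one row, skip NULLs
    if value is not None:
        state.sum += value
        state.count += 1

def avg_merge(dst, src):             # AvgMerge: combine per-node partials
    dst.sum += src.sum
    dst.count += src.count

def avg_finalize(state):             # AvgFinalize: NULL on empty input
    return None if state.count == 0 else state.sum / state.count

# Two "nodes" each aggregate a slice of the data, then merge.
a, b = AvgState(), AvgState()
for v in [1.0, 2.0]:
    avg_update(a, v)
for v in [3.0, None]:
    avg_update(b, v)
avg_merge(a, b)
print(avg_finalize(a))  # 2.0
```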
|
|
||
| // --------------------------------------------------------------------------- | ||
| // This is a sample of implementing the STRING_CONCAT aggregate function. | ||
| // Example: select string_concat(string_col, ",") from table | ||
| // --------------------------------------------------------------------------- | ||
| // Delimiter to use if the separator is NULL. | ||
| static const StringVal DEFAULT_STRING_CONCAT_DELIM((uint8_t*)", ", 2); | ||
|
|
||
| void StringConcatInit(FunctionContext* context, StringVal* val) { | ||
| val->is_null = true; | ||
| } | ||
|
|
||
| void StringConcatUpdate(FunctionContext* context, const StringVal& str, | ||
| const StringVal& separator, StringVal* result) { | ||
| if (str.is_null) return; | ||
| if (result->is_null) { | ||
| // This is the first string, simply set the result to be the value. | ||
| uint8_t* copy = context->Allocate(str.len); | ||
| memcpy(copy, str.ptr, str.len); | ||
| *result = StringVal(copy, str.len); | ||
| return; | ||
| } | ||
|
|
||
| const StringVal* sep_ptr = separator.is_null ? &DEFAULT_STRING_CONCAT_DELIM : | ||
| &separator; | ||
|
|
||
| // We need to grow the result buffer and then append the new string and | ||
| // separator. | ||
| int new_size = result->len + sep_ptr->len + str.len; | ||
| result->ptr = context->Reallocate(result->ptr, new_size); | ||
| memcpy(result->ptr + result->len, sep_ptr->ptr, sep_ptr->len); | ||
| result->len += sep_ptr->len; | ||
| memcpy(result->ptr + result->len, str.ptr, str.len); | ||
| result->len += str.len; | ||
| } | ||
|
|
||
| void StringConcatMerge(FunctionContext* context, const StringVal& src, StringVal* dst) { | ||
| if (src.is_null) return; | ||
| StringConcatUpdate(context, src, ",", dst); | ||
| } | ||
|
|
||
| // A serialize function is necessary to free the intermediate state allocation. We use the | ||
| // StringVal constructor to allocate memory owned by Impala, copy the intermediate | ||
| // StringVal, and free the intermediate's memory. Note that memory allocated by the | ||
| // StringVal ctor is not necessarily persisted across UDA function calls, which is why we | ||
| // don't use it in StringConcatUpdate(). | ||
| const StringVal StringConcatSerialize(FunctionContext* context, const StringVal& val) { | ||
| if (val.is_null) return val; | ||
| StringVal result(context, val.len); | ||
| memcpy(result.ptr, val.ptr, val.len); | ||
| context->Free(val.ptr); | ||
| return result; | ||
| } | ||
|
|
||
| // Same as StringConcatSerialize(). | ||
| StringVal StringConcatFinalize(FunctionContext* context, const StringVal& val) { | ||
| if (val.is_null) return val; | ||
| StringVal result(context, val.len); | ||
| memcpy(result.ptr, val.ptr, val.len); | ||
| context->Free(val.ptr); | ||
| return result; | ||
| } |
| @@ -0,0 +1,125 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
|
|
||
| #ifndef SAMPLES_UDA_H | ||
| #define SAMPLES_UDA_H | ||
|
|
||
| #include "lib/udf.h" | ||
|
|
||
| using namespace impala_udf; | ||
|
|
||
| // Note: As of Impala 1.2, UDAs must have the same intermediate and result types (see the | ||
| // udf.h header for the full Impala UDA specification, which can be found at | ||
| // https://github.com/cloudera/impala/blob/master/be/src/udf/udf.h). Some UDAs naturally | ||
| // conform to this limitation, such as Count and StringConcat. However, other UDAs return | ||
| // a numeric value but use a custom intermediate struct type that must be stored in a | ||
| // StringVal or BufferVal, such as Variance. | ||
| // | ||
| // As a workaround for now, these UDAs that require an intermediate buffer use StringVal | ||
| // for the intermediate and result type. In the UDAs' finalize functions, the numeric | ||
| // result is serialized to an ASCII string (see the ToStringVal() utility function | ||
| // provided with these samples). The returned StringVal is then cast back to the correct | ||
| // numeric type (see the Usage examples below). | ||
| // | ||
| // This restriction will be lifted in Impala 2.0. | ||
|
|
||
|
|
||
| // This is an example of the COUNT aggregate function. | ||
| // | ||
| // Usage: > create aggregate function my_count(int) returns bigint | ||
| // location '/user/cloudera/libudasample.so' update_fn='CountUpdate'; | ||
| // > select my_count(col) from tbl; | ||
| void CountInit(FunctionContext* context, BigIntVal* val); | ||
| void CountUpdate(FunctionContext* context, const IntVal& input, BigIntVal* val); | ||
| void CountMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst); | ||
| BigIntVal CountFinalize(FunctionContext* context, const BigIntVal& val); | ||
|
|
||
| // This is an example of the AVG(double) aggregate function. This function needs to | ||
| // maintain two pieces of state, the current sum and the count, packed into a | ||
| // 16-byte buffer (8 byte sum + 8 byte count). For now the buffer is carried in a | ||
| // StringVal intermediate; see the TODO below. | ||
| // | ||
| // Usage: > create aggregate function my_avg(double) returns string | ||
| // location '/user/cloudera/libudasample.so' update_fn='AvgUpdate'; | ||
| // > select cast(my_avg(col) as double) from tbl; | ||
| // | ||
| // TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal | ||
| // and the return type changed to DoubleVal in Impala 2.0 | ||
| void AvgInit(FunctionContext* context, StringVal* val); | ||
| void AvgUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val); | ||
| void AvgMerge(FunctionContext* context, const StringVal& src, StringVal* dst); | ||
| const StringVal AvgSerialize(FunctionContext* context, const StringVal& val); | ||
| StringVal AvgFinalize(FunctionContext* context, const StringVal& val); | ||
|
|
||
| // This is a sample of implementing the STRING_CONCAT aggregate function. | ||
| // | ||
| // Usage: > create aggregate function string_concat(string, string) returns string | ||
| // location '/user/cloudera/libudasample.so' update_fn='StringConcatUpdate'; | ||
| // > select string_concat(string_col, ",") from table; | ||
| void StringConcatInit(FunctionContext* context, StringVal* val); | ||
| void StringConcatUpdate(FunctionContext* context, const StringVal& arg1, | ||
| const StringVal& arg2, StringVal* val); | ||
| void StringConcatMerge(FunctionContext* context, const StringVal& src, StringVal* dst); | ||
| const StringVal StringConcatSerialize(FunctionContext* context, const StringVal& val); | ||
| StringVal StringConcatFinalize(FunctionContext* context, const StringVal& val); | ||
|
|
||
| // This is an example of the variance aggregate function. | ||
| // | ||
| // Usage: > create aggregate function var(double) returns string | ||
| // location '/user/cloudera/libudasample.so' update_fn='VarianceUpdate'; | ||
| // > select cast(var(col) as double) from tbl; | ||
| // | ||
| // TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal | ||
| // and the return type changed to DoubleVal in Impala 2.0 | ||
| void VarianceInit(FunctionContext* context, StringVal* val); | ||
| void VarianceUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val); | ||
| void VarianceMerge(FunctionContext* context, const StringVal& src, StringVal* dst); | ||
| const StringVal VarianceSerialize(FunctionContext* context, const StringVal& val); | ||
| StringVal VarianceFinalize(FunctionContext* context, const StringVal& val); | ||
|
|
||
| // An implementation of the Knuth online variance algorithm, which is also single pass and | ||
| // more numerically stable. | ||
| // | ||
| // Usage: > create aggregate function knuth_var(double) returns string | ||
| // location '/user/cloudera/libudasample.so' update_fn='KnuthVarianceUpdate'; | ||
| // > select cast(knuth_var(col) as double) from tbl; | ||
| // | ||
| // TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal | ||
| // and the return type changed to DoubleVal in Impala 2.0 | ||
| void KnuthVarianceInit(FunctionContext* context, StringVal* val); | ||
| void KnuthVarianceUpdate(FunctionContext* context, const DoubleVal& input, StringVal* val); | ||
| void KnuthVarianceMerge(FunctionContext* context, const StringVal& src, StringVal* dst); | ||
| const StringVal KnuthVarianceSerialize(FunctionContext* context, const StringVal& val); | ||
| StringVal KnuthVarianceFinalize(FunctionContext* context, const StringVal& val); | ||
|
|
||
| // The different steps of the UDA are composable. In this case, the UDA reuses the | ||
| // other steps from the Knuth variance computation. | ||
| // | ||
| // Usage: > create aggregate function stddev(double) returns string | ||
| // location '/user/cloudera/libudasample.so' update_fn='KnuthVarianceUpdate' | ||
| // finalize_fn="StdDevFinalize"; | ||
| // > select cast(stddev(col) as double) from tbl; | ||
| // | ||
| // TODO: The StringVal intermediate type should be replaced by a preallocated BufferVal | ||
| // and the return type changed to DoubleVal in Impala 2.0 | ||
| StringVal StdDevFinalize(FunctionContext* context, const StringVal& val); | ||
|
|
||
| // Utility function for serialization to StringVal | ||
| // TODO: this will be unnecessary in Impala 2.0, when we will no longer have to serialize | ||
| // results to StringVals in order to match the intermediate type | ||
| template <typename T> | ||
| StringVal ToStringVal(FunctionContext* context, const T& val); | ||
|
|
||
| #endif |
| @@ -0,0 +1,214 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include "udf-sample.h" | ||
|
|
||
| #include <cctype> | ||
| #include <cmath> | ||
| #include <cstring> | ||
| #include <string> | ||
|
|
||
| // In this sample we are declaring a UDF that adds two ints and returns an int. | ||
| IntVal AddUdf(FunctionContext* context, const IntVal& arg1, const IntVal& arg2) { | ||
| if (arg1.is_null || arg2.is_null) return IntVal::null(); | ||
| return IntVal(arg1.val + arg2.val); | ||
| } | ||
|
|
||
| // Multiple UDFs can be defined in the same file | ||
|
|
||
| BooleanVal FuzzyEquals(FunctionContext* ctx, const DoubleVal& x, const DoubleVal& y) { | ||
| const double EPSILON = 0.000001f; | ||
| if (x.is_null || y.is_null) return BooleanVal::null(); | ||
| double delta = fabs(x.val - y.val); | ||
| return BooleanVal(delta < EPSILON); | ||
| } | ||
|
|
||
| // Check if the input string has any occurrences of the letters (a,e,i,o,u). | ||
| // Case-insensitive, so also detects (A,E,I,O,U). | ||
| BooleanVal HasVowels(FunctionContext* context, const StringVal& input) { | ||
| if (input.is_null) return BooleanVal::null(); | ||
|
|
||
| int index; | ||
| uint8_t *ptr; | ||
|
|
||
| for (ptr = input.ptr, index = 0; index < input.len; index++, ptr++) { | ||
| uint8_t c = tolower(*ptr); | ||
| if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') { | ||
| return BooleanVal(true); | ||
| } | ||
| } | ||
| return BooleanVal(false); | ||
| } | ||
|
|
||
| // Count all occurrences of the letters (a,e,i,o,u) in the input string. | ||
| // Case-insensitive, so also counts (A,E,I,O,U). | ||
| IntVal CountVowels(FunctionContext* context, const StringVal& arg1) { | ||
| if (arg1.is_null) return IntVal::null(); | ||
|
|
||
| int count; | ||
| int index; | ||
| uint8_t *ptr; | ||
|
|
||
| for (ptr = arg1.ptr, count = 0, index = 0; index < arg1.len; index++, ptr++) { | ||
| uint8_t c = tolower(*ptr); | ||
| if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') { | ||
| count++; | ||
| } | ||
| } | ||
| return IntVal(count); | ||
| } | ||
|
|
||
| // Remove all occurrences of the letters (a,e,i,o,u) from the input string. | ||
| // Case-insensitive, so also removes (A,E,I,O,U). | ||
| StringVal StripVowels(FunctionContext* context, const StringVal& arg1) { | ||
| if (arg1.is_null) return StringVal::null(); | ||
|
|
||
| int index; | ||
| std::string original((const char *)arg1.ptr,arg1.len); | ||
| std::string shorter(""); | ||
|
|
||
| for (index = 0; index < original.length(); index++) { | ||
| uint8_t c = original[index]; | ||
| uint8_t l = tolower(c); | ||
|
|
||
| if (l == 'a' || l == 'e' || l == 'i' || l == 'o' || l == 'u') { | ||
| continue; | ||
| } | ||
| else { | ||
| shorter.append(1, (char)c); | ||
| } | ||
| } | ||
| // The modified string is stored in 'shorter', which is destroyed when this function | ||
| // ends. We need to make a string val and copy the contents. | ||
| // NB: Only the version of the ctor that takes a context object allocates new memory. | ||
| StringVal result(context, shorter.size()); | ||
| memcpy(result.ptr, shorter.c_str(), shorter.size()); | ||
| return result; | ||
| } | ||
|
|
||
| // In the prepare function, allocate an IntVal and set it as the shared state. This | ||
| // IntVal will be set to the result to be returned, i.e. the argument if it's constant | ||
| // and null otherwise. | ||
| void ReturnConstantArgPrepare( | ||
| FunctionContext* context, FunctionContext::FunctionStateScope scope) { | ||
| // UDFs should check the version to avoid unimplemented functions from being called | ||
| if (context->version() < FunctionContext::v1_3) { | ||
| context->SetError("This UDF can only be used with Impala 1.3 or higher"); | ||
| return; | ||
| } | ||
| // TODO: this can be FRAGMENT_LOCAL once it's implemented since we're creating | ||
| // read-only state | ||
| if (scope == FunctionContext::THREAD_LOCAL) { | ||
| // Get the constant value of the 'const_val' argument in ReturnConstantArg(). If this | ||
| // value is not constant, 'arg' will be NULL. | ||
| IntVal* arg = reinterpret_cast<IntVal*>(context->GetConstantArg(0)); | ||
| // Allocate shared state to store 'arg' or a null IntVal | ||
| IntVal* state = reinterpret_cast<IntVal*>(context->Allocate(sizeof(IntVal))); | ||
| *state = (arg != NULL) ? *arg : IntVal::null(); | ||
| // Set the shared state in the function context | ||
| context->SetFunctionState(scope, state); | ||
| } | ||
| } | ||
|
|
||
| // Retrieves and returns the shared state set in the prepare function | ||
| IntVal ReturnConstantArg(FunctionContext* context, const IntVal& const_val) { | ||
| IntVal* state = reinterpret_cast<IntVal*>( | ||
| context->GetFunctionState(FunctionContext::THREAD_LOCAL)); | ||
| return *state; | ||
| } | ||
|
|
||
| // Cleans up the shared state | ||
| void ReturnConstantArgClose( | ||
| FunctionContext* context, FunctionContext::FunctionStateScope scope) { | ||
| if (scope == FunctionContext::THREAD_LOCAL) { | ||
| // Retrieve and deallocate the shared state | ||
| void* state = context->GetFunctionState(scope); | ||
| context->Free(reinterpret_cast<uint8_t*>(state)); | ||
| context->SetFunctionState(scope, NULL); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| BooleanVal Identity(FunctionContext* context, const BooleanVal& arg) { return arg; } | ||
|
|
||
| TinyIntVal Identity(FunctionContext* context, const TinyIntVal& arg) { return arg; } | ||
|
|
||
| SmallIntVal Identity(FunctionContext* context, const SmallIntVal& arg) { return arg; } | ||
|
|
||
| IntVal Identity(FunctionContext* context, const IntVal& arg) { return arg; } | ||
|
|
||
| BigIntVal Identity(FunctionContext* context, const BigIntVal& arg) { return arg; } | ||
|
|
||
| FloatVal Identity(FunctionContext* context, const FloatVal& arg) { return arg; } | ||
|
|
||
| DoubleVal Identity(FunctionContext* context, const DoubleVal& arg) { return arg; } | ||
|
|
||
| StringVal Identity(FunctionContext* context, const StringVal& arg) { return arg; } | ||
|
|
||
| TimestampVal Identity(FunctionContext* context, const TimestampVal& arg) { return arg; } | ||
|
|
||
| DecimalVal Identity(FunctionContext* context, const DecimalVal& arg) { return arg; } | ||
|
|
||
| IntVal AlmostAllTypes( | ||
| FunctionContext* context, const StringVal& string, const BooleanVal& boolean, | ||
| const TinyIntVal& tiny_int, const SmallIntVal& small_int, const IntVal& int_val, | ||
| const BigIntVal& big_int, const FloatVal& float_val, const DoubleVal& double_val | ||
| ) { | ||
| int result = string.len + boolean.val + tiny_int.val + small_int.val + int_val.val | ||
| + big_int.val + static_cast<int64_t>(float_val.val) | ||
| + static_cast<int64_t>(double_val.val); | ||
| return IntVal(result); | ||
| } | ||
|
|
||
| IntVal AllTypes( | ||
| FunctionContext* context, const StringVal& string, const BooleanVal& boolean, | ||
| const TinyIntVal& tiny_int, const SmallIntVal& small_int, const IntVal& int_val, | ||
| const BigIntVal& big_int, const FloatVal& float_val, const DoubleVal& double_val, | ||
| const DecimalVal& decimal) { | ||
| int result = string.len + boolean.val + tiny_int.val + small_int.val + int_val.val | ||
| + big_int.val + static_cast<int64_t>(float_val.val) | ||
| + static_cast<int64_t>(double_val.val) + decimal.val4; | ||
| return IntVal(result); | ||
| } | ||
|
|
||
|
|
||
| IntVal TwoArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2) { | ||
| return IntVal(v1.val + v2.val); | ||
| } | ||
|
|
||
| IntVal FourArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2, | ||
| const IntVal& v3, const IntVal& v4) { | ||
| return IntVal(v1.val + v2.val + v3.val + v4.val); | ||
| } | ||
|
|
||
| IntVal FiveArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2, | ||
| const IntVal& v3, const IntVal& v4, const IntVal& v5) { | ||
| return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val); | ||
| } | ||
|
|
||
| IntVal SixArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2, | ||
| const IntVal& v3, const IntVal& v4, const IntVal& v5, const IntVal& v6) { | ||
| return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val + v6.val); | ||
| } | ||
|
|
||
| IntVal SevenArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2, | ||
| const IntVal& v3, const IntVal& v4, const IntVal& v5, const IntVal& v6, | ||
| const IntVal& v7) { | ||
| return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val + v6.val + v7.val); | ||
| } | ||
|
|
||
| IntVal EightArgs(FunctionContext* context, const IntVal& v1, const IntVal& v2, | ||
| const IntVal& v3, const IntVal& v4, const IntVal& v5, const IntVal& v6, | ||
| const IntVal& v7, const IntVal& v8) { | ||
| return IntVal(v1.val + v2.val + v3.val + v4.val + v5.val + v6.val + v7.val + v8.val); | ||
| } |
| @@ -0,0 +1,71 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
|
|
||
| #ifndef SAMPLES_UDF_H | ||
| #define SAMPLES_UDF_H | ||
|
|
||
| #include "lib/udf.h" | ||
|
|
||
| using namespace impala_udf; | ||
|
|
||
| // Usage: > create function add(int, int) returns int | ||
| // location '/user/cloudera/libudfsample.so' SYMBOL='AddUdf'; | ||
| // > select add(1, 2); | ||
| IntVal AddUdf(FunctionContext* context, const IntVal& arg1, const IntVal& arg2); | ||
|
|
||
| // Returns true if x is approximately equal to y. | ||
| // Usage: > create function fuzzy_equals(double, double) returns boolean | ||
| // location '/user/cloudera/libudfsample.so' SYMBOL='FuzzyEquals'; | ||
| // > select fuzzy_equals(1, 1.00000001); | ||
| BooleanVal FuzzyEquals(FunctionContext* context, const DoubleVal& x, const DoubleVal& y); | ||
|
|
||
| // Perform tests, calculations, and transformations | ||
| // on a string value, using the set of letters 'aeiou'. | ||
|
|
||
| // Usage: > create function hasvowels(string) returns boolean | ||
| // location '/user/cloudera/libudfsample.so' SYMBOL='HasVowels'; | ||
| // > select hasvowels('banana'); | ||
| // > select hasvowels('grr hm shhh'); | ||
| // > select hasvowels(c1) from t1; | ||
| BooleanVal HasVowels(FunctionContext* context, const StringVal& input); | ||
|
|
||
|
|
||
| // Usage: > create function countvowels(string) returns int | ||
| // location '/user/cloudera/libudfsample.so' SYMBOL='CountVowels'; | ||
| // > select countvowels('abracadabra hocus pocus'); | ||
| // > select countvowels(c1) from t1; | ||
| IntVal CountVowels(FunctionContext* context, const StringVal& arg1); | ||
|
|
||
| // Usage: > create function stripvowels(string) returns string | ||
| // location '/user/cloudera/libudfsample.so' SYMBOL='StripVowels'; | ||
| // > select stripvowels('colour color'); | ||
| // > select stripvowels(c1) from t1; | ||
| StringVal StripVowels(FunctionContext* context, const StringVal& arg1); | ||
|
|
||
| // If 'val' is constant, returns 'val', otherwise returns null. This is a simple toy UDF | ||
| // demonstrating how to use prepare and close functions to maintain shared state. | ||
| // Requires Impala 1.3 or higher. | ||
| // Usage: > create function constantarg(int) returns int | ||
| // location '/user/cloudera/libudfsample.so' symbol='ReturnConstantArg' | ||
| // prepare_fn='ReturnConstantArgPrepare' close_fn='ReturnConstantArgClose'; | ||
| // > select constantarg(1 + 1); | ||
| // > select constantarg(c1) from t1 limit 1; | ||
| IntVal ReturnConstantArg(FunctionContext* context, const IntVal& val); | ||
| void ReturnConstantArgPrepare( | ||
| FunctionContext* context, FunctionContext::FunctionStateScope scope); | ||
| void ReturnConstantArgClose( | ||
| FunctionContext* context, FunctionContext::FunctionStateScope scope); | ||
|
|
||
| #endif |
| @@ -0,0 +1,146 @@ | ||
| // Copyright 2012 Cloudera Inc. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #include <assert.h> | ||
| #include <math.h> | ||
| #include <string.h> | ||
| #include <algorithm> | ||
| #include <sstream> | ||
| #include <iostream> | ||
| #include "lib/udf.h" | ||
|
|
||
| #include "uda-sample.h" | ||
|
|
||
| using namespace std; | ||
| using namespace impala_udf; | ||
|
|
||
| // An implementation of a simple single pass variance algorithm. A standard UDA must | ||
| // be single pass (i.e. does not scan the table more than once), so the most canonical | ||
| // two pass approach is not practical. | ||
| // This algorithm suffers from numerical precision issues if the input values are | ||
| // large due to floating point rounding. | ||
| struct VarianceState { | ||
| // Sum of all input values. | ||
| double sum; | ||
| // Sum of the square of all input values. | ||
| double sum_squared; | ||
| // The number of input values. | ||
| int64_t count; | ||
| }; | ||
|
|
||
| void VarianceInit(FunctionContext* ctx, StringVal* dst) { | ||
| dst->is_null = false; | ||
| dst->len = sizeof(VarianceState); | ||
| dst->ptr = ctx->Allocate(dst->len); | ||
| memset(dst->ptr, 0, dst->len); | ||
| } | ||
|
|
||
| void VarianceUpdate(FunctionContext* ctx, const DoubleVal& src, StringVal* dst) { | ||
| if (src.is_null) return; | ||
| VarianceState* state = reinterpret_cast<VarianceState*>(dst->ptr); | ||
| state->sum += src.val; | ||
| state->sum_squared += src.val * src.val; | ||
| ++state->count; | ||
| } | ||
|
|
||
| void VarianceMerge(FunctionContext* ctx, const StringVal& src, StringVal* dst) { | ||
| VarianceState* src_state = reinterpret_cast<VarianceState*>(src.ptr); | ||
| VarianceState* dst_state = reinterpret_cast<VarianceState*>(dst->ptr); | ||
| dst_state->sum += src_state->sum; | ||
| dst_state->sum_squared += src_state->sum_squared; | ||
| dst_state->count += src_state->count; | ||
| } | ||
|
|
||
| // A serialize function is necessary to free the intermediate state allocation. | ||
| const StringVal VarianceSerialize(FunctionContext* ctx, const StringVal& src) { | ||
| StringVal result(ctx, src.len); | ||
| memcpy(result.ptr, src.ptr, src.len); | ||
| ctx->Free(src.ptr); | ||
| return result; | ||
| } | ||
|
|
||
| StringVal VarianceFinalize(FunctionContext* ctx, const StringVal& src) { | ||
| VarianceState state = *reinterpret_cast<VarianceState*>(src.ptr); | ||
| ctx->Free(src.ptr); | ||
| if (state.count == 0 || state.count == 1) return StringVal::null(); | ||
| double variance = | ||
| (state.sum_squared - state.sum * state.sum / state.count) / (state.count - 1); | ||
| return ToStringVal(ctx, variance); | ||
| } | ||
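
The precision caveat in the comment above is easy to reproduce: with a large mean, `sum_squared - sum*sum/count` subtracts two huge, nearly equal numbers and the true variance drowns in rounding error. An illustrative check against numpy's mean-centered computation:

```python
import numpy as np

data = np.array([1e15 + i for i in range(1000)])
n = len(data)
naive = (np.sum(data ** 2) - np.sum(data) ** 2 / n) / (n - 1)
print(naive)                 # garbage: catastrophic cancellation
print(np.var(data, ddof=1))  # mean-centered reference, ~83416.67
```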
|
|
||
| struct KnuthVarianceState { | ||
| int64_t count; | ||
| double mean; | ||
| double m2; | ||
| }; | ||
|
|
||
| void KnuthVarianceInit(FunctionContext* ctx, StringVal* dst) { | ||
| dst->is_null = false; | ||
| dst->len = sizeof(KnuthVarianceState); | ||
| dst->ptr = ctx->Allocate(dst->len); | ||
| memset(dst->ptr, 0, dst->len); | ||
| } | ||
|
|
||
| void KnuthVarianceUpdate(FunctionContext* ctx, const DoubleVal& src, StringVal* dst) { | ||
| if (src.is_null) return; | ||
| KnuthVarianceState* state = reinterpret_cast<KnuthVarianceState*>(dst->ptr); | ||
| double temp = 1 + state->count; | ||
| double delta = src.val - state->mean; | ||
| double r = delta / temp; | ||
| state->mean += r; | ||
| state->m2 += state->count * delta * r; | ||
| state->count = temp; | ||
| } | ||
|
|
||
| void KnuthVarianceMerge(FunctionContext* ctx, const StringVal& src, StringVal* dst) { | ||
| KnuthVarianceState* src_state = reinterpret_cast<KnuthVarianceState*>(src.ptr); | ||
| KnuthVarianceState* dst_state = reinterpret_cast<KnuthVarianceState*>(dst->ptr); | ||
| if (src_state->count == 0) return; | ||
| double delta = dst_state->mean - src_state->mean; | ||
| double sum_count = dst_state->count + src_state->count; | ||
| dst_state->mean = src_state->mean + delta * (dst_state->count / sum_count); | ||
| dst_state->m2 = (src_state->m2) + dst_state->m2 + | ||
| (delta * delta) * (src_state->count * dst_state->count / sum_count); | ||
| dst_state->count = sum_count; | ||
| } | ||
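
A quick way to convince yourself the merge formula is right: run the same online update over two halves of a dataset, merge the two states, and compare `m2 / (count - 1)` with a direct sample-variance computation. A sketch mirroring KnuthVarianceUpdate/KnuthVarianceMerge:

```python
def update(state, x):                 # mirrors KnuthVarianceUpdate
    count, mean, m2 = state
    count += 1
    delta = x - mean
    r = delta / count
    return (count, mean + r, m2 + (count - 1) * delta * r)

def merge(a, b):                      # mirrors KnuthVarianceMerge (dst=a, src=b)
    (na, ma, m2a), (nb, mb, m2b) = a, b
    if nb == 0:
        return a
    n = na + nb
    delta = ma - mb
    mean = mb + delta * (float(na) / n)
    m2 = m2a + m2b + delta * delta * (float(na) * nb / n)
    return (n, mean, m2)

data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
left = right = (0, 0.0, 0.0)
for x in data[:4]:
    left = update(left, x)
for x in data[4:]:
    right = update(right, x)
n, mean, m2 = merge(left, right)
print(m2 / (n - 1))  # 4.571... == sample variance computed directly
```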
|
|
||
| // Same as VarianceSerialize(). Create a wrapper function so automatic symbol resolution | ||
| // still works. | ||
| const StringVal KnuthVarianceSerialize(FunctionContext* ctx, const StringVal& state_sv) { | ||
| return VarianceSerialize(ctx, state_sv); | ||
| } | ||
|
|
||
| // TODO: this can be used as the actual variance finalize function once the return type | ||
| // doesn't need to match the intermediate type in Impala 2.0. | ||
| DoubleVal KnuthVarianceFinalize(const StringVal& state_sv) { | ||
| KnuthVarianceState* state = reinterpret_cast<KnuthVarianceState*>(state_sv.ptr); | ||
| if (state->count == 0 || state->count == 1) return DoubleVal::null(); | ||
| double variance_n = state->m2 / state->count; | ||
| double variance = variance_n * state->count / (state->count - 1); | ||
| return DoubleVal(variance); | ||
| } | ||
|
|
||
| StringVal KnuthVarianceFinalize(FunctionContext* ctx, const StringVal& src) { | ||
| StringVal result = ToStringVal(ctx, KnuthVarianceFinalize(src)); | ||
| ctx->Free(src.ptr); | ||
| return result; | ||
| } | ||
|
|
||
| StringVal StdDevFinalize(FunctionContext* ctx, const StringVal& src) { | ||
| DoubleVal variance = KnuthVarianceFinalize(src); | ||
| ctx->Free(src.ptr); | ||
| if (variance.is_null) return StringVal::null(); | ||
| return ToStringVal(ctx, sqrt(variance.val)); | ||
| } | ||
|
|