Merge pull request apache#115 from airbnb/csv_improvements

Cherrypick csv upload changes (apache#5268)
kristw · Sep 19, 2018 · fc07ddb · fc07ddb
2 parents 901ea8a + 866e644
commit fc07ddb
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 3 deletions.
diff --git a/superset/config.py b/superset/config.py
@@ -315,6 +315,10 @@ class CeleryConfig(object):
 # contain all the external tables
 CSV_TO_HIVE_UPLOAD_DIRECTORY = 'EXTERNAL_HIVE_TABLES/'
 
+# The Hive namespace (schema) in which tables created from uploaded
+# CSVs are stored. If None, the uploader may choose the schema.
+UPLOADED_CSV_HIVE_NAMESPACE = None
+
 # A dictionary of items that gets merged into the Jinja context for
 # SQL Lab. The existing context gets updated with this dictionary,
 # meaning values for existing keys get overwritten by the content of this

diff --git a/superset/db_engine_specs.py b/superset/db_engine_specs.py
@@ -910,9 +910,26 @@ def get_column_names(filepath):
                 return next(unicodecsv.reader(f, encoding='utf-8-sig'))
 
         table_name = form.name.data
-        filename = form.csv_file.data.filename
+        schema_name = form.schema.data
+
+        if config.get('UPLOADED_CSV_HIVE_NAMESPACE'):
+            if '.' in table_name or schema_name:
+                raise Exception(
+                    "You can't specify a namespace. "
+                    'All tables will be uploaded to the `{}` namespace'.format(
+                        config.get('HIVE_NAMESPACE')))
+            table_name = '{}.{}'.format(
+                config.get('UPLOADED_CSV_HIVE_NAMESPACE'), table_name)
+        else:
+            if '.' in table_name and schema_name:
+                raise Exception(
+                    "You can't specify a namespace both in the name of the table "
+                    'and in the schema field. Please remove one')
+            if schema_name:
+                table_name = '{}.{}'.format(schema_name, table_name)
 
-        bucket_path = app.config['CSV_TO_HIVE_UPLOAD_S3_BUCKET']
+        filename = form.csv_file.data.filename
+        bucket_path = config['CSV_TO_HIVE_UPLOAD_S3_BUCKET']
 
         if not bucket_path:
             logging.info('No upload bucket specified')
@@ -933,7 +950,7 @@ def get_column_names(filepath):
         s3.upload_file(
             upload_path, bucket_path,
             os.path.join(upload_prefix, table_name, filename))
-        sql = """CREATE EXTERNAL TABLE {table_name} ( {schema_definition} )
+        sql = """CREATE TABLE {table_name} ( {schema_definition} )
             ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
             TEXTFILE LOCATION '{location}'
             tblproperties ('skip.header.line.count'='1')""".format(**locals())