In [1]:
%%writefile external_table.hql

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar;

USE mydb;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=2000;
set hive.exec.max.dynamic.partitions.pernode=1000;

Writing external_table.hql


In [2]:
%%writefile -a external_table.hql

DROP TABLE IF EXISTS mydb.posts_sample_external;

CREATE EXTERNAL TABLE mydb.posts_sample_external (
    `Id` INT,
    `CreationDate` STRING,
    `Year` STRING,
    `Month` STRING
)
ROW FORMAT 
 SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
 WITH SERDEPROPERTIES(
  "input.regex"='.*(?=.*\\bId\=\"(\\d+)\")(?=.*\\bCreationDate\=\"(\\d+\-\\d+\-\\d+T\\d+\:\\d+\:\\d+\.\\d+)\")(?=.*\\bCreationDate\=\"(\\d+)\-\\d+\-\\d+T\\d+\:\\d+\:\\d+\.\\d+\")(?=.*\\bCreationDate\=\"(\\d+\-\\d+)\-\\d+T\\d+\:\\d+\:\\d+\.\\d+\").*$'
 )
STORED AS TEXTFILE
LOCATION '/data/stackexchange1000/posts';

Appending to external_table.hql


In [3]:
! hive -f external_table.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar]
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar]
OK
Time taken: 1.072 seconds
OK
Time taken: 0.14 seconds
OK
Time taken: 0.974 seconds


In [4]:
%%writefile query.hql

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
USE mydb;

SELECT Id, CreationDate, Year, Month
FROM mydb.posts_sample_external
LIMIT 10;

Writing query.hql


In [5]:
! hive -f query.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar]
OK
Time taken: 1.092 seconds
OK
NULL	NULL	NULL	NULL
1394	2008-08-04T16:38:03.667	2008	2008-08
3543	2008-08-06T15:24:00.787	2008	2008-08
4521	2008-08-07T08:22:27.440	2008	2008-08
8689	2008-08-12T11:23:28.733	2008	2008-08
9062	2008-08-12T17:20:41.993	2008	2008-08
14671	2008-08-18T14:18:22.310	2008	2008-08
16307	2008-08-19T14:45:07.997	2008	2008-08
18780	2008-08-20T20:44:27.947	2008	2008-08
18929	2008-08-20T21:49:23.203	2008	2008-08
Time taken: 1.659 seconds, Fetched: 10 row(s)


In [6]:
%%writefile managed_table.hql

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar;

USE mydb;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=2000;
set hive.exec.max.dynamic.partitions.pernode=1000;

Writing managed_table.hql


In [7]:
%%writefile -a managed_table.hql

DROP TABLE IF EXISTS mydb.posts_sample;

CREATE TABLE mydb.posts_sample (
    `Id` INT,
    `CreationDate` STRING
) 
PARTITIONED BY (`Year` STRING,`Month` STRING);

INSERT OVERWRITE TABLE mydb.posts_sample PARTITION(`Year`,`Month`) 
SELECT `Id`,`CreationDate`,`Year`,`Month` FROM mydb.posts_sample_external;

Appending to managed_table.hql


In [8]:
! hive -f managed_table.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar]
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-serde.jar]
OK
Time taken: 1.138 seconds
OK
Time taken: 0.151 seconds
OK
Time taken: 0.963 seconds
Query ID = jovyan_20180306110505_b96ea946-e8e9-47a1-ae6c-4ff2e8ccb9dd
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1520330679961_0004, Tracking URL = http://d80538813e88:8088/proxy/application_1520330679961_0004/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1520330679961_0004
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2018-03-06 11:06:05,553 Stage

2018-03-06 12:10:29,532 Stage-1 map = 61%,  reduce = 0%, Cumulative CPU 3875.09 sec
2018-03-06 12:11:29,718 Stage-1 map = 61%,  reduce = 0%, Cumulative CPU 3935.41 sec
2018-03-06 12:11:47,370 Stage-1 map = 62%,  reduce = 0%, Cumulative CPU 3953.48 sec
2018-03-06 12:12:47,583 Stage-1 map = 62%,  reduce = 0%, Cumulative CPU 4013.7 sec
2018-03-06 12:12:53,816 Stage-1 map = 63%,  reduce = 0%, Cumulative CPU 4019.72 sec
2018-03-06 12:13:54,023 Stage-1 map = 63%,  reduce = 0%, Cumulative CPU 4079.95 sec
2018-03-06 12:14:05,445 Stage-1 map = 64%,  reduce = 0%, Cumulative CPU 4091.98 sec
2018-03-06 12:15:05,738 Stage-1 map = 64%,  reduce = 0%, Cumulative CPU 4140.3 sec
2018-03-06 12:16:06,076 Stage-1 map = 64%,  reduce = 0%, Cumulative CPU 4212.54 sec
2018-03-06 12:16:12,303 Stage-1 map = 65%,  reduce = 0%, Cumulative CPU 4218.56 sec
2018-03-06 12:17:12,488 Stage-1 map = 65%,  reduce = 0%, Cumulative CPU 4273.02 sec
2018-03-06 12:17:23,905 Stage-1 map = 66%,  reduce = 0%, Cumulative CPU 4291.1

	Loading partition {year=2013, month=2013-10}
	Loading partition {year=2015, month=2015-04}
	Loading partition {year=2015, month=2015-07}
	Loading partition {year=2008, month=2008-10}
	Loading partition {year=2015, month=2015-10}
	Loading partition {year=2015, month=2015-01}
	Loading partition {year=2013, month=2013-12}
	Loading partition {year=2010, month=2010-05}
	Loading partition {year=2010, month=2010-08}
	Loading partition {year=2016, month=2016-09}
	Loading partition {year=2009, month=2009-09}
	Loading partition {year=2013, month=2013-03}
	Loading partition {year=__HIVE_DEFAULT_PARTITION__, month=__HIVE_DEFAULT_PARTITION__}
	Loading partition {year=2012, month=2012-06}
	Loading partition {year=2012, month=2012-01}
	Loading partition {year=2011, month=2011-11}
	Loading partition {year=2009, month=2009-10}
	Loading partition {year=2016, month=2016-10}
	Loading partition {year=2014, month=2014-01}
	Loading partition {year=2009, month=2009-04}
	Loading partition {year=2016, month=20

Partition mydb.posts_sample{year=2014, month=2014-03} stats: [numFiles=1, numRows=624, totalSize=20592, rawDataSize=19968]
Partition mydb.posts_sample{year=2014, month=2014-04} stats: [numFiles=1, numRows=593, totalSize=19569, rawDataSize=18976]
Partition mydb.posts_sample{year=2014, month=2014-05} stats: [numFiles=1, numRows=541, totalSize=17853, rawDataSize=17312]
Partition mydb.posts_sample{year=2014, month=2014-06} stats: [numFiles=1, numRows=487, totalSize=16071, rawDataSize=15584]
Partition mydb.posts_sample{year=2014, month=2014-07} stats: [numFiles=1, numRows=536, totalSize=17688, rawDataSize=17152]
Partition mydb.posts_sample{year=2014, month=2014-08} stats: [numFiles=1, numRows=495, totalSize=16335, rawDataSize=15840]
Partition mydb.posts_sample{year=2014, month=2014-09} stats: [numFiles=1, numRows=502, totalSize=16566, rawDataSize=16064]
Partition mydb.posts_sample{year=2014, month=2014-10} stats: [numFiles=1, numRows=527, totalSize=17391, rawDataSize=16864]
Partition mydb.p

In [13]:
%%writefile last_query.hql

ADD JAR /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
USE mydb;

SELECT Year, Month, COUNT(Id)
FROM mydb.posts_sample
GROUP BY Year, Month;

Overwriting last_query.hql


In [14]:
! hive -f last_query.hql


Logging initialized using configuration in jar:file:/usr/local/apache-hive-1.1.0-bin/lib/hive-common-1.1.0.jar!/hive-log4j.properties
Added [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar] to class path
Added resources: [/opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar]
OK
Time taken: 1.12 seconds
Query ID = jovyan_20180306131919_f44de1a6-d5a5-427e-ab35-0e0a25fb649f
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1520330679961_0005, Tracking URL = http://d80538813e88:8088/proxy/application_1520330679961_0005/
Kill Command = /opt/hadoop/bin/hadoop job  -kill job_1520330679961_0005
Hadoop job information for 