Create and Query wdi_gs

In [1]:
-- Remove any earlier wdi_gs definition so the CREATE statement in the next cell can be re-run.
DROP TABLE IF EXISTS wdi_gs;

In [2]:
-- External table over the CSV files stored in the wdi_2016 bucket directory.
-- Each row is one newline-terminated line split on plain commas, and the
-- first line of every file is skipped as a header.
CREATE EXTERNAL TABLE wdi_gs (
    year           INTEGER,
    countryName    STRING,
    countryCode    STRING,
    indicatorName  STRING,
    indicatorCode  STRING,
    indicatorValue FLOAT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
LOCATION 'gs://jarvis_data_eng_david/datasets/wdi_2016'
TBLPROPERTIES ("skip.header.line.count" = "1");

In [3]:
-- Print the column list together with the detailed table metadata of wdi_gs.
DESCRIBE FORMATTED wdi_gs;

In [4]:
-- Number of wdi_gs rows that have a non-NULL countryName.
SELECT COUNT(countryName)
FROM wdi_gs;

Create and Query wdi_csv_text

In [6]:
-- Comma-delimited text table stored on HDFS with the same six columns as wdi_gs.
-- It is recreated from scratch every time this cell runs.
DROP TABLE IF EXISTS wdi_csv_text;

CREATE EXTERNAL TABLE wdi_csv_text (
    year           INTEGER,
    countryName    STRING,
    countryCode    STRING,
    indicatorName  STRING,
    indicatorCode  STRING,
    indicatorValue FLOAT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
LOCATION 'hdfs:///user/david/hive/wdi/wdi_csv_text';

In [7]:
-- Replace the contents of wdi_csv_text with every row of wdi_gs.
-- Columns are listed in the declared order of wdi_gs, which is the order
-- that a star select returns.
INSERT OVERWRITE TABLE wdi_csv_text
SELECT
    year,
    countryName,
    countryCode,
    indicatorName,
    indicatorCode,
    indicatorValue
FROM wdi_gs;

In [8]:
-- Number of wdi_csv_text rows that have a non-NULL countryName.
SELECT COUNT(countryName)
FROM wdi_csv_text;

In [9]:
-- First 20 distinct indicator codes of wdi_csv_text in ascending order.
SELECT DISTINCT indicatorCode
FROM wdi_csv_text
ORDER BY indicatorCode
LIMIT 20;

Create and Query wdi_gs_debug

In [11]:
-- Debug table that exposes each raw input line as a single STRING column.
-- No field delimiter is declared on purpose: with a comma delimiter the one
-- column would only receive the text before the first comma, and the rest of
-- the line (including quoted indicator names) would be dropped.
-- NOTE(review): no statement visible in this notebook writes files to this
-- HDFS directory - confirm that the raw CSV files are placed here.
DROP TABLE IF EXISTS wdi_gs_debug;

CREATE EXTERNAL TABLE wdi_gs_debug (
    line STRING
)
ROW FORMAT DELIMITED
    LINES TERMINATED BY '\n'
LOCATION 'hdfs:///user/david/hive/wdi/wdi_gs_debug';

In [12]:
-- Find raw lines that contain the text (% of urban population) directly
-- followed by a double quote character. The backslash before the percent sign
-- makes it a literal character instead of a LIKE wildcard.
SELECT *
FROM wdi_gs_debug
WHERE line LIKE "%\(\% of urban population\)\"%";

Create and Query wdi_opencsv_gs and wdi_opencsv_text

In [14]:
-- External table over the same bucket directory as wdi_gs, parsed with
-- OpenCSVSerde so that commas inside quoted fields do not split a column.
-- The header line is skipped here as well, matching wdi_gs. Without it the
-- header row would be read as data and copied into every table loaded from
-- this one.
-- OpenCSVSerde reads every column as STRING regardless of the declared type.
DROP TABLE IF EXISTS wdi_opencsv_gs;

CREATE EXTERNAL TABLE wdi_opencsv_gs (
    year           INTEGER,
    countryName    STRING,
    countryCode    STRING,
    indicatorName  STRING,
    indicatorCode  STRING,
    indicatorValue FLOAT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE
LOCATION 'gs://jarvis_data_eng_david/datasets/wdi_2016'
TBLPROPERTIES ("skip.header.line.count" = "1");

In [15]:
-- HDFS text table read and written through OpenCSVSerde.
-- It declares the same six columns as the other WDI tables and is recreated
-- from scratch every time this cell runs.
DROP TABLE IF EXISTS wdi_opencsv_text;

CREATE EXTERNAL TABLE wdi_opencsv_text (
    year           INTEGER,
    countryName    STRING,
    countryCode    STRING,
    indicatorName  STRING,
    indicatorCode  STRING,
    indicatorValue FLOAT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE
LOCATION 'hdfs:///user/david/hive/wdi/wdi_opencsv_text';

In [16]:
-- Replace the contents of wdi_opencsv_text with every row of wdi_opencsv_gs.
-- Columns are listed in the declared order of the source table, which is the
-- order that a star select returns.
INSERT OVERWRITE TABLE wdi_opencsv_text
SELECT
    year,
    countryName,
    countryCode,
    indicatorName,
    indicatorCode,
    indicatorValue
FROM wdi_opencsv_gs;

In [17]:
-- First 20 distinct indicator codes of wdi_opencsv_text in ascending order.
-- The ORDER BY makes the 20 returned rows deterministic, so the output can be
-- compared side by side with the same query on wdi_csv_text.
SELECT DISTINCT indicatorCode
FROM wdi_opencsv_text
ORDER BY indicatorCode
LIMIT 20;

In [18]:
-- Number of wdi_opencsv_text rows that have a non-NULL countryName.
SELECT COUNT(countryName)
FROM wdi_opencsv_text;

In [19]:
-- Same count on wdi_csv_text, presumably for comparison with the
-- wdi_opencsv_text count in the previous cell.
SELECT COUNT(countryName)
FROM wdi_csv_text;

In [20]:
-- Print the column list together with the detailed table metadata of wdi_opencsv_text.
DESCRIBE FORMATTED wdi_opencsv_text;

In [21]:
-- Print the column list together with the detailed table metadata of wdi_csv_text.
DESCRIBE FORMATTED wdi_csv_text;

OpenCSVSerde limitation

In [23]:
-- View over wdi_opencsv_text that restores numeric types for year and
-- indicatorValue, because OpenCSVSerde exposes every column as STRING.
-- Both CAST expressions carry an explicit alias. Without one Hive would name
-- the view columns with generated identifiers such as _c0 instead of the
-- original column names.
DROP VIEW IF EXISTS wdi_opencsv_text_view;

CREATE VIEW wdi_opencsv_text_view AS
SELECT
    CAST(year AS INTEGER)         AS year,
    countryName,
    countryCode,
    indicatorName,
    indicatorCode,
    CAST(indicatorValue AS FLOAT) AS indicatorValue
FROM wdi_opencsv_text;

In [24]:
-- Print the column list together with the detailed metadata of the view wdi_opencsv_text_view.
DESCRIBE FORMATTED wdi_opencsv_text_view;

2015 Canada GDP Growth HQL

In [26]:

-- List the distinct name and code combinations of the GDP growth indicator
-- for Canada in wdi_opencsv_text.
-- The indicator name is compared with equality: inside a LIKE pattern the
-- percent sign of the name would act as a wildcard rather than a literal.
-- DISTINCT replaces a GROUP BY that was only used to remove duplicate rows.
SELECT DISTINCT
    countryName,
    indicatorName,
    indicatorCode
FROM wdi_opencsv_text
WHERE countryName = "Canada"
  AND indicatorName = "GDP growth (annual %)";

In [27]:
-- GDP growth value for Canada in 2015, read from wdi_opencsv_text.
-- The indicator name is compared with equality: inside a LIKE pattern the
-- percent sign of the name would act as a wildcard rather than a literal.
-- year is compared with a string literal because OpenCSVSerde exposes every
-- column of this table as STRING.
SELECT
    indicatorValue,
    countryName,
    year
FROM wdi_opencsv_text
WHERE countryName = "Canada"
  AND indicatorName = "GDP growth (annual %)"
  AND year = "2015";

Hive Partitions

In [29]:
-- HDFS table partitioned by year and read and written through OpenCSVSerde.
-- year is the partition column, so it is declared in PARTITIONED BY and not
-- in the regular column list. All columns are declared as STRING.
DROP TABLE IF EXISTS wdi_opencsv_text_partitions;

CREATE EXTERNAL TABLE wdi_opencsv_text_partitions (
    countryName    STRING,
    countryCode    STRING,
    indicatorName  STRING,
    indicatorCode  STRING,
    indicatorValue STRING
)
PARTITIONED BY (year STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
STORED AS TEXTFILE
LOCATION 'hdfs:///user/david/hive/wdi/wdi_opencsv_text_partitions';

In [30]:
-- Nonstrict mode allows the insert below to use only dynamic partition
-- columns, with no static partition value.
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Turn off automatic column statistics gathering for this session.
-- NOTE(review): presumably a workaround for this insert - confirm the reason.
SET hive.stats.column.autogather=false;
-- Rewrite the partitioned table from wdi_opencsv_text. The partition column
-- year is selected last, as dynamic partition inserts require.
INSERT OVERWRITE TABLE wdi_opencsv_text_partitions
PARTITION (year)
SELECT
    countryName,
    countryCode,
    indicatorName,
    indicatorCode,
    indicatorValue,
    year
FROM wdi_opencsv_text;

In [31]:
-- GDP growth value for Canada in 2015, read from the year-partitioned table.
-- The indicator name is compared with equality: inside a LIKE pattern the
-- percent sign of the name would act as a wildcard rather than a literal.
-- year is the STRING partition column of this table, so the filter on it
-- uses a string literal.
SELECT
    indicatorValue,
    countryName,
    year
FROM wdi_opencsv_text_partitions
WHERE countryName = "Canada"
  AND indicatorName = "GDP growth (annual %)"
  AND year = "2015";

Columnar File Optimization

In [33]:
-- Parquet-backed table on HDFS with typed year and indicatorValue columns.
-- It is recreated from scratch every time this cell runs.
DROP TABLE IF EXISTS wdi_csv_parquet;

CREATE EXTERNAL TABLE wdi_csv_parquet (
    year           INTEGER,
    countryName    STRING,
    countryCode    STRING,
    indicatorName  STRING,
    indicatorCode  STRING,
    indicatorValue FLOAT
)
STORED AS PARQUET
LOCATION 'hdfs:///user/david/hive/wdi/wdi_csv_parquet';

In [34]:
-- Replace the contents of wdi_csv_parquet with every row of wdi_opencsv_gs.
-- Columns are listed in the declared order of the source table, which is the
-- order that a star select returns.
INSERT OVERWRITE TABLE wdi_csv_parquet
SELECT
    year,
    countryName,
    countryCode,
    indicatorName,
    indicatorCode,
    indicatorValue
FROM wdi_opencsv_gs;

In [35]:
%sh
# Print the total size, in human-readable units, of the OpenCSV text table
# directory and of the Parquet table directory.
# These are shell commands, so the cell needs the shell interpreter directive
# on its first line. The other cells in this notebook run as SQL.
hdfs dfs -du -s -h /user/david/hive/wdi/wdi_opencsv_text
hdfs dfs -du -s -h /user/david/hive/wdi/wdi_csv_parquet

In [36]:
-- Number of wdi_csv_parquet rows that have a non-NULL countryName.
SELECT COUNT(countryName)
FROM wdi_csv_parquet;

In [37]:
-- Same count on wdi_opencsv_text, presumably for comparison with the
-- wdi_csv_parquet count in the previous cell.
SELECT COUNT(countryName)
FROM wdi_opencsv_text;

In [38]:
-- GDP growth value for Canada in 2015, read from the Parquet table.
-- The indicator name is compared with equality: inside a LIKE pattern the
-- percent sign of the name would act as a wildcard rather than a literal.
-- year is declared INTEGER in wdi_csv_parquet, so it is compared with an
-- integer literal and no implicit type conversion is needed.
SELECT
    indicatorValue,
    countryName,
    year
FROM wdi_csv_parquet
WHERE countryName = "Canada"
  AND indicatorName = "GDP growth (annual %)"
  AND year = 2015;

In [39]:
-- GDP growth value for Canada in 2015, read from wdi_opencsv_text.
-- The indicator name is compared with equality: inside a LIKE pattern the
-- percent sign of the name would act as a wildcard rather than a literal.
-- year is compared with a string literal because OpenCSVSerde exposes every
-- column of this table as STRING.
SELECT
    indicatorValue,
    countryName,
    year
FROM wdi_opencsv_text
WHERE countryName = "Canada"
  AND indicatorName = "GDP growth (annual %)"
  AND year = "2015";

Highest GDP Growth

In [41]:
-- For every country, return the year and value of its highest non-zero
-- reading of indicator NY.GDP.MKTP.KD.ZG (presumably GDP growth, per the
-- section heading).
-- The subquery computes the per-country maximum. The outer side of the join
-- is restricted to the same indicator code, so a row of any other indicator
-- that happens to share the country and the value is not returned.
-- Ties on the maximum value produce one row per tied year.
SELECT
    parquet1.indicatorValue,
    parquet1.year,
    parquet1.countryName
FROM wdi_csv_parquet parquet1
INNER JOIN (
    SELECT
        countryName,
        MAX(indicatorValue) AS maxValue
    FROM wdi_csv_parquet
    WHERE indicatorCode = "NY.GDP.MKTP.KD.ZG"
      AND indicatorValue <> 0
    GROUP BY countryName
) parquet2
    ON parquet1.countryName = parquet2.countryName
   AND parquet1.indicatorValue = parquet2.maxValue
WHERE parquet1.indicatorCode = "NY.GDP.MKTP.KD.ZG";

In [42]:
%spark.sql
-- Spark SQL run of the highest-growth query.
-- For every country, return the year and value of its highest non-zero
-- reading of indicator NY.GDP.MKTP.KD.ZG (presumably GDP growth, per the
-- section heading).
-- The subquery computes the per-country maximum. The outer side of the join
-- is restricted to the same indicator code, so a row of any other indicator
-- that happens to share the country and the value is not returned.
-- Ties on the maximum value produce one row per tied year.
SELECT
    parquet1.indicatorValue,
    parquet1.year,
    parquet1.countryName
FROM wdi_csv_parquet parquet1
INNER JOIN (
    SELECT
        countryName,
        MAX(indicatorValue) AS maxValue
    FROM wdi_csv_parquet
    WHERE indicatorCode = "NY.GDP.MKTP.KD.ZG"
      AND indicatorValue <> 0
    GROUP BY countryName
) parquet2
    ON parquet1.countryName = parquet2.countryName
   AND parquet1.indicatorValue = parquet2.maxValue
WHERE parquet1.indicatorCode = "NY.GDP.MKTP.KD.ZG";

Sort GDP by country and year

In [44]:
-- All non-zero readings of indicator NY.GDP.MKTP.KD.ZG from the Parquet
-- table, sorted by country and then by year.
SELECT
    countryName,
    year,
    indicatorCode,
    indicatorValue
FROM wdi_csv_parquet
WHERE indicatorCode = "NY.GDP.MKTP.KD.ZG"
  AND indicatorValue <> 0
ORDER BY
    countryName,
    year;