In [None]:
import sys
!conda install --yes --prefix {sys.prefix} boto3
!{sys.executable} -m pip install git+git://github.com/moj-analytical-services/etl_manager.git#egg=etl_manager

In [1]:
# Start from a blank slate: delete the metastore directory, the Derby log and
# the example data directories left behind by a previous run (if present).
import os
import shutil

# Directories are removed recursively, but only when they actually exist.
for stale_dir in ('metastore_db', 'data/hive_df', 'data/non_hive_df'):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

# The Derby log is a single file, so it needs os.remove instead of rmtree.
if os.path.isfile('derby.log'):
    os.remove('derby.log')

In [2]:
# Build a local, Hive-enabled Spark session. S3 access keys are taken from the
# default boto3 session when available; they are optional because the tests in
# this notebook only read and write local paths.

import boto3
import os
# from dataengineeringutils.utils import read_json

import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.storagelevel import StorageLevel

# Wildcard import of the Spark SQL type classes (StructType, StringType, ...)
from pyspark.sql.types import *

# Set before the session is created so the extra --packages are picked up
# when PySpark launches.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

session = boto3.Session()
# get_credentials() returns None when no AWS credentials are configured.
credentials = session.get_credentials()

builder = (
    SparkSession
    .builder
    .master("local[*]")
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
)

# Only wire in the S3 keys when credentials were found; without this guard the
# cell dies with AttributeError on machines that have no AWS configuration.
if credentials is not None:
    builder = (
        builder
        .config("spark.hadoop.fs.s3.awsAccessKeyId", credentials.access_key)
        .config("spark.hadoop.fs.s3.awsSecretAccessKey", credentials.secret_key)
    )

spark = builder.enableHiveSupport().getOrCreate()

# Displayed as the cell result: relative data paths below resolve against this.
os.getcwd()

'/home/jovyan/Documents/projects/spark_testing'

In [3]:
# Load the diamonds test data from csv. The first line of each file is treated
# as the header, and FAILFAST makes malformed rows raise instead of being
# silently dropped or nulled.
diamonds = (
    spark.read
    .option("header", True)
    .option("mode", "FAILFAST")
    .csv('data/diamonds/')
)
diamonds.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)



In [4]:
# Keep the schema of `diamonds` (every column is a string, see printSchema above)
# so it can be passed explicitly when the partitioned csv/json copies are read back.
d_schema = diamonds.schema

## CHANGING SCHEMA TESTS
### missing cols
Write the data into two partitions for each format (csv, parquet and json): `p=0` has all columns, `p=1` is missing the `z` column.

In [5]:
# One output root per file format under the lake directory.
base_dir = 'data/lake/'
outpaths = {fmt: f"{base_dir}{fmt}/" for fmt in ('csv', 'parquet', 'json')}

# Partition p=0 gets the full frame, partition p=1 the same rows without `z`.
diamonds_without_z = diamonds.drop('z')
for file_type, out_root in outpaths.items():
    diamonds.write.format(file_type).mode('overwrite').save(out_root + "p=0/")
    diamonds_without_z.write.format(file_type).mode('overwrite').save(out_root + "p=1/")

**Try to read over multiple partitions with csv**

In [6]:
# Read both csv partitions in one go, forcing the full 10-column schema and
# failing fast on any row that does not fit it.
df_col_test_csv = (
    spark.read
    .schema(d_schema)
    .option("mode", "FAILFAST")
    .csv(outpaths['csv'])
)
df_col_test_csv.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)
 |-- p: integer (nullable = true)



In [7]:
# Py4JJavaError is the Python-side wrapper for exceptions raised inside the JVM.
from py4j.protocol import Py4JJavaError

# printSchema() above succeeded, so a read problem only surfaces here, when
# show() actually scans the p=1 files. The recorded run prints the failure
# message -- presumably because those rows lack `z` and FAILFAST rejects rows
# that do not match the supplied schema.
try :
    df_col_test_csv.filter('p=1').show()
except Py4JJavaError :
    print("failed to read csv")

failed to read csv


**Try to read over multiple partitions with json**

In [8]:
# Read both json partitions in one go with the full 10-column schema,
# failing fast on malformed records.
df_col_test_json = (
    spark.read
    .schema(d_schema)
    .option("mode", "FAILFAST")
    .json(outpaths['json'])
)
df_col_test_json.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)
 |-- p: integer (nullable = true)



In [9]:
# Show the rows of the partition written without `z`. In the recorded run the
# read succeeds and `z` comes back as null for every p=1 row.
try :
    df_col_test_json.filter('p=1').show()
except Py4JJavaError :
    print("failed to read json")

+-----+---------+-----+-------+-----+-----+-----+----+----+----+---+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|  p|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+---+
| 0.23|    Ideal|    E|    SI2| 61.5|   55|  326|3.95|3.98|null|  1|
| 0.21|  Premium|    E|    SI1| 59.8|   61|  326|3.89|3.84|null|  1|
| 0.23|     Good|    E|    VS1| 56.9|   65|  327|4.05|4.07|null|  1|
| 0.29|  Premium|    I|    VS2| 62.4|   58|  334| 4.2|4.23|null|  1|
| 0.31|     Good|    J|    SI2| 63.3|   58|  335|4.34|4.35|null|  1|
| 0.24|Very Good|    J|   VVS2| 62.8|   57|  336|3.94|3.96|null|  1|
| 0.24|Very Good|    I|   VVS1| 62.3|   57|  336|3.95|3.98|null|  1|
| 0.26|Very Good|    H|    SI1| 61.9|   55|  337|4.07|4.11|null|  1|
| 0.22|     Fair|    E|    VS2| 65.1|   61|  337|3.87|3.78|null|  1|
| 0.23|Very Good|    H|    VS1| 59.4|   61|  338|   4|4.05|null|  1|
|  0.3|     Good|    J|    SI1|   64|   55|  339|4.25|4.28|null|  1|
| 0.23|    Ideal|    J|    VS1| 62

**Try to read over multiple partitions with parquet**

In [10]:
# Read both parquet partitions in one go. No schema is supplied here: it is
# taken from the parquet files themselves.
df_col_test_parquet = spark.read.format('parquet').load(outpaths['parquet'])
df_col_test_parquet.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)
 |-- p: integer (nullable = true)



In [11]:
# Show the rows of the partition written without `z`. In the recorded run the
# read succeeds and `z` comes back as null for every p=1 row.
try :
    df_col_test_parquet.filter('p=1').show()
except Py4JJavaError :
    print("failed to read parquet")

+-----+---------+-----+-------+-----+-----+-----+----+----+----+---+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|  p|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+---+
| 0.23|    Ideal|    E|    SI2| 61.5|   55|  326|3.95|3.98|null|  1|
| 0.21|  Premium|    E|    SI1| 59.8|   61|  326|3.89|3.84|null|  1|
| 0.23|     Good|    E|    VS1| 56.9|   65|  327|4.05|4.07|null|  1|
| 0.29|  Premium|    I|    VS2| 62.4|   58|  334| 4.2|4.23|null|  1|
| 0.31|     Good|    J|    SI2| 63.3|   58|  335|4.34|4.35|null|  1|
| 0.24|Very Good|    J|   VVS2| 62.8|   57|  336|3.94|3.96|null|  1|
| 0.24|Very Good|    I|   VVS1| 62.3|   57|  336|3.95|3.98|null|  1|
| 0.26|Very Good|    H|    SI1| 61.9|   55|  337|4.07|4.11|null|  1|
| 0.22|     Fair|    E|    VS2| 65.1|   61|  337|3.87|3.78|null|  1|
| 0.23|Very Good|    H|    VS1| 59.4|   61|  338|   4|4.05|null|  1|
|  0.3|     Good|    J|    SI1|   64|   55|  339|4.25|4.28|null|  1|
| 0.23|    Ideal|    J|    VS1| 62

## reordered cols
Write the data into two partitions for each format (csv, parquet and json): `p=0` keeps the current column order, `p=1` has the columns "cut" and "price" swapped.

In [18]:
# One output root per file format under the lake directory.
base_dir = 'data/lake/'
outpaths = {fmt: f"{base_dir}{fmt}/" for fmt in ('csv', 'parquet', 'json')}

# Swap "cut" and "price": per the schema printed earlier they sit at
# positions 1 and 6 of the column list.
d_reordered_cols = diamonds.columns
d_reordered_cols[1], d_reordered_cols[6] = 'price', 'cut'
diamonds_reordered = diamonds.select(*d_reordered_cols)

# Partition p=0 keeps the original column order, partition p=1 gets the swapped one.
for file_type, out_root in outpaths.items():
    diamonds.write.format(file_type).mode('overwrite').save(out_root + "p=0/")
    diamonds_reordered.write.format(file_type).mode('overwrite').save(out_root + "p=1/")

**Try to read over multiple partitions with csv**

In [19]:
# Read both csv partitions in one go with the original column order as the
# schema, failing fast on any row that does not fit it.
df_col_test_csv = (
    spark.read
    .schema(d_schema)
    .option("mode", "FAILFAST")
    .csv(outpaths['csv'])
)
df_col_test_csv.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)
 |-- p: integer (nullable = true)



In [22]:
# Py4JJavaError is the Python-side wrapper for exceptions raised inside the JVM.
from py4j.protocol import Py4JJavaError

# List the distinct values of "cut" across both partitions. In the recorded run
# the read succeeds but numeric values show up in "cut" -- consistent with the
# p=1 files (written with "cut" and "price" swapped) being read by position.
try :
    df_col_test_csv.select("cut").distinct().show()
except Py4JJavaError :
    print("failed to read csv")

+----+
| cut|
+----+
|2904|
|3210|
|3414|
|3606|
|3959|
|4032|
|4821|
|4937|
|5325|
|5645|
|5925|
|6194|
|6240|
|6613|
|6731|
|7273|
|7711|
|7762|
|9009|
|9030|
+----+
only showing top 20 rows



:point_up: Note: the csv read succeeds, but it cannot detect the column mix-up because every column in the schema is a string, so the swapped "price" values from `p=1` are silently read into "cut".

**Try to read over multiple partitions with json**

In [23]:
# Read both json partitions in one go with the original column order as the
# schema, failing fast on malformed records.
df_col_test_json = (
    spark.read
    .schema(d_schema)
    .option("mode", "FAILFAST")
    .json(outpaths['json'])
)
df_col_test_json.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)
 |-- p: integer (nullable = true)



In [24]:
# List the distinct values of "cut" across both partitions. In the recorded run
# only the five genuine cut grades appear, i.e. the swapped column order in
# p=1 did not leak "price" values into "cut".
try :
    df_col_test_json.select("cut").distinct().show()
except Py4JJavaError :
    print("failed to read json")

+---------+
|      cut|
+---------+
|  Premium|
|    Ideal|
|     Good|
|     Fair|
|Very Good|
+---------+



**Try to read over multiple partitions with parquet**

In [25]:
# Read both parquet partitions in one go. No schema is supplied here: it is
# taken from the parquet files themselves.
df_col_test_parquet = spark.read.format('parquet').load(outpaths['parquet'])
df_col_test_parquet.printSchema()

root
 |-- carat: string (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: string (nullable = true)
 |-- table: string (nullable = true)
 |-- price: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- z: string (nullable = true)
 |-- p: integer (nullable = true)



In [27]:
# List the distinct values of "cut" across both partitions. In the recorded run
# only the five genuine cut grades appear, i.e. the swapped column order in
# p=1 did not leak "price" values into "cut".
try :
    df_col_test_parquet.select("cut").distinct().show()
except Py4JJavaError :
    print("failed to read parquet")

+---------+
|      cut|
+---------+
|  Premium|
|    Ideal|
|     Good|
|     Fair|
|Very Good|
+---------+

