# PySpark - Create Data Frame from API

In [None]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build a session against the master at spark://spark-master:7077, or reuse the
# session that already exists in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Read data from API")
    .getOrCreate()
)

# Bare expression so the notebook renders the session as this cell's output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/17 15:41:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Create Python function to read data from API
import requests, json


def read_api(url: str, timeout: float = 30.0) -> str:
    """Fetch a JSON document from ``url`` and return it wrapped as a JSON string.

    The decoded payload is nested under a single ``"_data"`` key so the result
    is always a JSON object, even when the API answers with a top-level array.

    Args:
        url: Endpoint to issue the GET request against.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A JSON string of the form ``{"_data": <decoded response body>}``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the response body is not valid JSON.
    """
    # Use the caller-supplied URL; the previous version read the notebook-global
    # `api_url` here and silently ignored its argument.
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of trying to parse an error page as JSON.
    response.raise_for_status()
    # Normalize payload to handle array situations.
    return json.dumps({"_data": response.json()})

In [3]:
api_url = r"https://api.coindesk.com/v1/bpi/currentprice.json"
# api_url = "https://api.wazirx.com/sapi/v1/tickers/24hr"

# Read data into Data Frame
# Create payload rdd
# read_api() already returns a single JSON document as a string, which is the
# element type spark.read.json expects from an RDD. Decoding it back into a
# Python dict first would make Spark parse the dict's repr (single quotes,
# None/True/False) rather than real JSON.
payload = read_api(api_url)
payload_rdd = spark.sparkContext.parallelize([payload])

# Read from JSON
df = spark.read.json(payload_rdd)
df.select("_data").printSchema()

                                                                                

root
 |-- _data: struct (nullable = true)
 |    |-- bpi: struct (nullable = true)
 |    |    |-- EUR: struct (nullable = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- rate: string (nullable = true)
 |    |    |    |-- rate_float: double (nullable = true)
 |    |    |    |-- symbol: string (nullable = true)
 |    |    |-- GBP: struct (nullable = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- rate: string (nullable = true)
 |    |    |    |-- rate_float: double (nullable = true)
 |    |    |    |-- symbol: string (nullable = true)
 |    |    |-- USD: struct (nullable = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- rate: string (nullable = true)
 |    |    |    |-- rate_float: double (nullable = true)
 |    |    |    |-- symbol

In [4]:
# Star-expand the root `_data` struct so each of its fields becomes a column
(
    df
    .select("_data.*")
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------+
|bpi                                                                                                                                                                   |chartName|disclaimer                                                                                                                                                 |time                                                                             |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
# Star-expand the nested `bpi` struct: one column per field under `_data.bpi`
(
    df
    .select("_data.bpi.*")
    .show(truncate=False)
)

+----------------------------------------+--------------------------------------------------------------+----------------------------------------------------------+
|EUR                                     |GBP                                                           |USD                                                       |
+----------------------------------------+--------------------------------------------------------------+----------------------------------------------------------+
|{EUR, Euro, 62,231.83, 62231.83, &euro;}|{GBP, British Pound Sterling, 51,848.481, 51848.4805, &pound;}|{USD, United States Dollar, 67,390.652, 67390.6518, &#36;}|
+----------------------------------------+--------------------------------------------------------------+----------------------------------------------------------+



In [5]:
# Drill down one struct level per select until the USD fields are columns
(
    df
    .select("_data.*")  # fields of the root struct
    .select("bpi.*")    # fields of the bpi struct
    .select("USD.*")    # fields of the USD struct
    .show(truncate=False)
)

+----+--------------------+----------+----------+------+
|code|description         |rate      |rate_float|symbol|
+----+--------------------+----------+----------+------+
|USD |United States Dollar|67,390.652|67390.6518|&#36; |
+----+--------------------+----------+----------+------+



In [8]:
# Stop the Spark session created at the top of the notebook.
spark.stop()