In [22]:
import json, time
import pyspark
import os

In [23]:
from pyspark.sql.types import DataType, BooleanType, NullType, IntegerType, StringType, MapType

In [24]:
from pyspark.sql.functions import udf, explode

### Getting file names

In [25]:
DIR = 'archive'
dir_list = []
dir_list += [os.path.join(DIR,file) for file in os.listdir(DIR) if os.path.isdir(os.path.join(DIR, file))]
dir_list.sort()

In [26]:
# Get latest quarter
quarter_folder = dir_list[-2]

In [27]:
json_files = []
json_files += [os.path.join(quarter_folder, file) for file in os.listdir(quarter_folder)]

In [28]:
json_file = json_files[0]
json_file

'archive/2022.QTR2/0001493152-22-013349.json'

### Initialize PySpark session

In [29]:
spark = pyspark.sql.SparkSession.builder \
    .appName("Decode_json_files") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [30]:
%time
df = spark.read.option("multiline", "true").json(json_file)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs


In [18]:
df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- bs: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- concept: string (nullable = true)
 |    |    |    |-- label: string (nullable = true)
 |    |    |    |-- unit: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |    |-- cf: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- concept: string (nullable = true)
 |    |    |    |-- label: string (nullable = true)
 |    |    |    |-- unit: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |    |-- ic: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- concept: string (nullable = true)
 |    |    |    |-- label: string (nullable = true)
 |    |    |    |-- unit: string (nullable = true)
 |    |    |    |-- value: string (nullable = true)
 |-- endDate: string (nullable = true)
 |-- quarter: string

### Balance sheet spark dataframe

In [19]:
df_bs = df.withColumn("bs", explode('data.bs')).select('bs')

In [20]:
df_bs.printSchema()

root
 |-- bs: struct (nullable = true)
 |    |-- concept: string (nullable = true)
 |    |-- label: string (nullable = true)
 |    |-- unit: string (nullable = true)
 |    |-- value: string (nullable = true)



In [21]:
df_bs.select('bs.concept', 'bs.label', 'bs.unit', 'bs.value').show()

+--------------------+--------------------+----+----------+
|             concept|               label|unit|     value|
+--------------------+--------------------+----+----------+
|       AssetsCurrent|Total current assets| usd|  21649936|
|    CommonStockValue|                    | usd|    116395|
| PreferredStockValue|                    | usd|   2656713|
|LiabilitiesAndSto...|Total liabilities...| usd|  22203742|
|OperatingLeaseLia...|                    | usd|    192535|
|PrepaidExpenseAnd...|                    | usd|    277473|
|              Assets|        Total assets| usd|  22203742|
| NotesPayableCurrent|                    | usd|    122175|
| NotesPayableCurrent|                    | usd|    122175|
|  StockholdersEquity|Total shareholder...| usd|  20541052|
|  StockholdersEquity|Total shareholder...| usd|  20541052|
|AdditionalPaidInC...|                    | usd| 195077466|
|CashAndCashEquiva...|                    | usd|  21372463|
|OtherReceivablesN...|                  