In [9]:
import os
import findspark
findspark.init()

In [67]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import json
# spark.sql.repl.eagerEval.enabled: Property used to format output tables better

spark = (
    SparkSession
    .builder
    .appName("cg-pyspark-assignment")
    .master("local")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
  )
spark

In [11]:
src_path = os.getcwd()
print(src_path)

/Users/keerthan/Projects/FMCG_Analysis/src


In [14]:
test_path = src_path.replace('src', 'test')
test_path

'/Users/keerthan/Projects/FMCG_Analysis/test'

In [15]:
test_file = test_path + '/test.json'
test_file

'/Users/keerthan/Projects/FMCG_Analysis/test/test.json'

In [17]:
simple_df = spark.read.json(test_file)
simple_df

Array1,Num1,Text1,Text2
"[7, 8, 9]",5.0,hello,goodbye
"[77, 88, 99]",6.6,this,that
"[555, 444, 222]",-0.03,yes,no


In [18]:
test_file2 = test_path + '/txt_json.txt'
test_file2

'/Users/keerthan/Projects/FMCG_Analysis/test/test2.txt'

In [31]:
txt_jsondf = spark.read.option('inferSchema', 'True').option('header', 'True').option('delimiter', '|').csv(test_file2)
txt_jsondf.show(100, 0)
txt_jsondf.printSchema()

+------+---------+-----+---------------------------+
|Text1 |Text2    |Num1 |JSON1                      |
+------+---------+-----+---------------------------+
|hello | goodbye |5.0  | {"Sub1":"john", "Sub2":3} |
|this  | that    |6.6  | {"Sub1":"betty", "Sub2":4}|
|yes   | no      |-0.03| {"Sub1":"bobby", "Sub2":5}|
+------+---------+-----+---------------------------+

root
 |-- Text1: string (nullable = true)
 |-- Text2: string (nullable = true)
 |-- Num1: double (nullable = true)
 |-- JSON1: string (nullable = true)



In [33]:
# Define the schema of the JSON string.
schema = StructType([
  StructField("Sub1", StringType()), 
  StructField("Sub2", IntegerType())
])

In [37]:
test2DF = txt_jsondf.withColumn('from_json', from_json(col('JSON1'), schema))
test2DF.show(100, 0)
test2DF.printSchema()

+------+---------+-----+---------------------------+----------+
|Text1 |Text2    |Num1 |JSON1                      |from_json |
+------+---------+-----+---------------------------+----------+
|hello | goodbye |5.0  | {"Sub1":"john", "Sub2":3} |{john, 3} |
|this  | that    |6.6  | {"Sub1":"betty", "Sub2":4}|{betty, 4}|
|yes   | no      |-0.03| {"Sub1":"bobby", "Sub2":5}|{bobby, 5}|
+------+---------+-----+---------------------------+----------+

root
 |-- Text1: string (nullable = true)
 |-- Text2: string (nullable = true)
 |-- Num1: double (nullable = true)
 |-- JSON1: string (nullable = true)
 |-- from_json: struct (nullable = true)
 |    |-- Sub1: string (nullable = true)
 |    |-- Sub2: integer (nullable = true)



In [40]:
# Make a separate column from one of the struct fields.
test3DF = test2DF.withColumn("JSON1_Sub1", col("from_json.Sub1")).withColumn("JSON1_Sub2", col("from_json.Sub2"))
test3DF.show(100, 0)
test3DF.printSchema()

+------+---------+-----+---------------------------+----------+----------+----------+
|Text1 |Text2    |Num1 |JSON1                      |from_json |JSON1_Sub1|JSON1_Sub2|
+------+---------+-----+---------------------------+----------+----------+----------+
|hello | goodbye |5.0  | {"Sub1":"john", "Sub2":3} |{john, 3} |john      |3         |
|this  | that    |6.6  | {"Sub1":"betty", "Sub2":4}|{betty, 4}|betty     |4         |
|yes   | no      |-0.03| {"Sub1":"bobby", "Sub2":5}|{bobby, 5}|bobby     |5         |
+------+---------+-----+---------------------------+----------+----------+----------+

root
 |-- Text1: string (nullable = true)
 |-- Text2: string (nullable = true)
 |-- Num1: double (nullable = true)
 |-- JSON1: string (nullable = true)
 |-- from_json: struct (nullable = true)
 |    |-- Sub1: string (nullable = true)
 |    |-- Sub2: integer (nullable = true)
 |-- JSON1_Sub1: string (nullable = true)
 |-- JSON1_Sub2: integer (nullable = true)



In [41]:
test_file3 = test_path + '/array_json.txt'
test_file3

'/Users/keerthan/Projects/FMCG_Analysis/test/array_json.txt'

In [85]:
array_jsondf = spark.read.options(header='True', inferSchema='True', delimiter='|').csv(test_file3)
array_jsondf.show(100, 0)
array_jsondf.printSchema()

+------+---------+-----+---------------------------------------------------------+
|Text1 |Text2    |Num1 |JSON1                                                    |
+------+---------+-----+---------------------------------------------------------+
|hello | goodbye |5.0  | [{"Sub1":"stop", "Sub2":3}, {"Sub1":"go", "Sub2":6}]    |
|this  | that    |6.6  | [{"Sub1":"eggs", "Sub2":4}, {"Sub1":"bacon", "Sub2":8}] |
|yes   | no      |-0.03| [{"Sub1":"apple", "Sub2":5}, {"Sub1":"pear", "Sub2":10}]|
+------+---------+-----+---------------------------------------------------------+

root
 |-- Text1: string (nullable = true)
 |-- Text2: string (nullable = true)
 |-- Num1: double (nullable = true)
 |-- JSON1: string (nullable = true)



In [91]:
array_json_schema = StructType([
    StructField('Sub1', StringType()),
    StructField('Sub2', IntegerType())
])

In [92]:
def parse_json(array_str):
    listofdict = json.loads(array_str)
    for i in listofdict:
        yield (i['Sub1'], i['Sub2'])

parse_json_udf = udf(parse_json, array_json_schema)

In [93]:
array_jsondf2 = array_jsondf.withColumn('parse_json', parse_json_udf(col('JSON1')))
print(type(array_jsondf2))
array_jsondf2.show(10,0)

<class 'pyspark.sql.dataframe.DataFrame'>


24/03/28 00:29:47 ERROR Executor: Exception in task 0.0 in stage 436.0 (TID 436)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 225, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 146, in dump_stream
    for obj in iterator:
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 214, in _batched
 

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 225, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 146, in dump_stream
    for obj in iterator:
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 214, in _batched
    for item in iterator:
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1070, in mapper
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1070, in <genexpr>
    result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 104, in <lambda>
    return lambda *a: toInternal(f(*a))
                      ^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1072, in toInternal
    raise PySparkValueError(
pyspark.errors.exceptions.base.PySparkValueError: [UNEXPECTED_TUPLE_WITH_STRUCT] Unexpected tuple <generator object parse_json at 0x103565d20> with StructType.


In [70]:
x = """{
    "Name": "Jennifer Smith",
    "Contact Number": 7867567898,
    "Email": "jen123@gmail.com",
    "Hobbies":["Reading", "Sketching", "Horse Riding"]
    }"""
y = json.loads(x)
print(y)
print(type(y))

{'Name': 'Jennifer Smith', 'Contact Number': 7867567898, 'Email': 'jen123@gmail.com', 'Hobbies': ['Reading', 'Sketching', 'Horse Riding']}
<class 'dict'>


In [77]:
a =  """[{"Sub1":"stop", "Sub2":3}, {"Sub1":"go", "Sub2":6}]"""
print(a)
print(type(a))
b = json.loads(a)
print(b)
print(type(b))

[{"Sub1":"stop", "Sub2":3}, {"Sub1":"go", "Sub2":6}]
<class 'str'>
[{'Sub1': 'stop', 'Sub2': 3}, {'Sub1': 'go', 'Sub2': 6}]
<class 'list'>


In [80]:
for i in b:
    print(type(i))
    #row = json.loads(i)
    #print(row)

<class 'dict'>
<class 'dict'>
