# Spark PySpark

## Imports

In [11]:
from pyspark.sql import SparkSession 
from pyspark.sql import types as data_types

In [5]:
spark_session = SparkSession.builder.appName("Essentials").getOrCreate()

In [6]:
df = spark_session.read.json('books.json')

In [8]:
df.show(5)

+---+--------------------+--------------------+----------+--------------------+---------+--------------------+--------------------+-------+--------------------+--------------------+
|_id|             authors|          categories|      isbn|     longDescription|pageCount|       publishedDate|    shortDescription| status|        thumbnailUrl|               title|
+---+--------------------+--------------------+----------+--------------------+---------+--------------------+--------------------+-------+--------------------+--------------------+
|  1|[W. Frank Ableson...|[Open Source, Mob...|1933988673|Android is an ope...|      416|[2009-04-01T00:00...|Unlocking Android...|PUBLISH|https://s3.amazon...|   Unlocking Android|
|  2|[W. Frank Ableson...|              [Java]|1935182722|When it comes to ...|      592|[2011-01-14T00:00...|Android in Action...|PUBLISH|https://s3.amazon...|Android in Action...|
|  3|       [Gojko Adzic]|[Software Enginee...|1617290084|                null|        0|[

In [9]:
df.describe().show()

+-------+--------------------+--------------------+--------------------+------------------+--------------------+-------+--------------------+-------------------+
|summary|                 _id|                isbn|     longDescription|         pageCount|    shortDescription| status|        thumbnailUrl|              title|
+-------+--------------------+--------------------+--------------------+------------------+--------------------+-------+--------------------+-------------------+
|  count|                 431|                 428|                 263|               431|                 160|    431|                 411|                431|
|   mean|   334.1503759398496| 1.06059324479992E11|                null|289.25986078886314|                null|   null|                null|               null|
| stddev|   245.5287238077908|1.006035239595016E12|                null| 260.8733493639254|                null|   null|                null|               null|
|    min|                   

In [10]:
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- isbn: string (nullable = true)
 |-- longDescription: string (nullable = true)
 |-- pageCount: long (nullable = true)
 |-- publishedDate: struct (nullable = true)
 |    |-- $date: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thumbnailUrl: string (nullable = true)
 |-- title: string (nullable = true)



In [22]:
my_data_schema = [
    data_types.StructField("_id", data_types.StringType(), True),
    data_types.StructField("authors", data_types.ArrayType(
        data_types.StructType(
            [
               data_types.StructField("author", data_types.StringType(), True) 
            ]
        )
        
    )),
    data_types.StructField("categories", data_types.ArrayType(
        data_types.StructType(
            [
               data_types.StructField("category", data_types.StringType(), True) 
            ]
        )
    )),
    data_types.StructField("isbn", data_types.StringType(), True),
    data_types.StructField("longDescription", data_types.StringType(), True),
    data_types.StructField("pageCount", data_types.IntegerType(), True),
    data_types.StructField("publishedDate", data_types.DateType(), True),
    data_types.StructField("shortDescription", data_types.StringType(), True),
    data_types.StructField("status", data_types.StringType(), True),
    data_types.StructField("thumbnailUrl", data_types.StringType(), True),
    data_types.StructField("title", data_types.StringType(), True),
]

In [23]:
my_structure = data_types.StructType(fields=my_data_schema)

In [25]:
df = spark_session.read.json('books.json', my_structure)

In [26]:
df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- category: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- longDescription: string (nullable = true)
 |-- pageCount: integer (nullable = true)
 |-- publishedDate: date (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- status: string (nullable = true)
 |-- thumbnailUrl: string (nullable = true)
 |-- title: string (nullable = true)



In [27]:
df.show(2)

+---+-------+----------+----------+--------------------+---------+-------------+--------------------+-------+--------------------+--------------------+
|_id|authors|categories|      isbn|     longDescription|pageCount|publishedDate|    shortDescription| status|        thumbnailUrl|               title|
+---+-------+----------+----------+--------------------+---------+-------------+--------------------+-------+--------------------+--------------------+
|  1|   null|      null|1933988673|Android is an ope...|      416|         null|Unlocking Android...|PUBLISH|https://s3.amazon...|   Unlocking Android|
|  2|   null|      null|1935182722|When it comes to ...|      592|         null|Android in Action...|PUBLISH|https://s3.amazon...|Android in Action...|
+---+-------+----------+----------+--------------------+---------+-------------+--------------------+-------+--------------------+--------------------+
only showing top 2 rows

