In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
# Create the SparkSession used by every cell below, or reuse one that is
# already active in this kernel.
spark = (
    SparkSession.builder
    .appName("PySparkDFPractice")
    .getOrCreate()
)

In [4]:
# In-memory sample rows, one tuple per person, in the column order
# (firstname, middlename, lastname, id, gender, salary).
# Empty strings appear where a name part or id is absent.
# NOTE(review): the -1 salary on the last row looks like a "missing" sentinel - confirm.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

In [5]:
# Confirm the sample rows are held in a plain Python list before they are
# handed to Spark.
type(data)

list

In [6]:
# Explicit schema for the sample rows, built field by field.
# nullable is the 3rd parameter of StructType.add (as it is for StructField)
# and defaults to True, so it is left out here.
schema = (
    StructType()
    .add("firstname", StringType())
    .add("middlename", StringType())
    .add("lastname", StringType())
    .add("id", StringType())
    .add("gender", StringType())
    .add("Salary", IntegerType())
)

In [7]:
# Build a DataFrame from the in-memory rows using the explicit schema, then
# print the schema to confirm the declared column types were applied.
df = spark.createDataFrame(data, schema=schema)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# Display the sample rows; truncate=False keeps full cell values visible.
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|Salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



In [9]:
# Load the fire-incidents CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
# NOTE(review): this input lives under ./testdata while the other paths in
# this notebook use ./data - confirm the location is intended.
file_path = "./testdata/fire-incidents/fire-incidents.csv"
fire_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

In [10]:
# Preview the first 10 incidents, keeping only three identifying columns.
(
    fire_df
    .select("IncidentNumber", "IncidentDate", "City")
    .show(10)
)

+--------------+-------------------+-------------+
|IncidentNumber|       IncidentDate|         City|
+--------------+-------------------+-------------+
|      20104668|2020-09-11 00:00:00|San Francisco|
|      20104708|2020-09-11 00:00:00|San Francisco|
|      20104648|2020-09-10 00:00:00|San Francisco|
|      20104598|2020-09-10 00:00:00|San Francisco|
|      20104575|2020-09-10 00:00:00|San Francisco|
|      20104477|2020-09-10 00:00:00|San Francisco|
|      20104443|2020-09-10 00:00:00|San Francisco|
|      20104605|2020-09-10 00:00:00|San Francisco|
|      20104474|2020-09-10 00:00:00|San Francisco|
|      20104652|2020-09-10 00:00:00|San Francisco|
+--------------+-------------------+-------------+
only showing top 10 rows



In [11]:
# Number of rows loaded from the fire-incidents CSV.
fire_df.count()

538285

In [12]:
# Inspect the column names and the types Spark inferred for the CSV.
fire_df.printSchema()

root
 |-- IncidentNumber: integer (nullable = true)
 |-- ExposureNumber: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- IncidentDate: timestamp (nullable = true)
 |-- CallNumber: integer (nullable = true)
 |-- AlarmDtTm: timestamp (nullable = true)
 |-- ArrivalDtTm: timestamp (nullable = true)
 |-- CloseDtTm: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIPCode: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- SuppressionUnits: integer (nullable = true)
 |-- SuppressionPersonnel: integer (nullable = true)
 |-- EMSUnits: integer (nullable = true)
 |-- EMSPersonnel: integer (nullable = true)
 |-- OtherUnits: integer (nullable = true)
 |-- OtherPersonnel: integer (nullable = true)
 |-- FirstUnitOnScene: string (nullable = true)
 |-- EstimatedPropertyLoss: integer (nullable = true)
 |-- EstimatedContentsLoss: d

In [13]:
# List every column name in the fire-incidents DataFrame.
fire_df.columns

['IncidentNumber',
 'ExposureNumber',
 'ID',
 'Address',
 'IncidentDate',
 'CallNumber',
 'AlarmDtTm',
 'ArrivalDtTm',
 'CloseDtTm',
 'City',
 'ZIPCode',
 'Battalion',
 'StationArea',
 'Box',
 'SuppressionUnits',
 'SuppressionPersonnel',
 'EMSUnits',
 'EMSPersonnel',
 'OtherUnits',
 'OtherPersonnel',
 'FirstUnitOnScene',
 'EstimatedPropertyLoss',
 'EstimatedContentsLoss',
 'FireFatalities',
 'FireInjuries',
 'CivilianFatalities',
 'CivilianInjuries',
 'NumberofAlarms',
 'PrimarySituation',
 'MutualAid',
 'ActionTakenPrimary',
 'ActionTakenSecondary',
 'ActionTakenOther',
 'DetectorAlertedOccupants',
 'PropertyUse',
 'AreaofFireOrigin',
 'IgnitionCause',
 'IgnitionFactorPrimary',
 'IgnitionFactorSecondary',
 'HeatSource',
 'ItemFirstIgnited',
 'HumanFactorsAssociatedwithIgnition',
 'StructureType',
 'StructureStatus',
 'FloorofFireOrigin',
 'FireSpread',
 'NoFlameSpead',
 'Numberoffloorswithminimumdamage',
 'Numberoffloorswithsignificantdamage',
 'Numberoffloorswithheavydamage',
 'Numbe

In [14]:
# Persist the fire-incidents DataFrame as Parquet. "overwrite" replaces any
# existing data at the destination, so re-running this cell is safe.
output_path = "./data/output/fire-incidents"
(
    fire_df.write
    .mode("overwrite")
    .parquet(output_path)
)

### Working with Structured Operations

##### Reading a JSON File

In [15]:
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType

In [16]:
# Explicit schema for the persons JSON records, built field by field.
# Every field keeps the default nullable=True.
# NOTE(review): salary is declared as FloatType; the derived salary_increase
# values shown later in this notebook carry float rounding noise
# (e.g. 1609.6959838867188). Consider DoubleType or DecimalType if exact
# amounts matter.
persons_schema = (
    StructType()
    .add("id", IntegerType())
    .add("first_name", StringType())
    .add("last_name", StringType())
    .add("fav_movies", ArrayType(StringType()))  # array of movie-title strings
    .add("salary", FloatType())
    .add("image_url", StringType())
    .add("date_of_birth", DateType())
    .add("active", BooleanType())
)

In [17]:
# Read the persons JSON file with the explicit schema instead of inference.
# multiLine is enabled so a JSON record may span several lines of the file.
json_file_path = "./data/persons/persons.json"
persons_df = (
    spark.read
    .schema(persons_schema)
    .option("multiLine", True)
    .json(json_file_path)
)

In [18]:
# Verify that the explicit persons_schema was applied to the JSON data.
persons_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: float (nullable = true)
 |-- image_url: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- active: boolean (nullable = true)



In [19]:
# Show the first 10 person records; truncate=False keeps long values
# (movie lists, image URLs) fully visible.
persons_df.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]                        |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|4  

##### Columns and Expressions

In [24]:
from pyspark.sql.functions import col, concat_ws, expr

In [22]:
# Project three columns by referencing each one through col().
(
    persons_df
    .select(
        col("first_name"),
        col("last_name"),
        col("date_of_birth"),
    )
    .show(5, truncate=False)
)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|Drucy     |Poppy    |1991-02-16   |
|Emelyne   |Blaza    |1991-11-02   |
|Max       |Rettie   |1990-03-03   |
|Ilario    |Kean     |1987-06-09   |
|Toddy     |Drexel   |1992-10-28   |
+----------+---------+-------------+
only showing top 5 rows



In [23]:
# Same three-column projection, but each column is given as a SQL expression
# string via expr() rather than col().
(
    persons_df
    .select(
        expr("first_name"),
        expr("last_name"),
        expr("date_of_birth"),
    )
    .show(5, truncate=False)
)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|Drucy     |Poppy    |1991-02-16   |
|Emelyne   |Blaza    |1991-11-02   |
|Max       |Rettie   |1990-03-03   |
|Ilario    |Kean     |1987-06-09   |
|Toddy     |Drexel   |1992-10-28   |
+----------+---------+-------------+
only showing top 5 rows



In [28]:
# Derive two columns with Column arithmetic:
#   full_name       - first and last name joined by a single space
#   salary_increase - salary plus 10% of salary (the raised salary, not the delta)
(
    persons_df
    .select(
        concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"),
        col("salary"),
        (col("salary") * 0.10 + col("salary")).alias("salary_increase"),
    )
    .show(5, truncate=False)
)

+-------------+-------+------------------+
|full_name    |salary |salary_increase   |
+-------------+-------+------------------+
|Drucy Poppy  |1463.36|1609.6959838867188|
|Emelyne Blaza|3006.04|3306.64404296875  |
|Max Rettie   |1422.88|1565.1680053710938|
|Ilario Kean  |3561.36|3917.4961181640624|
|Toddy Drexel |4934.87|5428.35712890625  |
+-------------+-------+------------------+
only showing top 5 rows



In [30]:
# Same projection, with the raise written as a SQL expression string:
#   full_name       - first and last name joined by a single space
#   salary_increase - salary plus 10% of salary (the raised salary, not the delta)
(
    persons_df
    .select(
        concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"),
        col("salary"),
        expr("salary * 0.10 + salary").alias("salary_increase"),
    )
    .show(5, truncate=False)
)

+-------------+-------+------------------+
|full_name    |salary |salary_increase   |
+-------------+-------+------------------+
|Drucy Poppy  |1463.36|1609.6959838867188|
|Emelyne Blaza|3006.04|3306.64404296875  |
|Max Rettie   |1422.88|1565.1680053710938|
|Ilario Kean  |3561.36|3917.4961181640624|
|Toddy Drexel |4934.87|5428.35712890625  |
+-------------+-------+------------------+
only showing top 5 rows

