In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [51]:
spark = SparkSession.builder.appName("PySparkDFPractice").getOrCreate()

In [52]:
data = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

In [53]:
type(data)

list

In [54]:
# Nullable is 3rd Parameter to StructField, defaults to True
schema = StructType([
    StructField("firstname", StringType()),
    StructField("middlename", StringType()),
    StructField("lastname", StringType()),
    StructField("id", StringType()),
    StructField("gender", StringType()),
    StructField("Salary", IntegerType())
])

In [55]:
df = spark.createDataFrame(data, schema=schema)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [56]:
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|Salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



In [57]:
file_path = "./testdata/fire-incidents/fire-incidents.csv"
fire_df = spark.read\
              .format("csv")\
              .option("header", True)\
              .option("inferSchema", True)\
              .load(file_path)

In [58]:
fire_df.select("IncidentNumber", "IncidentDate", "City").show(10)

+--------------+-------------------+-------------+
|IncidentNumber|       IncidentDate|         City|
+--------------+-------------------+-------------+
|      20104668|2020-09-11 00:00:00|San Francisco|
|      20104708|2020-09-11 00:00:00|San Francisco|
|      20104648|2020-09-10 00:00:00|San Francisco|
|      20104598|2020-09-10 00:00:00|San Francisco|
|      20104575|2020-09-10 00:00:00|San Francisco|
|      20104477|2020-09-10 00:00:00|San Francisco|
|      20104443|2020-09-10 00:00:00|San Francisco|
|      20104605|2020-09-10 00:00:00|San Francisco|
|      20104474|2020-09-10 00:00:00|San Francisco|
|      20104652|2020-09-10 00:00:00|San Francisco|
+--------------+-------------------+-------------+
only showing top 10 rows



In [59]:
fire_df.count()

538285

In [60]:
fire_df.printSchema()

root
 |-- IncidentNumber: integer (nullable = true)
 |-- ExposureNumber: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- IncidentDate: timestamp (nullable = true)
 |-- CallNumber: integer (nullable = true)
 |-- AlarmDtTm: timestamp (nullable = true)
 |-- ArrivalDtTm: timestamp (nullable = true)
 |-- CloseDtTm: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIPCode: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- SuppressionUnits: integer (nullable = true)
 |-- SuppressionPersonnel: integer (nullable = true)
 |-- EMSUnits: integer (nullable = true)
 |-- EMSPersonnel: integer (nullable = true)
 |-- OtherUnits: integer (nullable = true)
 |-- OtherPersonnel: integer (nullable = true)
 |-- FirstUnitOnScene: string (nullable = true)
 |-- EstimatedPropertyLoss: integer (nullable = true)
 |-- EstimatedContentsLoss: d

In [61]:
fire_df.columns

['IncidentNumber',
 'ExposureNumber',
 'ID',
 'Address',
 'IncidentDate',
 'CallNumber',
 'AlarmDtTm',
 'ArrivalDtTm',
 'CloseDtTm',
 'City',
 'ZIPCode',
 'Battalion',
 'StationArea',
 'Box',
 'SuppressionUnits',
 'SuppressionPersonnel',
 'EMSUnits',
 'EMSPersonnel',
 'OtherUnits',
 'OtherPersonnel',
 'FirstUnitOnScene',
 'EstimatedPropertyLoss',
 'EstimatedContentsLoss',
 'FireFatalities',
 'FireInjuries',
 'CivilianFatalities',
 'CivilianInjuries',
 'NumberofAlarms',
 'PrimarySituation',
 'MutualAid',
 'ActionTakenPrimary',
 'ActionTakenSecondary',
 'ActionTakenOther',
 'DetectorAlertedOccupants',
 'PropertyUse',
 'AreaofFireOrigin',
 'IgnitionCause',
 'IgnitionFactorPrimary',
 'IgnitionFactorSecondary',
 'HeatSource',
 'ItemFirstIgnited',
 'HumanFactorsAssociatedwithIgnition',
 'StructureType',
 'StructureStatus',
 'FloorofFireOrigin',
 'FireSpread',
 'NoFlameSpead',
 'Numberoffloorswithminimumdamage',
 'Numberoffloorswithsignificantdamage',
 'Numberoffloorswithheavydamage',
 'Numbe

In [62]:
output_path = "./data/output/fire-incidents"
fire_df.write\
      .format("parquet")\
      .mode("overwrite")\
      .save(output_path)

### Working with Structured Operations

##### Reading a JSON File

In [63]:
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType

In [64]:
persons_schema = StructType([
    StructField("id", IntegerType()),
    StructField("first_name", StringType()),
    StructField("last_name", StringType()),
    StructField("fav_movies", ArrayType(StringType())),
    StructField("salary", FloatType()),
    StructField("image_url", StringType()),
    StructField("date_of_birth", DateType()),
    StructField("active", BooleanType())
])

In [65]:
json_file_path = "./data/persons/persons.json"
persons_df = spark.read.json(json_file_path, persons_schema, multiLine=True)

In [66]:
persons_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: float (nullable = true)
 |-- image_url: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- active: boolean (nullable = true)



In [67]:
persons_df.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]                        |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|4  

##### Columns and Expressions

In [68]:
from pyspark.sql.functions import col, concat_ws, expr

In [69]:
persons_df.select(col("first_name"), col("last_name"), col("date_of_birth"))\
          .show(5, truncate=False)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|Drucy     |Poppy    |1991-02-16   |
|Emelyne   |Blaza    |1991-11-02   |
|Max       |Rettie   |1990-03-03   |
|Ilario    |Kean     |1987-06-09   |
|Toddy     |Drexel   |1992-10-28   |
+----------+---------+-------------+
only showing top 5 rows



In [70]:
persons_df.select(expr("first_name"), expr("last_name"), expr("date_of_birth"))\
          .show(5, truncate=False)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|Drucy     |Poppy    |1991-02-16   |
|Emelyne   |Blaza    |1991-11-02   |
|Max       |Rettie   |1990-03-03   |
|Ilario    |Kean     |1987-06-09   |
|Toddy     |Drexel   |1992-10-28   |
+----------+---------+-------------+
only showing top 5 rows



In [71]:
persons_df.select(concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"),
                  col("salary"),
                  (col("salary") * 0.10 + col("salary")).alias("salary_increase"))\
          .show(5, truncate=False)

+-------------+-------+------------------+
|full_name    |salary |salary_increase   |
+-------------+-------+------------------+
|Drucy Poppy  |1463.36|1609.6959838867188|
|Emelyne Blaza|3006.04|3306.64404296875  |
|Max Rettie   |1422.88|1565.1680053710938|
|Ilario Kean  |3561.36|3917.4961181640624|
|Toddy Drexel |4934.87|5428.35712890625  |
+-------------+-------+------------------+
only showing top 5 rows



In [72]:
persons_df.select(concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"),
                  col("salary"),
                  (expr("salary * 0.10 + salary")).alias("salary_increase"))\
          .show(5, truncate=False)

+-------------+-------+------------------+
|full_name    |salary |salary_increase   |
+-------------+-------+------------------+
|Drucy Poppy  |1463.36|1609.6959838867188|
|Emelyne Blaza|3006.04|3306.64404296875  |
|Max Rettie   |1422.88|1565.1680053710938|
|Ilario Kean  |3561.36|3917.4961181640624|
|Toddy Drexel |4934.87|5428.35712890625  |
+-------------+-------+------------------+
only showing top 5 rows



##### Filter and Where Condition

In [73]:
persons_df.filter("salary <= 3000").show(10, truncate=False)

+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name  |fav_movies                                                                  |salary |image_url                                      |date_of_birth|active|
+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy      |[I giorni contati]                                                          |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|3  |Max       |Rettie     |[The Forgotten Space, Make It Happen]                                       |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|6  |Oswald    |Petrolli   |[Wing and the Thigh, The (L'aile ou la cuisse)]          

In [74]:
persons_df.where("salary <= 3000").show(10, truncate=False)

+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name  |fav_movies                                                                  |salary |image_url                                      |date_of_birth|active|
+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy      |[I giorni contati]                                                          |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|3  |Max       |Rettie     |[The Forgotten Space, Make It Happen]                                       |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|6  |Oswald    |Petrolli   |[Wing and the Thigh, The (L'aile ou la cuisse)]          

In [75]:
persons_df.where((col("salary") <= 3000) & (col("active") == True))\
          .show(10, truncate=False)

+---+----------+---------+-----------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                       |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-----------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                               |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|9  |Emory     |Slocomb  |[Snake and Crane Arts of Shaolin (She hao ba bu), Mala Noche]    |1082.11|http://dummyimage.com/138x226.jpg/cc0000/ffffff|1974-06-08   |true  |
|16 |Margaux   |Archbold |[And Now a Word from Our Sponsor]                                |1013.75|http://dummyimage.com/229x133.png/5fa2dd/ffffff|19

In [76]:
from pyspark.sql.functions import year

In [77]:
persons_df.filter((year("date_of_birth") == 2000) | (year("date_of_birth") == 1989))\
          .show(10, truncate=False)

+---+----------+-----------+----------------------------------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name  |fav_movies                                                                                          |salary |image_url                                      |date_of_birth|active|
+---+----------+-----------+----------------------------------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|14 |Ambrosi   |Vidineev   |[Wall Street: Money Never Sleeps, Applause (Applaus), When a Stranger Calls]                        |4550.88|http://dummyimage.com/232x159.png/5fa2dd/ffffff|1989-07-20   |true  |
|15 |Feodor    |Nancekivell|[Monsoon Wedding]                                                                                   |2218.46|http://dummyimage.com/119x120.bmp/c

In [78]:
from pyspark.sql.functions import array_contains

In [79]:
persons_df.filter(array_contains(persons_df.fav_movies, "Land of the Lost")).show()

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| 11|   Timothy|   Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



##### Distinct, Drop Duplicates, and Order By

In [80]:
from pyspark.sql.functions import count, desc

In [81]:
persons_df.select("active").show(10)

+------+
|active|
+------+
|  true|
| false|
| false|
|  true|
|  true|
| false|
| false|
| false|
|  true|
|  true|
+------+
only showing top 10 rows



In [82]:
persons_df.select("active").distinct().show()

+------+
|active|
+------+
|  true|
| false|
+------+



In [83]:
persons_df.select(col("first_name"),
                  (year(col("date_of_birth")).alias("year")),
                  col("active")).orderBy("year", "first_name").show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|       Sky|1971| false|
|   Timothy|1971| false|
|    Lucita|1972|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|     Toddy|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
+----------+----+------+
only showing top 10 rows



In [84]:
dropped_df = persons_df.select(col("first_name"),
                               (year(col("date_of_birth"))).alias("year"),
                               col("active"))\
                       .dropDuplicates(["year", "active"])\
                       .orderBy("year", "first_name")

In [85]:
dropped_df.show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
+----------+----+------+
only showing top 10 rows



In [86]:
persons_df.select(col("first_name"),
                  (year(col("date_of_birth")).alias("year")),
                  col("active")).orderBy("year", ascending=False).show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|     Daron|2002|  true|
|    Virgie|2002|  true|
|    Carlen|2002|  true|
|   Lorilee|2002| false|
|    Maxine|2001| false|
|    Feodor|2000| false|
|     Kelcy|2000|  true|
|  Annabell|2000|  true|
|      Redd|2000| false|
|     Jobie|2000| false|
+----------+----+------+
only showing top 10 rows



##### Rows and Union

In [87]:
from pyspark.sql import Row

In [88]:
person_row = Row(
    101,
    "Robert",
    "Own",
    ["Men in Black III", "HomeAlone"],
    4300.64,
    "https://someimage.com",
    "1964-08-18",
    True
)

In [89]:
type(person_row)

pyspark.sql.types.Row

In [90]:
person_rows_list = [Row(
                        102,
                        "Kenny",
                        "Bobien",
                        ["Men in Black III", "HomeAlone"],
                        4300.64,
                        "https://someimage.com",
                        "1964-08-18",
                        True
                    ), Row(
                        103,
                        "Sara",
                        "Devine",
                        ["Men in Black III", "HomeAlone"],
                        4300.64,
                        "https://someimage.com",
                        "1964-08-18",
                        True
                    ), person_row]

In [91]:
print(person_rows_list)

[<Row(102, 'Kenny', 'Bobien', ['Men in Black III', 'HomeAlone'], 4300.64, 'https://someimage.com', '1964-08-18', True)>, <Row(103, 'Sara', 'Devine', ['Men in Black III', 'HomeAlone'], 4300.64, 'https://someimage.com', '1964-08-18', True)>, <Row(101, 'Robert', 'Own', ['Men in Black III', 'HomeAlone'], 4300.64, 'https://someimage.com', '1964-08-18', True)>]


In [92]:
person_row[1]

'Robert'

In [93]:
new_persons_df = spark.createDataFrame(
    person_rows_list,
    ["id", "first_name", "last_name", "fav_movies", "salary", "imageUrl",
     "date_of_birth", "active"]
)

In [94]:
new_persons_df.show()

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|            imageUrl|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|102|     Kenny|   Bobien|[Men in Black III...|4300.64|https://someimage...|   1964-08-18|  true|
|103|      Sara|   Devine|[Men in Black III...|4300.64|https://someimage...|   1964-08-18|  true|
|101|    Robert|      Own|[Men in Black III...|4300.64|https://someimage...|   1964-08-18|  true|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



In [95]:
persons_df.count()

100

In [96]:
new_persons_df.count()

3

In [97]:
add_persons_df = persons_df.union(new_persons_df)

In [98]:
add_persons_df.count()

103

In [99]:
add_persons_df.sort(desc("id")).show(10, truncate=False)

+---+----------+---------+---------------------------------------------------------------------------------+------------------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                                       |salary            |image_url                                      |date_of_birth|active|
+---+----------+---------+---------------------------------------------------------------------------------+------------------+-----------------------------------------------+-------------+------+
|103|Sara      |Devine   |[Men in Black III, HomeAlone]                                                    |4300.64           |https://someimage.com                          |1964-08-18   |true  |
|102|Kenny     |Bobien   |[Men in Black III, HomeAlone]                                                    |4300.64           |https://someimage.com                          |1964-08-18   |true  |
|101|Robert    

##### Adding, Renaming, and Dropping Columns

In [100]:
from pyspark.sql.functions import round

In [101]:
aug_persons_df1 = persons_df.withColumn("salary_increase", expr("salary * 0.10 + salary"))
aug_persons_df1.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+------------------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|salary_increase   |
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+------------------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |1609.6959838867188|
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |3306.64404296875  |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]            

In [102]:
aug_persons_df1.columns

['id',
 'first_name',
 'last_name',
 'fav_movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'salary_increase']

In [104]:
aug_persons_df2 = aug_persons_df1\
                    .withColumn("birth_year", year("date_of_birth"))\
                    .withColumnRenamed("fav_movies", "movies")\
                    .withColumn("salary_x10", round(col("salary_increase"), 2))\
                    .drop("salary_increase")

In [105]:
aug_persons_df2.columns

['id',
 'first_name',
 'last_name',
 'movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'birth_year',
 'salary_x10']

In [106]:
aug_persons_df2.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
| id|first_name|last_name|              movies| salary|           image_url|date_of_birth|active|birth_year|salary_x10|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|      1991|    1609.7|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|      1991|   3306.64|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|      1990|   1565.17|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|      1987|    3917.5|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|      1992|   5428.36|
|  6|    Oswald| Petrolli|[Wing and the 