In [1]:
from typing import Optional

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("PySparkDFPractice")
    .getOrCreate()
)

In [3]:
# Sample rows: (firstname, middlename, lastname, id, gender, salary).
# Empty strings stand in for missing name/id parts; the last row uses -1 as its salary.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

In [4]:
type(data)

list

In [5]:
# The third StructField argument (nullable) is omitted here, so every field
# uses its default of True.
schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("firstname", StringType()),
        ("middlename", StringType()),
        ("lastname", StringType()),
        ("id", StringType()),
        ("gender", StringType()),
        ("Salary", IntegerType()),
    ]
])

In [6]:
# Build a DataFrame from the in-memory rows using the explicit schema,
# then print the resulting column names, types and nullability.
df = spark.createDataFrame(data, schema=schema)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [7]:
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|Salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



In [8]:
# Load the fire-incidents CSV: the header row supplies the column names and
# the column types are inferred from the data instead of being declared.
file_path = "./testdata/fire-incidents/fire-incidents.csv"
fire_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(file_path)
)

In [9]:
fire_df.select("IncidentNumber", "IncidentDate", "City").show(10)

+--------------+-------------------+-------------+
|IncidentNumber|       IncidentDate|         City|
+--------------+-------------------+-------------+
|      20104668|2020-09-11 00:00:00|San Francisco|
|      20104708|2020-09-11 00:00:00|San Francisco|
|      20104648|2020-09-10 00:00:00|San Francisco|
|      20104598|2020-09-10 00:00:00|San Francisco|
|      20104575|2020-09-10 00:00:00|San Francisco|
|      20104477|2020-09-10 00:00:00|San Francisco|
|      20104443|2020-09-10 00:00:00|San Francisco|
|      20104605|2020-09-10 00:00:00|San Francisco|
|      20104474|2020-09-10 00:00:00|San Francisco|
|      20104652|2020-09-10 00:00:00|San Francisco|
+--------------+-------------------+-------------+
only showing top 10 rows



In [10]:
fire_df.count()

538285

In [11]:
fire_df.printSchema()

root
 |-- IncidentNumber: integer (nullable = true)
 |-- ExposureNumber: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Address: string (nullable = true)
 |-- IncidentDate: timestamp (nullable = true)
 |-- CallNumber: integer (nullable = true)
 |-- AlarmDtTm: timestamp (nullable = true)
 |-- ArrivalDtTm: timestamp (nullable = true)
 |-- CloseDtTm: timestamp (nullable = true)
 |-- City: string (nullable = true)
 |-- ZIPCode: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- SuppressionUnits: integer (nullable = true)
 |-- SuppressionPersonnel: integer (nullable = true)
 |-- EMSUnits: integer (nullable = true)
 |-- EMSPersonnel: integer (nullable = true)
 |-- OtherUnits: integer (nullable = true)
 |-- OtherPersonnel: integer (nullable = true)
 |-- FirstUnitOnScene: string (nullable = true)
 |-- EstimatedPropertyLoss: integer (nullable = true)
 |-- EstimatedContentsLoss: d

In [12]:
fire_df.columns

['IncidentNumber',
 'ExposureNumber',
 'ID',
 'Address',
 'IncidentDate',
 'CallNumber',
 'AlarmDtTm',
 'ArrivalDtTm',
 'CloseDtTm',
 'City',
 'ZIPCode',
 'Battalion',
 'StationArea',
 'Box',
 'SuppressionUnits',
 'SuppressionPersonnel',
 'EMSUnits',
 'EMSPersonnel',
 'OtherUnits',
 'OtherPersonnel',
 'FirstUnitOnScene',
 'EstimatedPropertyLoss',
 'EstimatedContentsLoss',
 'FireFatalities',
 'FireInjuries',
 'CivilianFatalities',
 'CivilianInjuries',
 'NumberofAlarms',
 'PrimarySituation',
 'MutualAid',
 'ActionTakenPrimary',
 'ActionTakenSecondary',
 'ActionTakenOther',
 'DetectorAlertedOccupants',
 'PropertyUse',
 'AreaofFireOrigin',
 'IgnitionCause',
 'IgnitionFactorPrimary',
 'IgnitionFactorSecondary',
 'HeatSource',
 'ItemFirstIgnited',
 'HumanFactorsAssociatedwithIgnition',
 'StructureType',
 'StructureStatus',
 'FloorofFireOrigin',
 'FireSpread',
 'NoFlameSpead',
 'Numberoffloorswithminimumdamage',
 'Numberoffloorswithsignificantdamage',
 'Numberoffloorswithheavydamage',
 'Numbe

In [13]:
# Write the fire-incidents DataFrame as Parquet using "overwrite" mode.
output_path = "./data/output/fire-incidents"
(
    fire_df.write
    .format("parquet")
    .mode("overwrite")
    .save(output_path)
)

### Working with Structured Operations

##### Reading a JSON File

In [14]:
from pyspark.sql.types import ArrayType, FloatType, DateType, BooleanType

In [15]:
# Explicit schema for the persons JSON file; nullable is left at its default
# for every field, and fav_movies is an array of strings.
persons_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("id", IntegerType()),
        ("first_name", StringType()),
        ("last_name", StringType()),
        ("fav_movies", ArrayType(StringType())),
        ("salary", FloatType()),
        ("image_url", StringType()),
        ("date_of_birth", DateType()),
        ("active", BooleanType()),
    ]
])

In [16]:
# Read the persons JSON file with the explicit schema and multiLine enabled.
json_file_path = "./data/persons/persons.json"
persons_df = spark.read.json(
    path=json_file_path,
    schema=persons_schema,
    multiLine=True,
)

In [17]:
persons_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- fav_movies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- salary: float (nullable = true)
 |-- image_url: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- active: boolean (nullable = true)



In [18]:
persons_df.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]                        |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|4  

##### Columns and Expressions

In [19]:
from pyspark.sql.functions import col, concat_ws, expr

In [20]:
# Select three columns through col() references and show the first five rows.
(
    persons_df
    .select(col("first_name"), col("last_name"), col("date_of_birth"))
    .show(5, truncate=False)
)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|Drucy     |Poppy    |1991-02-16   |
|Emelyne   |Blaza    |1991-11-02   |
|Max       |Rettie   |1990-03-03   |
|Ilario    |Kean     |1987-06-09   |
|Toddy     |Drexel   |1992-10-28   |
+----------+---------+-------------+
only showing top 5 rows



In [21]:
# Same selection as with col(), this time naming the columns through expr() strings.
(
    persons_df
    .select(expr("first_name"), expr("last_name"), expr("date_of_birth"))
    .show(5, truncate=False)
)

+----------+---------+-------------+
|first_name|last_name|date_of_birth|
+----------+---------+-------------+
|Drucy     |Poppy    |1991-02-16   |
|Emelyne   |Blaza    |1991-11-02   |
|Max       |Rettie   |1990-03-03   |
|Ilario    |Kean     |1987-06-09   |
|Toddy     |Drexel   |1992-10-28   |
+----------+---------+-------------+
only showing top 5 rows



In [22]:
# Build a full name and a salary raised by 10% using Column arithmetic.
(
    persons_df
    .select(
        concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"),
        col("salary"),
        (col("salary") * 0.10 + col("salary")).alias("salary_increase"),
    )
    .show(5, truncate=False)
)

+-------------+-------+------------------+
|full_name    |salary |salary_increase   |
+-------------+-------+------------------+
|Drucy Poppy  |1463.36|1609.6959838867188|
|Emelyne Blaza|3006.04|3306.64404296875  |
|Max Rettie   |1422.88|1565.1680053710938|
|Ilario Kean  |3561.36|3917.4961181640624|
|Toddy Drexel |4934.87|5428.35712890625  |
+-------------+-------+------------------+
only showing top 5 rows



In [23]:
# Same full name and 10% raise, with the arithmetic written as a SQL expression string.
(
    persons_df
    .select(
        concat_ws(" ", col("first_name"), col("last_name")).alias("full_name"),
        col("salary"),
        expr("salary * 0.10 + salary").alias("salary_increase"),
    )
    .show(5, truncate=False)
)

+-------------+-------+------------------+
|full_name    |salary |salary_increase   |
+-------------+-------+------------------+
|Drucy Poppy  |1463.36|1609.6959838867188|
|Emelyne Blaza|3006.04|3306.64404296875  |
|Max Rettie   |1422.88|1565.1680053710938|
|Ilario Kean  |3561.36|3917.4961181640624|
|Toddy Drexel |4934.87|5428.35712890625  |
+-------------+-------+------------------+
only showing top 5 rows



##### Filter and Where Condition

In [24]:
persons_df.filter("salary <= 3000").show(10, truncate=False)

+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name  |fav_movies                                                                  |salary |image_url                                      |date_of_birth|active|
+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy      |[I giorni contati]                                                          |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|3  |Max       |Rettie     |[The Forgotten Space, Make It Happen]                                       |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|6  |Oswald    |Petrolli   |[Wing and the Thigh, The (L'aile ou la cuisse)]          

In [25]:
persons_df.where("salary <= 3000").show(10, truncate=False)

+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name  |fav_movies                                                                  |salary |image_url                                      |date_of_birth|active|
+---+----------+-----------+----------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy      |[I giorni contati]                                                          |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|3  |Max       |Rettie     |[The Forgotten Space, Make It Happen]                                       |1422.88|http://dummyimage.com/237x140.jpg/ff4444/ffffff|1990-03-03   |false |
|6  |Oswald    |Petrolli   |[Wing and the Thigh, The (L'aile ou la cuisse)]          

In [26]:
# Keep active people earning at most 3000. The boolean "active" column is
# already a predicate, so it is used directly instead of being compared with True.
(
    persons_df
    .where((col("salary") <= 3000) & col("active"))
    .show(10, truncate=False)
)

+---+----------+---------+-----------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                       |salary |image_url                                      |date_of_birth|active|
+---+----------+---------+-----------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|1  |Drucy     |Poppy    |[I giorni contati]                                               |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |
|9  |Emory     |Slocomb  |[Snake and Crane Arts of Shaolin (She hao ba bu), Mala Noche]    |1082.11|http://dummyimage.com/138x226.jpg/cc0000/ffffff|1974-06-08   |true  |
|16 |Margaux   |Archbold |[And Now a Word from Our Sponsor]                                |1013.75|http://dummyimage.com/229x133.png/5fa2dd/ffffff|19

In [27]:
from pyspark.sql.functions import year

In [28]:
# Keep people born in 2000 or in 1989.
(
    persons_df
    .filter(year("date_of_birth").isin(2000, 1989))
    .show(10, truncate=False)
)

+---+----------+-----------+----------------------------------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|id |first_name|last_name  |fav_movies                                                                                          |salary |image_url                                      |date_of_birth|active|
+---+----------+-----------+----------------------------------------------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+
|14 |Ambrosi   |Vidineev   |[Wall Street: Money Never Sleeps, Applause (Applaus), When a Stranger Calls]                        |4550.88|http://dummyimage.com/232x159.png/5fa2dd/ffffff|1989-07-20   |true  |
|15 |Feodor    |Nancekivell|[Monsoon Wedding]                                                                                   |2218.46|http://dummyimage.com/119x120.bmp/c

In [29]:
from pyspark.sql.functions import array_contains

In [30]:
# Keep rows whose fav_movies array contains the title "Land of the Lost".
(
    persons_df
    .filter(array_contains(col("fav_movies"), "Land of the Lost"))
    .show()
)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| 11|   Timothy|   Ervine|[Land of the Lost...|1147.61|http://dummyimage...|   1971-06-02| false|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



##### Distinct, Drop Duplicates, and Order By

In [31]:
from pyspark.sql.functions import count, desc

In [32]:
persons_df.select("active").show(10)

+------+
|active|
+------+
|  true|
| false|
| false|
|  true|
|  true|
| false|
| false|
| false|
|  true|
|  true|
+------+
only showing top 10 rows



In [33]:
persons_df.select("active").distinct().show()

+------+
|active|
+------+
|  true|
| false|
+------+



In [34]:
# Project first name, birth year and active flag, sorted by year and then first name.
(
    persons_df
    .select(
        col("first_name"),
        year(col("date_of_birth")).alias("year"),
        col("active"),
    )
    .orderBy("year", "first_name")
    .show(10)
)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|       Sky|1971| false|
|   Timothy|1971| false|
|    Lucita|1972|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|     Toddy|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
+----------+----+------+
only showing top 10 rows



In [35]:
# Keep one row per (year, active) pair, then sort by year and first name.
# NOTE(review): nothing here controls which first_name is kept within a
# pair — confirm that is acceptable for downstream use.
dropped_df = (
    persons_df
    .select(
        col("first_name"),
        year(col("date_of_birth")).alias("year"),
        col("active"),
    )
    .dropDuplicates(["year", "active"])
    .orderBy("year", "first_name")
)

In [36]:
dropped_df.show(10)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|    Adrian|1971| false|
|   Feodora|1971|  true|
|      Rodi|1972| false|
|  Sherline|1972|  true|
|  Dominica|1973| false|
|    Kelila|1973|  true|
|   Balduin|1974| false|
|     Emory|1974|  true|
|    Janean|1975|  true|
|       Bev|1976|  true|
+----------+----+------+
only showing top 10 rows



In [37]:
# Same projection, sorted by birth year from most recent to oldest.
(
    persons_df
    .select(
        col("first_name"),
        year(col("date_of_birth")).alias("year"),
        col("active"),
    )
    .orderBy("year", ascending=False)
    .show(10)
)

+----------+----+------+
|first_name|year|active|
+----------+----+------+
|     Daron|2002|  true|
|    Virgie|2002|  true|
|    Carlen|2002|  true|
|   Lorilee|2002| false|
|    Maxine|2001| false|
|    Feodor|2000| false|
|     Kelcy|2000|  true|
|  Annabell|2000|  true|
|      Redd|2000| false|
|     Jobie|2000| false|
+----------+----+------+
only showing top 10 rows



##### Rows and Union

In [38]:
from pyspark.sql import Row

In [39]:
# A single positional Row: id, first name, last name, favourite movies,
# salary, image URL, date of birth (given as a string) and active flag.
person_row = Row(
    101, "Robert", "Own",
    ["Men in Black III", "HomeAlone"],
    4300.64, "https://someimage.com", "1964-08-18", True,
)

In [40]:
type(person_row)

pyspark.sql.types.Row

In [41]:
# Two more positional Rows followed by the previously built person_row.
person_rows_list = [
    Row(
        102, "Kenny", "Bobien",
        ["Men in Black III", "HomeAlone"],
        4300.64, "https://someimage.com", "1964-08-18", True,
    ),
    Row(
        103, "Sara", "Devine",
        ["Men in Black III", "HomeAlone"],
        4300.64, "https://someimage.com", "1964-08-18", True,
    ),
    person_row,
]

In [42]:
print(person_rows_list)

[<Row(102, 'Kenny', 'Bobien', ['Men in Black III', 'HomeAlone'], 4300.64, 'https://someimage.com', '1964-08-18', True)>, <Row(103, 'Sara', 'Devine', ['Men in Black III', 'HomeAlone'], 4300.64, 'https://someimage.com', '1964-08-18', True)>, <Row(101, 'Robert', 'Own', ['Men in Black III', 'HomeAlone'], 4300.64, 'https://someimage.com', '1964-08-18', True)>]


In [43]:
person_row[1]

'Robert'

In [44]:
# Turn the positional Rows into a DataFrame, supplying the column names as a list.
# NOTE(review): the sixth column is named "imageUrl" here but "image_url" in
# persons_df — confirm the mismatch is intended.
new_persons_df = spark.createDataFrame(
    data=person_rows_list,
    schema=[
        "id",
        "first_name",
        "last_name",
        "fav_movies",
        "salary",
        "imageUrl",
        "date_of_birth",
        "active",
    ],
)

In [45]:
new_persons_df.show()

+---+----------+---------+--------------------+-------+--------------------+-------------+------+
| id|first_name|last_name|          fav_movies| salary|            imageUrl|date_of_birth|active|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+
|102|     Kenny|   Bobien|[Men in Black III...|4300.64|https://someimage...|   1964-08-18|  true|
|103|      Sara|   Devine|[Men in Black III...|4300.64|https://someimage...|   1964-08-18|  true|
|101|    Robert|      Own|[Men in Black III...|4300.64|https://someimage...|   1964-08-18|  true|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+



In [46]:
persons_df.count()

100

In [47]:
new_persons_df.count()

3

In [48]:
# Append the three hand-built rows to persons_df.
# NOTE(review): the two frames name their sixth column differently
# (image_url vs imageUrl), and the union output shown in this notebook carries
# persons_df's name — presumably columns are paired by position, not by name;
# confirm against the union() documentation.
add_persons_df = persons_df.union(new_persons_df)

In [49]:
add_persons_df.count()

103

In [50]:
add_persons_df.sort(desc("id")).show(10, truncate=False)

+---+----------+---------+---------------------------------------------------------------------------------+------------------+-----------------------------------------------+-------------+------+
|id |first_name|last_name|fav_movies                                                                       |salary            |image_url                                      |date_of_birth|active|
+---+----------+---------+---------------------------------------------------------------------------------+------------------+-----------------------------------------------+-------------+------+
|103|Sara      |Devine   |[Men in Black III, HomeAlone]                                                    |4300.64           |https://someimage.com                          |1964-08-18   |true  |
|102|Kenny     |Bobien   |[Men in Black III, HomeAlone]                                                    |4300.64           |https://someimage.com                          |1964-08-18   |true  |
|101|Robert    

##### Adding, Renaming, and Dropping Columns

In [51]:
from pyspark.sql.functions import round

In [52]:
# Add a salary_increase column (salary plus 10% of salary) computed from a
# SQL expression string, then display the first ten rows untruncated.
aug_persons_df1 = persons_df.withColumn("salary_increase", expr("salary * 0.10 + salary"))
aug_persons_df1.show(10, truncate=False)

+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+------------------+
|id |first_name|last_name|fav_movies                                                   |salary |image_url                                      |date_of_birth|active|salary_increase   |
+---+----------+---------+-------------------------------------------------------------+-------+-----------------------------------------------+-------------+------+------------------+
|1  |Drucy     |Poppy    |[I giorni contati]                                           |1463.36|http://dummyimage.com/126x166.png/cc0000/ffffff|1991-02-16   |true  |1609.6959838867188|
|2  |Emelyne   |Blaza    |[Musketeer, The, Topralli]                                   |3006.04|http://dummyimage.com/158x106.bmp/cc0000/ffffff|1991-11-02   |false |3306.64404296875  |
|3  |Max       |Rettie   |[The Forgotten Space, Make It Happen]            

In [53]:
aug_persons_df1.columns

['id',
 'first_name',
 'last_name',
 'fav_movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'salary_increase']

In [54]:
# Derive birth_year, rename fav_movies to movies, and replace salary_increase
# with its value rounded to 2 decimals.
# NOTE(review): the rounded column is named salary_x10 although it holds
# salary plus 10%, not salary times 10.
aug_persons_df2 = (
    aug_persons_df1
    .withColumn("birth_year", year("date_of_birth"))
    .withColumnRenamed("fav_movies", "movies")
    .withColumn("salary_x10", round(col("salary_increase"), 2))
    .drop("salary_increase")
)

In [55]:
aug_persons_df2.columns

['id',
 'first_name',
 'last_name',
 'movies',
 'salary',
 'image_url',
 'date_of_birth',
 'active',
 'birth_year',
 'salary_x10']

In [56]:
aug_persons_df2.show(10)

+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
| id|first_name|last_name|              movies| salary|           image_url|date_of_birth|active|birth_year|salary_x10|
+---+----------+---------+--------------------+-------+--------------------+-------------+------+----------+----------+
|  1|     Drucy|    Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|      1991|    1609.7|
|  2|   Emelyne|    Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|      1991|   3306.64|
|  3|       Max|   Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|      1990|   1565.17|
|  4|    Ilario|     Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|      1987|    3917.5|
|  5|     Toddy|   Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|      1992|   5428.36|
|  6|    Oswald| Petrolli|[Wing and the 

##### Working with Missing or Bad Data

In [57]:
# Rows of (actor name, movie title, produced year) with None marking missing values.
bad_movies_list = [
    Row(*movie_values)
    for movie_values in [
        (None, None, None),
        (None, None, 2020),
        ("John Doe", "Awesome Movie", None),
        (None, "Awesome Movie", 2021),
        ("Mary Jane", None, 2019),
        ("Vikter Duplaix", "Not another teen movie", 2001),
    ]
]

In [58]:
bad_movies_list

[<Row(None, None, None)>,
 <Row(None, None, 2020)>,
 <Row('John Doe', 'Awesome Movie', None)>,
 <Row(None, 'Awesome Movie', 2021)>,
 <Row('Mary Jane', None, 2019)>,
 <Row('Vikter Duplaix', 'Not another teen movie', 2001)>]

In [59]:
bad_movies_columns = ["actor_name", "movie_title", "produced_year"]

In [60]:
bad_movies_df = spark.createDataFrame(bad_movies_list, schema=bad_movies_columns)

In [61]:
bad_movies_df.show(10, truncate=False)

+--------------+----------------------+-------------+
|actor_name    |movie_title           |produced_year|
+--------------+----------------------+-------------+
|null          |null                  |null         |
|null          |null                  |2020         |
|John Doe      |Awesome Movie         |null         |
|null          |Awesome Movie         |2021         |
|Mary Jane     |null                  |2019         |
|Vikter Duplaix|Not another teen movie|2001         |
+--------------+----------------------+-------------+



In [62]:
# Drop rows containing nulls using the default mode; the displayed result keeps
# only the fully populated row, the same result as na.drop("any").
bad_movies_df.na.drop().show(truncate=False)

+--------------+----------------------+-------------+
|actor_name    |movie_title           |produced_year|
+--------------+----------------------+-------------+
|Vikter Duplaix|Not another teen movie|2001         |
+--------------+----------------------+-------------+



In [63]:
# "any": a row is dropped as soon as one of its columns is null, so only the
# fully populated row remains.
bad_movies_df.na.drop("any").show(truncate=False)

+--------------+----------------------+-------------+
|actor_name    |movie_title           |produced_year|
+--------------+----------------------+-------------+
|Vikter Duplaix|Not another teen movie|2001         |
+--------------+----------------------+-------------+



In [64]:
# "all": a row is dropped only when every column is null, so just the
# (null, null, null) row disappears.
bad_movies_df.na.drop("all").show(truncate=False)

+--------------+----------------------+-------------+
|actor_name    |movie_title           |produced_year|
+--------------+----------------------+-------------+
|null          |null                  |2020         |
|John Doe      |Awesome Movie         |null         |
|null          |Awesome Movie         |2021         |
|Mary Jane     |null                  |2019         |
|Vikter Duplaix|Not another teen movie|2001         |
+--------------+----------------------+-------------+



In [65]:
# Keep rows that have an actor name; isNotNull() states this directly instead
# of comparing the result of isNull() with True.
bad_movies_df.filter(col("actor_name").isNotNull()).show(truncate=False)

+--------------+----------------------+-------------+
|actor_name    |movie_title           |produced_year|
+--------------+----------------------+-------------+
|John Doe      |Awesome Movie         |null         |
|Mary Jane     |null                  |2019         |
|Vikter Duplaix|Not another teen movie|2001         |
+--------------+----------------------+-------------+



In [66]:
# Keep rows whose actor name is missing; isNull() is already the predicate,
# so no comparison with False is needed.
bad_movies_df.filter(col("actor_name").isNull()).show(truncate=False)

+----------+-------------+-------------+
|actor_name|movie_title  |produced_year|
+----------+-------------+-------------+
|null      |null         |null         |
|null      |null         |2020         |
|null      |Awesome Movie|2021         |
+----------+-------------+-------------+



In [67]:
bad_movies_df.describe("produced_year").show()

+-------+-----------------+
|summary|    produced_year|
+-------+-----------------+
|  count|                4|
|   mean|          2015.25|
| stddev|9.535023160258536|
|    min|             2001|
|    max|             2021|
+-------+-----------------+



In [68]:
bad_movies_df.describe("actor_name").show()

+-------+--------------+
|summary|    actor_name|
+-------+--------------+
|  count|             3|
|   mean|          null|
| stddev|          null|
|    min|      John Doe|
|    max|Vikter Duplaix|
+-------+--------------+



##### Working with User Defined Functions

In [69]:
from pyspark.sql.functions import udf

In [70]:
students_list = [("Joe", 85), ("Jane", 90), ("Mary", 55)]

In [71]:
student_columns = ["name", "score"]

In [72]:
students_df = spark.createDataFrame(students_list, schema=student_columns)

In [73]:
students_df.show(truncate=False)

+----+-----+
|name|score|
+----+-----+
|Joe |85   |
|Jane|90   |
|Mary|55   |
+----+-----+



In [74]:
def letter_grade(score: Optional[int]) -> Optional[str]:
    """Map a numeric score to a letter grade.

    Args:
        score: The score to grade, or None when the score is missing
            (for example a null column value passed in through a UDF).

    Returns:
        "Cheating" for a score above 100, "A" for at least 90, "B" for at
        least 80, "C" for at least 70, "F" for anything lower, and None
        when ``score`` is None.
    """
    # A missing score cannot be compared with a number; return a missing
    # grade instead of raising TypeError.
    if score is None:
        return None

    # Thresholds are checked from highest to lowest, so the first match wins.
    if score > 100:
        return "Cheating"
    if score >= 90:
        return "A"
    if score >= 80:
        return "B"
    if score >= 70:
        return "C"
    return "F"

In [75]:
print(letter_grade(75))

C


In [76]:
# Wrap the Python grading function as a Spark UDF that produces a string column.
letter_grade_udf = udf(letter_grade, returnType=StringType())

In [77]:
# Apply the UDF to the score column and show it next to the original columns.
(
    students_df
    .select(
        "name",
        "score",
        letter_grade_udf(col("score")).alias("grade"),
    )
    .show(truncate=False)
)

+----+-----+-----+
|name|score|grade|
+----+-----+-----+
|Joe |85   |B    |
|Jane|90   |A    |
|Mary|55   |F    |
+----+-----+-----+



##### Aggregations

In [78]:
# Load the flight summary CSV with a header row and inferred column types.
flight_file = "./data/flights/flight-summary.csv"
flight_summary_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(flight_file)
)

In [79]:
flight_summary_df.count()

4693

In [80]:
flight_summary_df.show(10, truncate=False)

+-----------+------------------------------------------------+------------+------------+---------+--------------------------------------------+----------------+----------+-----+
|origin_code|origin_airport                                  |origin_city |origin_state|dest_code|dest_airport                                |dest_city       |dest_state|count|
+-----------+------------------------------------------------+------------+------------+---------+--------------------------------------------+----------------+----------+-----+
|BQN        |Rafael Hernández Airport                        |Aguadilla   |PR          |MCO      |Orlando International Airport               |Orlando         |FL        |441  |
|PHL        |Philadelphia International Airport              |Philadelphia|PA          |MCO      |Orlando International Airport               |Orlando         |FL        |4869 |
|MCI        |Kansas City International Airport               |Kansas City |MO          |IAH      |George Bush 

In [81]:
flight_summary_df.printSchema()

root
 |-- origin_code: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- origin_city: string (nullable = true)
 |-- origin_state: string (nullable = true)
 |-- dest_code: string (nullable = true)
 |-- dest_airport: string (nullable = true)
 |-- dest_city: string (nullable = true)
 |-- dest_state: string (nullable = true)
 |-- count: integer (nullable = true)



In [82]:
# Rename the "count" column to "flight_count"; the same variable name is
# rebound to the renamed DataFrame.
flight_summary_df = flight_summary_df.withColumnRenamed("count", "flight_count")

In [83]:
flight_summary_df.show(10, truncate=False)

+-----------+------------------------------------------------+------------+------------+---------+--------------------------------------------+----------------+----------+------------+
|origin_code|origin_airport                                  |origin_city |origin_state|dest_code|dest_airport                                |dest_city       |dest_state|flight_count|
+-----------+------------------------------------------------+------------+------------+---------+--------------------------------------------+----------------+----------+------------+
|BQN        |Rafael Hernández Airport                        |Aguadilla   |PR          |MCO      |Orlando International Airport               |Orlando         |FL        |441         |
|PHL        |Philadelphia International Airport              |Philadelphia|PA          |MCO      |Orlando International Airport               |Orlando         |FL        |4869        |
|MCI        |Kansas City International Airport               |Kansas City |

##### count(col) and countDistinct

In [84]:
# count(col) counts the non-null values of a column; countDistinct counts its distinct values.
# NOTE(review): on Spark >= 3.2 countDistinct is an alias of count_distinct, which the
# PySpark docs recommend calling directly -- confirm the cluster version before switching.
from pyspark.sql.functions import count, countDistinct

In [86]:
# Non-null value counts for the origin and destination airport columns.
(
    flight_summary_df
    .select(count("origin_airport"), count("dest_airport"))
    .show(truncate=False)
)

+---------------------+-------------------+
|count(origin_airport)|count(dest_airport)|
+---------------------+-------------------+
|4693                 |4693               |
+---------------------+-------------------+



In [87]:
# Small frame with nulls scattered across all three columns; the next cell counts it.
bad_movies_df.show(10, truncate=False)

+--------------+----------------------+-------------+
|actor_name    |movie_title           |produced_year|
+--------------+----------------------+-------------+
|null          |null                  |null         |
|null          |null                  |2020         |
|John Doe      |Awesome Movie         |null         |
|null          |Awesome Movie         |2021         |
|Mary Jane     |null                  |2019         |
|Vikter Duplaix|Not another teen movie|2001         |
+--------------+----------------------+-------------+



In [89]:
# count(<column>) skips nulls, whereas count("*") counts every row -- compare
# the per-column results with the "all" column.
(
    bad_movies_df
    .select(
        count("actor_name"),
        count("movie_title"),
        count("produced_year"),
        count("*").alias("all"),
    )
    .show(truncate=False)
)

+-----------------+------------------+--------------------+---+
|count(actor_name)|count(movie_title)|count(produced_year)|all|
+-----------------+------------------+--------------------+---+
|3                |3                 |4                   |6  |
+-----------------+------------------+--------------------+---+



In [93]:
# Number of distinct origin airports and distinct destination airports.
(
    flight_summary_df
    .select(countDistinct("origin_airport"), countDistinct("dest_airport"))
    .show(truncate=False)
)

+------------------------------+----------------------------+
|count(DISTINCT origin_airport)|count(DISTINCT dest_airport)|
+------------------------------+----------------------------+
|322                           |322                         |
+------------------------------+----------------------------+



##### min(col), max(col), sum(col), sumDistinct(col), and avg(col)

In [94]:
# WARNING: this rebinds the Python builtins max, min and sum to the Spark column
# functions for every cell that runs afterwards, so a plain-Python call such as
# sum([1, 2]) no longer reaches the builtin. A namespaced import
# (`from pyspark.sql import functions as F`, then F.sum / F.max / F.min) avoids this.
# NOTE(review): sumDistinct is deprecated since Spark 3.2 in favor of sum_distinct.
from pyspark.sql.functions import avg, max, min, sum, sumDistinct

In [98]:
# Smallest and largest flight_count values found in any row.
flight_summary_df.select(min("flight_count"), max("flight_count")).show(truncate=False)

+-----------------+-----------------+
|min(flight_count)|max(flight_count)|
+-----------------+-----------------+
|1                |13744            |
+-----------------+-----------------+



In [99]:
# Grand total of flight_count over all rows. `sum` is the Spark column function
# imported from pyspark.sql.functions, not the Python builtin.
flight_summary_df.select(sum("flight_count")).show()

+-----------------+
|sum(flight_count)|
+-----------------+
|          5332914|
+-----------------+



In [100]:
# Tiny name/score frame used by the following cells to contrast sum with sumDistinct.
# NOTE(review): students_df is not created in the nearby cells -- presumably defined
# earlier in the notebook; confirm it survives Restart & Run All.
students_df.show()

+----+-----+
|name|score|
+----+-----+
| Joe|   85|
|Jane|   90|
|Mary|   55|
+----+-----+



In [101]:
# Baseline: total of the scores before any rows are duplicated.
students_df.select(sum("score")).show()

+----------+
|sum(score)|
+----------+
|       230|
+----------+



In [102]:
# Duplicate every row by unioning the frame with itself (union keeps duplicates).
# CAUTION: students_df is reassigned from itself, so this cell is not idempotent --
# each re-run doubles the row count again. Re-create students_df before re-running.
students_df = students_df.union(students_df)
students_df.show()

+----+-----+
|name|score|
+----+-----+
| Joe|   85|
|Jane|   90|
|Mary|   55|
| Joe|   85|
|Jane|   90|
|Mary|   55|
+----+-----+



In [104]:
# With every row duplicated, the plain sum counts each score twice.
students_df.select(sum("score")).show()

+----------+
|sum(score)|
+----------+
|       460|
+----------+



In [105]:
# sumDistinct adds each distinct score value once, so the duplicated rows do not
# inflate the total.
# NOTE(review): sumDistinct is deprecated since Spark 3.2; sum_distinct is the replacement.
students_df.select(sumDistinct("score")).show()

+-------------------+
|sum(DISTINCT score)|
+-------------------+
|                230|
+-------------------+



In [106]:
# Plain total of flight_count, repeated here as the reference value for the
# distinct sum computed in the next cell.
flight_summary_df.select(sum("flight_count")).show()

+-----------------+
|sum(flight_count)|
+-----------------+
|          5332914|
+-----------------+



In [107]:
# Distinct sum: rows that share the same flight_count value contribute only once,
# which is why this is smaller than the plain sum.
flight_summary_df.select(sumDistinct("flight_count")).show()

+--------------------------+
|sum(DISTINCT flight_count)|
+--------------------------+
|                   3612257|
+--------------------------+



In [108]:
# Show avg(col) next to sum(col) / count(col) to confirm they give the same value.
(
    flight_summary_df
    .select(
        avg("flight_count"),
        sum("flight_count") / count("flight_count"),
    )
    .show()
)

+------------------+-----------------------------------------+
| avg(flight_count)|(sum(flight_count) / count(flight_count))|
+------------------+-----------------------------------------+
|1136.3549968037503|                       1136.3549968037503|
+------------------+-----------------------------------------+



##### Aggregation with Grouping

In [109]:
# Rank origin airports by how many rows they appear in; GroupedData.count()
# names its result column "count", which is what the ordering refers to.
(
    flight_summary_df
    .groupBy("origin_airport")
    .count()
    .orderBy("count", ascending=False)
    .show(5, truncate=False)
)

+------------------------------------------------+-----+
|origin_airport                                  |count|
+------------------------------------------------+-----+
|Hartsfield-Jackson Atlanta International Airport|169  |
|Chicago O'Hare International Airport            |162  |
|Dallas/Fort Worth International Airport         |148  |
|Denver International Airport                    |139  |
|Minneapolis-Saint Paul International Airport    |120  |
+------------------------------------------------+-----+
only showing top 5 rows



In [110]:
# For each origin airport, take the largest flight_count among its rows and
# list the five airports with the highest such maximum.
(
    flight_summary_df
    .groupBy("origin_airport")
    .agg(max("flight_count").alias("max_flight_count"))
    .orderBy("max_flight_count", ascending=False)
    .show(5, truncate=False)
)

+----------------------------------------------------------------------+----------------+
|origin_airport                                                        |max_flight_count|
+----------------------------------------------------------------------+----------------+
|San Francisco International Airport                                   |13744           |
|Los Angeles International Airport                                     |13457           |
|John F. Kennedy International Airport (New York International Airport)|12016           |
|McCarran International Airport                                        |9715            |
|LaGuardia Airport (Marine Air Terminal)                               |9639            |
+----------------------------------------------------------------------+----------------+
only showing top 5 rows



In [112]:
# Row counts per (state, city) pair, restricted to California after grouping;
# the filter only references a grouping column.
# NOTE(review): `col` is not imported in the nearby cells -- presumably imported
# from pyspark.sql.functions earlier in the notebook; confirm.
(
    flight_summary_df
    .groupBy("origin_state", "origin_city")
    .count()
    .where(col("origin_state") == "CA")
    .orderBy("count", ascending=False)
    .show(5, truncate=False)
)

+------------+-------------+-----+
|origin_state|origin_city  |count|
+------------+-------------+-----+
|CA          |San Francisco|80   |
|CA          |Los Angeles  |80   |
|CA          |San Diego    |47   |
|CA          |Oakland      |35   |
|CA          |Sacramento   |27   |
+------------+-------------+-----+
only showing top 5 rows



In [113]:
# Several aggregates of flight_count per origin airport in a single agg() call.
# There is no orderBy, so which five airports are displayed is not guaranteed
# to be stable between runs.
# truncate is passed by keyword (not as a bare positional False) to match every
# other show() call in this notebook.
(
    flight_summary_df
    .groupBy("origin_airport")
    .agg(
        max("flight_count"),
        min("flight_count"),
        sum("flight_count"),
        count("flight_count"),
    )
    .show(5, truncate=False)
)

+-------------------------------------------------+-----------------+-----------------+-----------------+-------------------+
|origin_airport                                   |max(flight_count)|min(flight_count)|sum(flight_count)|count(flight_count)|
+-------------------------------------------------+-----------------+-----------------+-----------------+-------------------+
|Melbourne International Airport                  |1332             |1332             |1332             |1                  |
|San Diego International Airport (Lindbergh Field)|6942             |4                |70207            |46                 |
|Eppley Airfield                                  |2083             |1                |16753            |21                 |
|Kahului Airport                                  |8313             |67               |20627            |18                 |
|Austin-Bergstrom International Airport           |4674             |8                |42067            |41           