# Manipulating DataFrames in the real world

A look at various techniques to modify the contents of DataFrames in Spark.

## Preparing the environment

### Importing libraries

In [1]:
import random
from typing import List, Optional

import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (_parse_datatype_string, StructType, StructField,
                               DoubleType, IntegerType, StringType, FloatType)

In [2]:
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings coming from Arrow optimizations.

### Connect to Spark

In [3]:
# Create the Spark session used by the whole notebook (or reuse an existing one).
spark = (SparkSession.builder
                     .config("spark.sql.repl.eagerEval.enabled", True)  # render a DataFrame as a table when it ends a cell
                     .getOrCreate())

In [4]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

### Loading data

In [5]:
# Read the 2014 departures and turn the MM/dd/yyyy strings into real dates in one chain.
flights_2014 = (
    spark.read
         .csv('data-sources/AA_DFW_2014_Departures_Short.csv.gz', header=True, inferSchema=True)
         .withColumn("Date (MM/DD/YYYY)", F.to_date(F.col("Date (MM/DD/YYYY)"), "MM/dd/yyyy"))
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
flights_2014.createOrReplaceTempView("flights_2014")
flights_2014.printSchema()
flights_2014.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2014-01-01,5,HNL,519
2014-01-01,7,OGG,505


In [6]:
# Read the 2015 departures and turn the MM/dd/yyyy strings into real dates in one chain.
flights_2015 = (
    spark.read
         .csv('data-sources/AA_DFW_2015_Departures_Short.csv.gz', header=True, inferSchema=True)
         .withColumn("Date (MM/DD/YYYY)", F.to_date(F.col("Date (MM/DD/YYYY)"), "MM/dd/yyyy"))
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
flights_2015.createOrReplaceTempView("flights_2015")
flights_2015.printSchema()
flights_2015.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2015-01-01,5,HNL,526
2015-01-01,7,OGG,517


In [7]:
# Read the 2017 departures and turn the MM/dd/yyyy strings into real dates in one chain.
flights_2017 = (
    spark.read
         .csv('data-sources/AA_DFW_2017_Departures_Short.csv.gz', header=True, inferSchema=True)
         .withColumn("Date (MM/DD/YYYY)", F.to_date(F.col("Date (MM/DD/YYYY)"), "MM/dd/yyyy"))
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
flights_2017.createOrReplaceTempView("flights_2017")
flights_2017.printSchema()
flights_2017.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2017-01-01,5,HNL,537
2017-01-01,7,OGG,498


In [8]:
# Read the 2018 departures and turn the MM/dd/yyyy strings into real dates in one chain.
flights_2018 = (
    spark.read
         .csv('data-sources/AA_DFW_2018_Departures_Short.csv.gz', header=True, inferSchema=True)
         .withColumn("Date (MM/DD/YYYY)", F.to_date(F.col("Date (MM/DD/YYYY)"), "MM/dd/yyyy"))
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
flights_2018.createOrReplaceTempView("flights_2018")
flights_2018.printSchema()
flights_2018.limit(2)

root
 |-- Date (MM/DD/YYYY): date (nullable = true)
 |-- Flight Number: integer (nullable = true)
 |-- Destination Airport: string (nullable = true)
 |-- Actual elapsed time (Minutes): integer (nullable = true)



Date (MM/DD/YYYY),Flight Number,Destination Airport,Actual elapsed time (Minutes)
2018-01-01,5,HNL,498
2018-01-01,7,OGG,501


In [9]:
# Read the council voters file and parse DATE (MM/dd/yyyy strings) into dates in one chain.
dallas_electors = (
    spark.read
         .csv('data-sources/DallasCouncilVoters.csv.gz', header=True, inferSchema=True)
         .withColumn("DATE", F.to_date(F.col("DATE"), "MM/dd/yyyy"))
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
dallas_electors.createOrReplaceTempView("dallas_electors")
dallas_electors.printSchema()
dallas_electors.limit(2)

root
 |-- DATE: date (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER_NAME: string (nullable = true)



DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


In [10]:
# Read the council votes file and parse DATE (MM/dd/yyyy strings) into dates in one chain.
dallas_votes = (
    spark.read
         .csv('data-sources/DallasCouncilVotes.csv.gz', header=True, inferSchema=True)
         .withColumn("DATE", F.to_date(F.col("DATE"), "MM/dd/yyyy"))
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
dallas_votes.createOrReplaceTempView("dallas_votes")
dallas_votes.printSchema()
dallas_votes.limit(2)

root
 |-- DATE: date (nullable = true)
 |-- AGENDA_ITEM_NUMBER: string (nullable = true)
 |-- ITEM_TYPE: string (nullable = true)
 |-- DISTRICT: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- VOTER NAME: string (nullable = true)
 |-- VOTE CAST: string (nullable = true)
 |-- FINAL ACTION TAKEN: string (nullable = true)
 |-- AGENDA ITEM DESCRIPTION: string (nullable = true)
 |-- AGENDA_ID: string (nullable = true)
 |-- VOTE_ID: string (nullable = true)



DATE,AGENDA_ITEM_NUMBER,ITEM_TYPE,DISTRICT,TITLE,VOTER NAME,VOTE CAST,FINAL ACTION TAKEN,AGENDA ITEM DESCRIPTION,AGENDA_ID,VOTE_ID
2017-02-08,1,AGENDA,13,Councilmember,Jennifer S. Gates,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...
2017-02-08,1,AGENDA,14,Councilmember,Philip T. Kingston,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...


In [11]:
# Load the people sample; the CSV options are set on the reader before the file is read.
people = (
    spark.read
         .options(header=True, inferSchema=True)
         .csv('data-sources/people_data_sample.csv')
)

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
people.createOrReplaceTempView("people")
people.printSchema()
people.limit(2)

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



name,age,city
Amy Meyer,3,Kimberlyborough
Amy Jones,10,Davidburgh


In [12]:
# Load the flight-time data through the generic reader, naming the Parquet format explicitly.
flight = spark.read.format('parquet').load('data-sources/flight-time.parquet')

# Register the DataFrame for Spark SQL, then display its schema and a two-row preview.
flight.createOrReplaceTempView("flight")
flight.printSchema()
flight.limit(2)

root
 |-- FL_DATE: date (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)



FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,DEP_TIME,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,CANCELLED,DISTANCE
2000-01-01,DL,1451,BOS,"Boston, MA",ATL,"Atlanta, GA",1115,1113,1343,5,1400,1348,0,946
2000-01-01,DL,1479,BOS,"Boston, MA",ATL,"Atlanta, GA",1315,1311,1536,7,1559,1543,0,946


### Tables catalogue

In [13]:
# List what is registered in the catalog; the temp views created above should all appear.
spark.catalog.listTables()

[Table(name='dallas_electors', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='dallas_votes', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flight', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2014', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2015', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2017', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='flights_2018', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='people', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

## DataFrame column operations

### DataFrame refresher

In [14]:
# Work on a separate DataFrame that holds every column of dallas_electors
voter_df = dallas_electors.select('*')
voter_df.limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


In [15]:
# Return rows where name starts with "M" (in a LIKE pattern, % matches any run of characters)
voter_df.filter(voter_df.VOTER_NAME.like('M%')).limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Mayor,Michael S. Rawlings
2017-02-08,Mayor,Michael S. Rawlings


In [16]:
# Return name and position only; the columns come back in the order they are listed
voters = voter_df.select('VOTER_NAME', 'TITLE')
voters.limit(2)

VOTER_NAME,TITLE
Jennifer S. Gates,Councilmember
Philip T. Kingston,Councilmember


### Common DataFrame transformations

In [17]:
# Filter / Where
# DATE is compared with a Spark date literal instead of a pandas Timestamp, so both
# sides are dates: no implicit date -> timestamp cast is needed and the Python value
# does not go through a driver-side timezone conversion.
voter_df.filter(voter_df.DATE > F.lit('2018-01-01').cast('date')).limit(2)  # or voter_df.where(...)

DATE,TITLE,VOTER_NAME
2018-04-25,Councilmember,Sandy Greyson
2018-04-25,Councilmember,Jennifer S. Gates


In [18]:
# Select: keep a single column, referenced here as a Column object
voter_df.select(voter_df.VOTER_NAME).limit(2)

VOTER_NAME
Jennifer S. Gates
Philip T. Kingston


In [19]:
# withColumn: add a YEAR column holding the year extracted from DATE
voter_df.withColumn('YEAR', F.year(voter_df.DATE)).limit(2)

DATE,TITLE,VOTER_NAME,YEAR
2017-02-08,Councilmember,Jennifer S. Gates,2017
2017-02-08,Councilmember,Philip T. Kingston,2017


In [20]:
# drop: return the DataFrame without the TITLE column
voter_df.drop('TITLE').limit(2)

DATE,VOTER_NAME
2017-02-08,Jennifer S. Gates
2017-02-08,Philip T. Kingston


In [21]:
# voter_df itself is unchanged: each transformation above returned a new DataFrame
# and none of those results was assigned back to voter_df
voter_df.limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


### Filtering data

- Remove nulls
- Comparisons
- Text filters
- Negate with ~

In [22]:
# Remove nulls: keep only the rows whose VOTER_NAME is not null
voter_df.filter(voter_df['VOTER_NAME'].isNotNull())

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston
2017-02-08,Mayor,Michael S. Rawlings
2017-02-08,Councilmember,Adam Medrano
2017-02-08,Councilmember,Casey Thomas
2017-02-08,Councilmember,Carolyn King Arnold
2017-02-08,Councilmember,Scott Griggs
2017-02-08,Councilmember,B. Adam McGough
2017-02-08,Councilmember,Lee Kleinman
2017-02-08,Councilmember,Sandy Greyson


In [23]:
# Comparisons: keep the rows whose DATE falls in a year after 1800
voter_df.filter(F.year(voter_df.DATE) > 1800).limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


In [24]:
# Text filters: keep the rows whose TITLE contains the substring 'Mayor'
voter_df.where(voter_df['TITLE'].contains('Mayor')).limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Mayor,Michael S. Rawlings
2017-02-08,Mayor,Michael S. Rawlings


In [25]:
# Negate with ~: isNull() is inverted, so only rows with a non-null TITLE are kept
voter_df.where(~ voter_df.TITLE.isNull()).limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


### Column string transformations

In [26]:
# Applied per column as transformation: UPPER holds VOTER_NAME in upper case
voter_df.withColumn('UPPER', F.upper('VOTER_NAME')).limit(2)

DATE,TITLE,VOTER_NAME,UPPER
2017-02-08,Councilmember,Jennifer S. Gates,JENNIFER S. GATES
2017-02-08,Councilmember,Philip T. Kingston,PHILIP T. KINGSTON


In [27]:
# Can create intermediary columns: SPLITS holds VOTER_NAME split on single spaces, as an array
voter_df.withColumn('SPLITS', F.split('VOTER_NAME', ' ')).limit(2)

DATE,TITLE,VOTER_NAME,SPLITS
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga..."
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King..."


In [28]:
# Can cast to other types: the year extracted from DATE is cast to a float here
voter_df.withColumn('YEAR', F.year(voter_df.DATE).cast(FloatType())).limit(2)

DATE,TITLE,VOTER_NAME,YEAR
2017-02-08,Councilmember,Jennifer S. Gates,2017.0
2017-02-08,Councilmember,Philip T. Kingston,2017.0


### ArrayType() column functions

Various utility functions / transformations to interact with ArrayType()
- `.size(<column>)` - returns length of arrayType() column
- `.getItem(<index>)` - used to retrieve a specific item at index of list column.

In [29]:
# Build an array column (LIST_NAME) to try the ArrayType() functions on
voter_list_df = voter_df.withColumn('LIST_NAME', F.split('VOTER_NAME', ' '))
voter_list_df.limit(2)

DATE,TITLE,VOTER_NAME,LIST_NAME
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga..."
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King..."


In [30]:
# length of arrayType() column, i.e. how many pieces VOTER_NAME was split into
voter_list_df.select(F.size('LIST_NAME')).limit(2)

size(LIST_NAME)
3
3


In [31]:
# Retrieve a specific item at index of list column (index 0 is the first element)
voter_list_df.select(voter_list_df.LIST_NAME.getItem(0)).limit(2)

LIST_NAME[0]
Jennifer
Philip


## Ex. 1 - Filtering column content with Python

You've looked at using various operations on DataFrame columns - now you can modify a real dataset. The DataFrame `voter_df` contains information regarding the voters on the Dallas City Council from the past few years. This truncated DataFrame contains the date of the vote being cast and the name and position of the voter. Your manager has asked you to clean this data so it can later be integrated into some desired reports. The primary task is to remove any null entries or odd characters and return a specific set of voters where you can validate their information.

This is often one of the first steps in data cleaning - removing anything that is obviously outside the format. For this dataset, make sure to look at the original data and see what looks out of place for the `VOTER_NAME` column.

**Instructions:**

1. Show the distinct `VOTER_NAME` entries.
2. Filter `voter_df` where the `VOTER_NAME` is 1-20 characters in length.
3. Filter out the rows of `voter_df` where the `VOTER_NAME` contains an underscore (`_`).
4. Show the distinct `VOTER_NAME` entries again.

In [32]:
# Show the distinct VOTER_NAME entries (truncate=False prints every value in full)
voter_df.select('VOTER_NAME').distinct().show(10, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|VOTER_NAME                                                                                                                                                                                                                                                                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [33]:
# Filter voter_df where the VOTER_NAME is 1-20 characters in length.
# The upper bound is inclusive (<= 20) so that names of exactly 20 characters are
# kept, as the exercise asks for.
voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) <= 20')

# Filter out voter_df where the VOTER_NAME contains an underscore
voter_df = voter_df.filter(~ F.col('VOTER_NAME').contains('_'))

# Show the distinct VOTER_NAME entries again
voter_df.select('VOTER_NAME').distinct().show(40, truncate=False)

+-------------------+
|VOTER_NAME         |
+-------------------+
|Tennell Atkins     |
|Scott Griggs       |
|Scott  Griggs      |
|Sandy Greyson      |
|Michael S. Rawlings|
|Kevin Felder       |
|Adam Medrano       |
|Casey  Thomas      |
|Mark  Clayton      |
|Casey Thomas       |
|Sandy  Greyson     |
|Mark Clayton       |
|Jennifer S.  Gates |
|Tiffinni A. Young  |
|B. Adam  McGough   |
|Omar Narvaez       |
|Philip T. Kingston |
|Rickey D. Callahan |
|Dwaine R. Caraway  |
|Philip T.  Kingston|
|Jennifer S. Gates  |
|Lee M. Kleinman    |
|Monica R. Alonzo   |
|Rickey D.  Callahan|
|Carolyn King Arnold|
|Erik Wilson        |
|Lee Kleinman       |
+-------------------+



## Ex. 2 - Modifying DataFrame columns

Previously, you filtered out any rows that didn't conform to something generally resembling a name. Now based on your earlier work, your manager has asked you to create two new columns - `first_name` and `last_name`. She asks you to split the `VOTER_NAME` column into words on any space character. You'll treat the last word as the `last_name`, and all other words as the `first_name`. You'll be using some new functions in this exercise including `.split()`, `.size()`, and `.getItem()`. The `.getItem(index)` takes an integer value to return the appropriately numbered item in the column. The functions `.split()` and `.size()` are in the `pyspark.sql.functions` library.

Please note that these operations are always somewhat specific to the use case. Having your data conform to a format often matters more than the specific details of the format. Rarely is a data cleaning task meant just for one person - matching a defined format allows for easier sharing of the data later (ie, Paul doesn't need to worry about names - Mary already cleaned the dataset).

**Instructions:**

1. Add a new column called `splits` holding the list of possible names.
2. Use the `getItem()` method and create a new column called `first_name`.
3. Get the last entry of the splits list and create a column called `last_name`.
4. Drop the `splits` column and show the new `voter_df`.

In [34]:
# Review the data: voter_df as left by the filtering in Ex. 1
voter_df.limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


In [35]:
# Add a new column called splits separated on whitespace.
# The pattern is a raw string: in a normal literal '\s' is an invalid escape
# sequence that Python only tolerates with a warning. Splitting on \s+ (one or
# more whitespace characters) also copes with the double spaces seen in some names.
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, r'\s+'))
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,splits
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga..."
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King..."


In [36]:
# Create a new column called first_name based on the first item in splits (index 0)
voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0))
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,splits,first_name
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga...",Jennifer
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King...",Philip


In [37]:
# Get the last entry of the splits list and create a column called last_name.
# The index is computed per row as size(splits) - 1, since names differ in word count.
voter_df = voter_df.withColumn('last_name', voter_df.splits.getItem(F.size('splits') - 1))
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,splits,first_name,last_name
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga...",Jennifer,Gates
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King...",Philip,Kingston


In [38]:
# Drop the splits column: it was only a helper for building first_name and last_name
voter_df = voter_df.drop('splits')

# Show the voter_df DataFrame
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,first_name,last_name
2017-02-08,Councilmember,Jennifer S. Gates,Jennifer,Gates
2017-02-08,Councilmember,Philip T. Kingston,Philip,Kingston


## Conditional DataFrame column operations

### .when() - Example

`.when(<if condition>, <then x>)`

In [39]:
# Reviewing the data: people has the columns name, age and city
people.limit(2)

name,age,city
Amy Meyer,3,Kimberlyborough
Amy Jones,10,Davidburgh


In [40]:
# when() without otherwise(): rows that fail the condition get null in the new column
people.select(people.name, people.age, F.when(people.age >= 18, "Adult")).limit(10)

name,age,CASE WHEN (age >= 18) THEN Adult END
Amy Meyer,3,
Amy Jones,10,
Erica King,39,Adult
Lisa Wallace,6,
David Scott,67,Adult
Barbara Owen,23,Adult
Carlos Morgan,9,
Miguel Young,53,Adult
Stephanie Moore,18,Adult
Daniel Davidson,49,Adult


In [41]:
# Chain a second when() to label the remaining rows, and name the result with alias()
(people.select(people.name, people.age,
               F.when(people.age >= 18, "Adult")
                .when(people.age < 18, "Minor").alias('age_group')).limit(10))

name,age,age_group
Amy Meyer,3,Minor
Amy Jones,10,Minor
Erica King,39,Adult
Lisa Wallace,6,Minor
David Scott,67,Adult
Barbara Owen,23,Adult
Carlos Morgan,9,Minor
Miguel Young,53,Adult
Stephanie Moore,18,Adult
Daniel Davidson,49,Adult


### .otherwise() - Example

`.otherwise()` is like `else`

In [42]:
# otherwise() supplies the value for every row that no when() condition matched
(people.select(people.name, people.age,
               F.when(people.age >= 18, "Adult")
                .otherwise("Minor").alias('age_group')).limit(10))

name,age,age_group
Amy Meyer,3,Minor
Amy Jones,10,Minor
Erica King,39,Adult
Lisa Wallace,6,Minor
David Scott,67,Adult
Barbara Owen,23,Adult
Carlos Morgan,9,Minor
Miguel Young,53,Adult
Stephanie Moore,18,Adult
Daniel Davidson,49,Adult


## Ex. 3 - when() example

The `when()` clause lets you conditionally modify a Data Frame based on its content. You'll want to modify our `voter_df` DataFrame to add a random number to any voting member that is defined as a `"Councilmember"`.

The `voter_df` DataFrame is defined and available to you. The `pyspark.sql.functions` library is available as `F`. You can use `F.rand()` to generate the random value.

**Instructions:**

1. Add a column to `voter_df` named `random_val` with the results of the `F.rand()` method for any voter with the title `Councilmember`.
2. Show some of the DataFrame rows, noting whether the `.when()` clause worked.

In [43]:
# Review the data, starting again from the full dallas_electors DataFrame
voter_df = dallas_electors.select('*')
voter_df.limit(5).show()

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|2017-02-08|Councilmember|  Jennifer S. Gates|
|2017-02-08|Councilmember| Philip T. Kingston|
|2017-02-08|        Mayor|Michael S. Rawlings|
|2017-02-08|Councilmember|       Adam Medrano|
|2017-02-08|Councilmember|       Casey Thomas|
+----------+-------------+-------------------+



In [44]:
# Add a random_val column: F.rand() for voters with the title Councilmember,
# null for everyone else because there is no otherwise() clause
voter_df = voter_df.withColumn('random_val',
                               F.when(voter_df.TITLE=='Councilmember', F.rand()))

# Show some of the DataFrame rows, noting whether the when clause worked
voter_df.limit(5).show()

+----------+-------------+-------------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|         random_val|
+----------+-------------+-------------------+-------------------+
|2017-02-08|Councilmember|  Jennifer S. Gates|0.44786287320866003|
|2017-02-08|Councilmember| Philip T. Kingston|0.23237691159717644|
|2017-02-08|        Mayor|Michael S. Rawlings|               NULL|
|2017-02-08|Councilmember|       Adam Medrano| 0.9921199713223674|
|2017-02-08|Councilmember|       Casey Thomas|  0.928410906392597|
+----------+-------------+-------------------+-------------------+



## Ex. 4 - When / Otherwise

This requirement is similar to the last, but now you want to add multiple values based on the voter's position. Modify your `voter_df` DataFrame to add a random number to any voting member that is defined as a `Councilmember`. Use `2` for the `Mayor` and `0` for any other position.

**Instructions:**

1. Add a column to `voter_df` named `random_val` with the results of the `F.rand()` method for any voter with the title `Councilmember`. Set `random_val` to `2` for the `Mayor`. Set any other title to the value `0`.
2. Show some of the Data Frame rows, noting whether the clauses worked.
3. Use the `.filter` clause to find `0` in `random_val`.

In [45]:
# Review the data, starting again from the full dallas_electors DataFrame
voter_df = dallas_electors.select('*')
voter_df.limit(5).show()

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|2017-02-08|Councilmember|  Jennifer S. Gates|
|2017-02-08|Councilmember| Philip T. Kingston|
|2017-02-08|        Mayor|Michael S. Rawlings|
|2017-02-08|Councilmember|       Adam Medrano|
|2017-02-08|Councilmember|       Casey Thomas|
+----------+-------------+-------------------+



In [46]:
# Add a column to voter_df for a voter based on their position:
# a random number for a Councilmember, 2 for the Mayor, 0 for any other title
voter_df = voter_df.withColumn('random_val',
                               F.when(voter_df.TITLE == 'Councilmember', F.rand())
                                .when(voter_df.TITLE == 'Mayor', 2)
                                .otherwise(0))

# Show some of the DataFrame rows
voter_df.limit(5).show()

+----------+-------------+-------------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|         random_val|
+----------+-------------+-------------------+-------------------+
|2017-02-08|Councilmember|  Jennifer S. Gates|0.16877539463120927|
|2017-02-08|Councilmember| Philip T. Kingston| 0.9714536022490057|
|2017-02-08|        Mayor|Michael S. Rawlings|                2.0|
|2017-02-08|Councilmember|       Adam Medrano| 0.5821056095556317|
|2017-02-08|Councilmember|       Casey Thomas| 0.7860954514113107|
+----------+-------------+-------------------+-------------------+



In [47]:
# Use the .filter() clause with random_val: 0 is the value otherwise() gave to
# titles other than Councilmember and Mayor
voter_df.filter(voter_df.random_val == 0).limit(5).show()

+----------+--------------------+-----------------+----------+
|      DATE|               TITLE|       VOTER_NAME|random_val|
+----------+--------------------+-----------------+----------+
|2018-04-25|Deputy Mayor Pro Tem|     Adam Medrano|       0.0|
|2018-04-25|       Mayor Pro Tem|Dwaine R. Caraway|       0.0|
|2018-06-20|Deputy Mayor Pro Tem|     Adam Medrano|       0.0|
|2018-06-20|       Mayor Pro Tem|Dwaine R. Caraway|       0.0|
|2018-06-20|Deputy Mayor Pro Tem|     Adam Medrano|       0.0|
+----------+--------------------+-----------------+----------+



## User defined functions

User defined functions or UDFs
- Python method
- Wrapped via the `pyspark.sql.functions.udf` method
- Stored as a variable
- Called like a normal Spark function

### Reverse string UDF - Example

In [48]:
# Review the data: the name column is what the UDF below will be applied to
people.limit(2)

name,age,city
Amy Meyer,3,Kimberlyborough
Amy Jones,10,Davidburgh


In [49]:
# Define a Python method
def reverseString(mystr: Optional[str]) -> Optional[str]:
    '''Return a reversed string, ex. Hola --> aloH.

    A null value in a Spark column reaches a Python UDF as None, so None is
    passed through unchanged instead of raising TypeError on the slice.
    '''
    if mystr is None:
        return None
    return mystr[::-1]

# Wrap the function and store as a variable; StringType() declares the UDF's return type
udfReverseString = F.udf(reverseString, StringType())

# Use with Spark: the UDF is called on a column like a built-in function
people.withColumn('reversed_name', udfReverseString(people.name)).limit(2)

name,age,city,reversed_name
Amy Meyer,3,Kimberlyborough,reyeM ymA
Amy Jones,10,Davidburgh,senoJ ymA


### Argument-less example

In [50]:
# Define a Python method
def sortingCap() -> str:
    '''Pick one of the letters 'G', 'H', 'R' or 'S' at random and return it'''
    letters = ['G', 'H', 'R', 'S']
    return random.choice(letters)

# Wrap the function and store as a variable.
# sortingCap() is random, so the UDF is flagged as non-deterministic: Spark treats
# UDFs as deterministic by default, which lets the optimizer drop duplicate calls
# or invoke the function more often than it appears in the query.
udfSortingCap = F.udf(sortingCap, StringType()).asNondeterministic()

# Use with Spark: the UDF takes no column argument, it is simply called once per row
people.withColumn('Class', udfSortingCap()).limit(2)

name,age,city,Class
Amy Meyer,3,Kimberlyborough,G
Amy Jones,10,Davidburgh,S


## Ex. 5 - Using user defined functions in Spark

You've seen some of the power behind Spark's built-in string functions when it comes to manipulating DataFrames. However, once you reach a certain point, it becomes difficult to process the data without creating a rat's nest of function calls. Here's one place where you can use User Defined Functions to manipulate our DataFrames.

For this exercise, we'll use our `voter_df` DataFrame, but instead of relying on the `first_name` column alone you're going to add a column that holds the first and middle names.

**Instructions:**

1. Edit the `getFirstAndMiddle()` function to return a space separated string of names, except the last entry in the names list.
2. Define the function as a user-defined function. It should return a string type.
3. Create a new column on `voter_df` called `first_and_middle_name` using your UDF.
4. Show the Data Frame.

In [51]:
# Rebuild voter_df from dallas_electors with the splits, first_name and last_name
# columns, then review the data.
# The split pattern is a raw string: in a normal literal '\s' is an invalid escape
# sequence that Python only tolerates with a warning.
voter_df = dallas_electors.select('*')
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, r'\s+'))
voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0))
voter_df = voter_df.withColumn('last_name', voter_df.splits.getItem(F.size('splits') - 1))
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,splits,first_name,last_name
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga...",Jennifer,Gates
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King...",Philip,Kingston


In [52]:
# Define a Python method
def getFirstAndMiddle(names: List) -> str:
    '''Return a space separated string of names except the last one.

    Args:
        names: Name parts in order, e.g. ['Jennifer', 'S.', 'Gates'].
            May be None, which is what a Python UDF receives for a null
            array value.

    Returns:
        Every entry but the last joined by single spaces ('' when there are
        fewer than two entries), or None when `names` is None.
    '''
    # Slicing None raises TypeError, which would abort the whole Spark job
    # on the first null row; propagate the null instead.
    if names is None:
        return None
    return ' '.join(names[:-1])

# Define the method as a UDF
udfFirstAndMiddle = F.udf(getFirstAndMiddle, StringType())

# Create a new column using your UDF
voter_df = voter_df.withColumn('first_and_middle_name', udfFirstAndMiddle(voter_df.splits))

# Show the DataFrame
voter_df.limit(2)

DATE,TITLE,VOTER_NAME,splits,first_name,last_name,first_and_middle_name
2017-02-08,Councilmember,Jennifer S. Gates,"[Jennifer, S., Ga...",Jennifer,Gates,Jennifer S.
2017-02-08,Councilmember,Philip T. Kingston,"[Philip, T., King...",Philip,Kingston,Philip T.


## Partitioning and lazy processing

### Monotonically increasing IDs

`pyspark.sql.functions.monotonically_increasing_id()`

- Integer (64-bit), increases in value, unique
- Not necessarily sequential (gaps exist)
- Completely parallel

In [53]:
# Review the data
people.limit(2)

name,age,city
Amy Meyer,3,Kimberlyborough
Amy Jones,10,Davidburgh


In [54]:
# Adding ID column
people.withColumn('id', F.monotonically_increasing_id() + 1).limit(5)

name,age,city,id
Amy Meyer,3,Kimberlyborough,1
Amy Jones,10,Davidburgh,2
Erica King,39,Kerrchester,3
Lisa Wallace,6,Port Lauren,4
David Scott,67,South Tonyaside,5


## Ex. 6 - Adding an ID Field

When working with data, you sometimes only want to access certain fields and perform various operations. In this case, find all the unique voter names from the DataFrame and add a unique ID number. Remember that Spark IDs are assigned based on the DataFrame partition - as such the ID values may be much greater than the actual number of rows in the DataFrame.

With Spark's lazy processing, the IDs are not actually generated until an action is performed and can be somewhat random depending on the size of the dataset.

**Instructions:**

1. Select the unique entries from the column `VOTER NAME` and create a new DataFrame called `voter_df`.
2. Count the rows in the `voter_df` DataFrame.
3. Add a `ROW_ID` column using the appropriate Spark function.
4. Show the rows with the 10 highest `ROW_ID`s.

In [55]:
# Review the data
voter_df = dallas_votes.select('*')
voter_df.limit(2)

DATE,AGENDA_ITEM_NUMBER,ITEM_TYPE,DISTRICT,TITLE,VOTER NAME,VOTE CAST,FINAL ACTION TAKEN,AGENDA ITEM DESCRIPTION,AGENDA_ID,VOTE_ID
2017-02-08,1,AGENDA,13,Councilmember,Jennifer S. Gates,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...
2017-02-08,1,AGENDA,14,Councilmember,Philip T. Kingston,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...


In [56]:
# Select all the unique council voters
voter_df = voter_df.select(voter_df["VOTER NAME"]).distinct()

# Count the rows in voter_df
print("\nThere are %d rows in the voter_df DataFrame.\n" % voter_df.count())

# Add a ROW_ID
voter_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id() + 1)
voter_df.limit(2)


There are 36 rows in the voter_df DataFrame.



VOTER NAME,ROW_ID
Tennell Atkins,1
the final 20...,2


In [57]:
# Show the rows with 10 highest IDs in the set (desc order)
voter_df.orderBy(voter_df.ROW_ID.desc()).limit(10)

VOTER NAME,ROW_ID
,36
Lee Kleinman,35
the final 201...,34
Erik Wilson,33
the final 20...,32
Carolyn King Arnold,31
Rickey D. Callahan,30
the final 2...,29
Monica R. Alonzo,28
Lee M. Kleinman,27


## Ex. 7 - IDs with different partitions

You've just completed adding an ID field to a DataFrame. Now, take a look at what happens when you do the same thing on DataFrames containing a different number of partitions.

To check the number of partitions, use the method `.rdd.getNumPartitions()` on a DataFrame.

**Instructions:**

1. Print the number of partitions on each DataFrame.
2. Add a `ROW_ID` field to each DataFrame.
3. Show the top 10 IDs in each DataFrame.

In [58]:
# Review the data
dallas_votes.limit(2)

DATE,AGENDA_ITEM_NUMBER,ITEM_TYPE,DISTRICT,TITLE,VOTER NAME,VOTE CAST,FINAL ACTION TAKEN,AGENDA ITEM DESCRIPTION,AGENDA_ID,VOTE_ID
2017-02-08,1,AGENDA,13,Councilmember,Jennifer S. Gates,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...
2017-02-08,1,AGENDA,14,Councilmember,Philip T. Kingston,,NO ACTION NEEDED,Call to Order,020817__Special__1,020817__Special__...


In [59]:
# Preparing the data
# Both DataFrames hold the same distinct voter names; only the partitioning differs.
# The partition count is set explicitly on both: after distinct() alone it depends on
# the session's shuffle settings / adaptive execution and is not guaranteed to be 1.
df1_votes = dallas_votes.select('VOTER NAME').distinct().coalesce(1)     # 1 - partition
df5_votes = dallas_votes.select('VOTER NAME').distinct().repartition(5)  # 5 - partition
df1_votes.count() == df5_votes.count()

True

In [60]:
# Print the number of partitions in each DataFrame
print("There are %d partitions in the df5_votes DataFrame." % df5_votes.rdd.getNumPartitions())
print("There are %d partitions in the df1_votes DataFrame." % df1_votes.rdd.getNumPartitions())

There are 5 partitions in the df5_votes DataFrame.
There are 1 partitions in the df1_votes DataFrame.


In [61]:
# Add a ROW_ID field to each DataFrame
df5_votes = df5_votes.withColumn('ROW_ID', F.monotonically_increasing_id())
df1_votes = df1_votes.withColumn('ROW_ID', F.monotonically_increasing_id())

**Note:** 
> Notice the drastic difference in the `'ROW_ID'` values between the two DataFrames. Understanding how lazy processing and partitioning behave is integral to mastering Spark. Make sure to always test your assumptions when creating a Spark workflow to avoid nasty surprises in production.

In [62]:
# Show up to 36 rows of df5_votes, ordered by ROW_ID from highest to lowest
df5_votes.orderBy(df5_votes.ROW_ID.desc()).show(36)

+--------------------+-----------+
|          VOTER NAME|     ROW_ID|
+--------------------+-----------+
|        Lee Kleinman|34359738375|
|       Mark  Clayton|34359738374|
|    Casey Thomas, II|34359738373|
|  Philip T. Kingston|34359738372|
|  the  final  201...|34359738371|
|          011018__42|34359738370|
|       Sandy Greyson|34359738369|
|   Casey  Thomas, II|34359738368|
|        Omar Narvaez|25769803782|
|        Mark Clayton|25769803781|
|       Scott  Griggs|25769803780|
|   Jennifer S. Gates|25769803779|
| Michael S. Rawlings|25769803778|
|                NULL|25769803777|
|     Lee M. Kleinman|25769803776|
|      Tennell Atkins|17179869190|
|    Monica R. Alonzo|17179869189|
|        Adam Medrano|17179869188|
| the final 2018 A...|17179869187|
|        Kevin Felder|17179869186|
|   the   final  2...|17179869185|
|   Dwaine R. Caraway|17179869184|
| Carolyn King Arnold| 8589934598|
|  the  final  201...| 8589934597|
|   the   final  2...| 8589934596|
| Rickey D.  Callaha

In [63]:
# Show up to 36 rows of df1_votes, ordered by ROW_ID from highest to lowest
df1_votes.orderBy(df1_votes.ROW_ID.desc()).show(36)

+--------------------+------+
|          VOTER NAME|ROW_ID|
+--------------------+------+
|                NULL|    35|
|        Lee Kleinman|    34|
|  the  final  201...|    33|
|         Erik Wilson|    32|
|  the  final   20...|    31|
| Carolyn King Arnold|    30|
| Rickey D.  Callahan|    29|
|   the   final  2...|    28|
|    Monica R. Alonzo|    27|
|     Lee M. Kleinman|    26|
|   Jennifer S. Gates|    25|
| Philip T.  Kingston|    24|
|   Dwaine R. Caraway|    23|
|  Rickey D. Callahan|    22|
|  Philip T. Kingston|    21|
|        Omar Narvaez|    20|
|    B. Adam  McGough|    19|
|  the  final  201...|    18|
|   Tiffinni A. Young|    17|
|  Jennifer S.  Gates|    16|
|        Mark Clayton|    15|
|      Sandy  Greyson|    14|
|   Casey  Thomas, II|    13|
|       Mark  Clayton|    12|
|    Casey Thomas, II|    11|
|          011018__42|    10|
|   the   final  2...|     9|
|        Adam Medrano|     8|
|        Kevin Felder|     7|
| the final 2018 A...|     6|
| Michael 

## Ex. 8 - More ID tricks

Once you define a Spark process, you'll likely want to use it many times. Depending on your needs, you may want to start your IDs at a certain value so there isn't overlap with previous runs of the Spark task. This behavior is similar to how IDs would behave in a relational database. You have been given the task to make sure that the IDs output from a monthly Spark task start at the highest value from the previous month.

**Instructions:**

1. Determine the highest `ROW_ID` in `voter_df_march` and save it in the variable `previous_max_ID`. The statement `.rdd.max()[0]` will get the maximum ID.
2. Add a `ROW_ID` column to `voter_df_april` starting at the value of `previous_max_ID`.
3. Show the `ROW_ID`s from both DataFrames and compare.

In [64]:
# Review the data
dallas_electors.limit(2)

DATE,TITLE,VOTER_NAME
2017-02-08,Councilmember,Jennifer S. Gates
2017-02-08,Councilmember,Philip T. Kingston


In [65]:
# Prepare the data - march
# Distinct voter names whose DATE lies between the two March 2017 bounds (inclusive),
# each tagged with a ROW_ID equal to the monotonically increasing id plus 1.
voter_df_march = (
    dallas_electors
    .filter((dallas_electors.DATE >= pd.to_datetime('2017-03-01'))
            & (dallas_electors.DATE <= pd.to_datetime('2017-03-31')))
    .select('VOTER_NAME')
    .distinct()
    .withColumn('ROW_ID', F.monotonically_increasing_id() + 1)
)
print('Total rows:', voter_df_march.count())
voter_df_march.limit(2)

Total rows: 15


VOTER_NAME,ROW_ID
Scott Griggs,1
Sandy Greyson,2


In [66]:
# Prepare the data - april
# Distinct voter names whose DATE lies between the two April 2017 bounds (inclusive);
# no ROW_ID column is added here.
voter_df_april = (
    dallas_electors
    .filter((dallas_electors.DATE >= pd.to_datetime('2017-04-01'))
            & (dallas_electors.DATE <= pd.to_datetime('2017-04-30')))
    .select('VOTER_NAME')
    .distinct()
)
print('Total rows:', voter_df_april.count())
voter_df_april.limit(2)

Total rows: 15


VOTER_NAME
Scott Griggs
Sandy Greyson


In [67]:
# Determine the highest ROW_ID and save it in previous_max_ID
previous_max_ID = voter_df_march.select('ROW_ID').rdd.max()[0]
previous_max_ID

15

In [68]:
print(voter_df_march.selectExpr('max(ROW_ID)').collect()[0])
voter_df_march.selectExpr('max(ROW_ID)').collect()[0]['max(ROW_ID)']

Row(max(ROW_ID)=15)


15

In [69]:
print(voter_df_march.select(F.max('ROW_ID')).collect()[0])
voter_df_march.select(F.max('ROW_ID')).collect()[0]['max(ROW_ID)']

Row(max(ROW_ID)=15)


15

In [70]:
# Add a ROW_ID column to voter_df_april starting at the desired value
voter_df_april = voter_df_april.withColumn('ROW_ID', F.monotonically_increasing_id() + previous_max_ID + 1)
print('Total rows:', voter_df_april.count())
voter_df_april.limit(2)

Total rows: 15


VOTER_NAME,ROW_ID
Scott Griggs,16
Sandy Greyson,17


In [71]:
# Show the ROW_ID from both DataFrames and compare
voter_df_march.select('ROW_ID').show()
voter_df_april.select('ROW_ID').show()

+------+
|ROW_ID|
+------+
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
+------+

+------+
|ROW_ID|
+------+
|    16|
|    17|
|    18|
|    19|
|    20|
|    21|
|    22|
|    23|
|    24|
|    25|
|    26|
|    27|
|    28|
|    29|
|    30|
+------+



## Close session

In [72]:
spark.stop()