In [1]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

# mini exercise

In [2]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

## 1. Spark Dataframe Basics

- Use the starter code above to create a pandas dataframe.

In [6]:
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


- Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.

In [3]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

- Show the first 3 rows of the dataframe.

In [7]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



- Show the first 7 rows of the dataframe.

In [8]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



- View a summary of the data using .describe.

In [34]:
df.describe()

DataFrame[summary: string, n: string, group: string]

In [33]:
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.36640264498852165| null|
| stddev| 0.8905322898155364| null|
|    min| -1.261605945319069|    x|
|    max| 2.1503829673811126|    z|
+-------+-------------------+-----+



- Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [10]:
df2 = df.select('n', 'abool')

df2.show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



- Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [11]:
df3 = df.select('group', 'abool')

df3.show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



- Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [12]:
abool = df.abool

In [15]:
df4 = df.select('group', abool.alias('a_boolean_value'))

df4.show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



- Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [16]:
n = df.n

In [17]:
df5 = df.select('group', n.alias('a_numeric_value'))

df5.show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



## 2. Column Manipulation

- Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df

In [19]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Use .select to add 4 to the n column. Show the results.

In [21]:
df6 = df.select((n + 4), 'group', 'abool')

df6.show(3)

+------------------+-----+-----+
|           (n + 4)|group|abool|
+------------------+-----+-----+
|3.2876093379494122|    z|false|
| 4.753766378659703|    x|false|
|3.9554969216619464|    z|false|
+------------------+-----+-----+
only showing top 3 rows



- Subtract 5 from the n column and view the results.

In [22]:
df7 = df.select((n - 5), 'group', 'abool')

df7.show(3)

+------------------+-----+-----+
|           (n - 5)|group|abool|
+------------------+-----+-----+
|-5.712390662050588|    z|false|
|-4.246233621340297|    x|false|
|-5.044503078338053|    z|false|
+------------------+-----+-----+
only showing top 3 rows



- Multiply the n column by 2. View the results along with the original numbers.

In [23]:
df8 = df.select('n', (n * 2), 'group', 'abool')

df8.show(3)

+--------------------+--------------------+-----+-----+
|                   n|             (n * 2)|group|abool|
+--------------------+--------------------+-----+-----+
|  -0.712390662050588|  -1.424781324101176|    z|false|
|   0.753766378659703|   1.507532757319406|    x|false|
|-0.04450307833805...|-0.08900615667610691|    z|false|
+--------------------+--------------------+-----+-----+
only showing top 3 rows



- Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [25]:
df9 = df.select('n', (n * -1).alias('n2'), 'group', 'abool')

df9.show(4)

+--------------------+--------------------+-----+-----+
|                   n|                  n2|group|abool|
+--------------------+--------------------+-----+-----+
|  -0.712390662050588|   0.712390662050588|    z|false|
|   0.753766378659703|  -0.753766378659703|    x|false|
|-0.04450307833805...|0.044503078338053455|    z|false|
| 0.45181233874578974|-0.45181233874578974|    y|false|
+--------------------+--------------------+-----+-----+
only showing top 4 rows



- Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

In [27]:
df10 = df9.select('n', 'n2', (n ** 2).alias('n3'), 'group', 'abool')

df10.show(5)

+--------------------+--------------------+--------------------+-----+-----+
|                   n|                  n2|                  n3|group|abool|
+--------------------+--------------------+--------------------+-----+-----+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|    z|false|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|    x|false|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|    z|false|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|    y|false|
|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|    z|false|
+--------------------+--------------------+--------------------+-----+-----+
only showing top 5 rows



- What happens when you run the code below?

In [28]:
df.group + df.abool

Column<'(group + abool)'>

In [29]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



- What happens when you run the code below? What is the difference between this and the previous code sample?

In [30]:
# df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#1 as double) + abool#2) AS (group + abool)#262]
+- LogicalRDD [n#0, group#1, abool#2], false


- Try adding various other columns together. What are the results of combining the different data types?

In [31]:
df.n + df.abool

Column<'(n + abool)'>

In [32]:
df.group + df.n

Column<'(group + n)'>

## 3. Type casting

- Use the starter code above to re-create a spark dataframe.

In [35]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Use .printSchema to view the datatypes in your dataframe.

In [36]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



- Use .dtypes to view the datatypes in your dataframe.

In [38]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

- What is the difference between the two code samples below?

In [42]:
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [40]:
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



- Use .select and .cast to convert the abool column to an integer type. View the results.

In [43]:
df.select('n', 'group', abool.cast('int')).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|    0|
|   0.753766378659703|    x|    0|
|-0.04450307833805...|    z|    0|
| 0.45181233874578974|    y|    0|
|  1.3451017084510097|    z|    0|
+--------------------+-----+-----+
only showing top 5 rows



- Convert the group column to a integer data type and view the results. What happens?

In [46]:
df.select('n', (df.group).cast('int'), 'abool').show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588| null|false|
|   0.753766378659703| null|false|
|-0.04450307833805...| null|false|
| 0.45181233874578974| null|false|
|  1.3451017084510097| null|false|
+--------------------+-----+-----+
only showing top 5 rows



- Convert the n column to a integer data type and view the results. What happens?

In [47]:
df.select(n.cast('int'), 'group', 'abool').show(5)

+---+-----+-----+
|  n|group|abool|
+---+-----+-----+
|  0|    z|false|
|  0|    x|false|
|  0|    z|false|
|  0|    y|false|
|  1|    z|false|
+---+-----+-----+
only showing top 5 rows



- Convert the abool column to a string data type and view the results. What happens?

In [49]:
df.select('n', 'group', abool.cast('string')).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



## 4. Built-in Functions

- Use the starter code above to re-create a spark dataframe.

In [50]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Import the necessary functions from pyspark.sql.functions

In [51]:
from pyspark.sql.functions import asc, desc, col

In [58]:
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

- Find the highest n value.

In [56]:
df.sort(desc('n')).show(1)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|2.1503829673811126|    y| true|
+------------------+-----+-----+
only showing top 1 row



- Find the lowest n value.

In [57]:
df.sort(asc('n')).show(1)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|-1.261605945319069|    y|false|
+------------------+-----+-----+
only showing top 1 row



- Find the average n value.

In [59]:
df.select(mean('n')).show()

+-------------------+
|             avg(n)|
+-------------------+
|0.36640264498852165|
+-------------------+



- Use concat to change the group column to say, e.g. "Group: x" or "Group: y"

In [63]:
df.select(concat(lit('Group:'), (df.group))).show(5)

+---------------------+
|concat(Group:, group)|
+---------------------+
|              Group:z|
|              Group:x|
|              Group:z|
|              Group:y|
|              Group:z|
+---------------------+
only showing top 5 rows



- Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [66]:
df.select(concat((df.group), lit(':'), (df.n))).show(5)

+--------------------+
| concat(group, :, n)|
+--------------------+
|z:-0.712390662050588|
| x:0.753766378659703|
|z:-0.044503078338...|
|y:0.4518123387457...|
|z:1.3451017084510097|
+--------------------+
only showing top 5 rows



## 5. When / Otherwise

- Use the starter code above to re-create a spark dataframe.

In [67]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false"" when abool is false.

In [76]:
df.select((when(df.abool == True, 1).alias('It is true')), (when(df.abool == False, 1).alias('It is false'))).show(5)

+----------+-----------+
|It is true|It is false|
+----------+-----------+
|      null|          1|
|      null|          1|
|      null|          1|
|      null|          1|
|      null|          1|
+----------+-----------+
only showing top 5 rows



- Create a column that contains 0 if n is less than 0, otherwise, the original n value.

In [77]:
df.select(when(df.n < 0, 0).otherwise(df.n).alias('n')).show(5)

+-------------------+
|                  n|
+-------------------+
|                0.0|
|  0.753766378659703|
|                0.0|
|0.45181233874578974|
| 1.3451017084510097|
+-------------------+
only showing top 5 rows



## 6. Filter / Where

- Use the starter code above to re-create a spark dataframe.

In [78]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Use .filter or .where to select just the rows where the group is y and view the results.

In [79]:
df.where(df.group == 'y').show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.45181233874578974|    y|false|
| 0.5323378882945463|    y|false|
|-1.0453771305385342|    y| true|
| -1.261605945319069|    y|false|
| 0.5628467852810314|    y| true|
+-------------------+-----+-----+
only showing top 5 rows



- Select just the columns where the abool column is false and view the results.

In [80]:
df.where(df.abool == False).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Find the columns where the group column is not y.

In [81]:
df.where(df.group != 'y').show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Find the columns where n is positive.

In [82]:
df.where(df.n > 0).show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
+-------------------+-----+-----+
only showing top 5 rows



- Find the columns where abool is true and the group column is z.

In [83]:
df.where(df.abool == True).where(df.group == 'z').show(5)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



- Find the columns where abool is false and n is less than 1

In [85]:
df.where(df.abool == False).where(df.n < 1).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
+--------------------+-----+-----+
only showing top 5 rows



- Find the columns where abool is false or n is less than 1

In [87]:
df.filter((df.abool == False) | (df.n < 1)).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



## 7. Sorting

- Use the starter code above to re-create a spark dataframe.

In [88]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



- Sort by the n value.

In [90]:
df.sort(df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|-0.02677164998644...|    x| true|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  0.9137407048596775|    y|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



- Sort by the group value, both ascending and descending.

In [92]:
df.sort(asc(df.group)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
|  0.6062886568962988|    x|false|
| 0.31735092273633597|    x|false|
|-0.24332625188556253|    y| true|
| -1.0453771305385342|    y| true|
|  0.5323378882945463|    y|false|
|  0.9137407048596775|    y|false|
|  0.5628467852810314|    y| true|
| 0.45181233874578974|    y|false|
|  -1.261605945319069|    y|false|
|  2.1503829673811126|    y| true|
|  1.3501878997225267|    z|false|
|  -0.712390662050588|    z|false|
|  1.4786857374358966|    z| true|
| 0.12730328020698067|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+



In [93]:
df.sort(desc(df.group)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|-0.04450307833805...|    z|false|
|  -0.712390662050588|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  0.5628467852810314|    y| true|
|  -1.261605945319069|    y|false|
| 0.45181233874578974|    y|false|
|  2.1503829673811126|    y| true|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



- Sort by the group value first, then, within each group, sort by n value.

In [96]:
df.sort(asc(df.group), asc(df.n)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+



- Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?

In [97]:
df.sort(df.abool, df.group, df.n).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.9137407048596775|    y|false|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|-0.02677164998644...|    x| true|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
|  0.5628467852810314|    y| true|
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+

