# Spark API Mini Exercises

In [26]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

import pandas as pd
import numpy as np

from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when


## 1. Spark Dataframe Basics

In [4]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

In [5]:
#ii. Convert the pandas dataframe to a spark dataframe. 
#From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [6]:
#iii. Show the first 3 rows of the dataframe.
df.show(3)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  1.345815683152521|    y|false|
|-1.0452821799941256|    x|false|
| 0.9737660452795452|    x|false|
+-------------------+-----+-----+
only showing top 3 rows



In [7]:
#iv. Show the first 7 rows of the dataframe.
df.show(7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  1.345815683152521|    y|false|
|-1.0452821799941256|    x|false|
| 0.9737660452795452|    x|false|
| -0.867856173967037|    z| true|
|-0.5346956316913865|    x| true|
| 0.5058042271556487|    z|false|
| 0.8595277934341654|    x| true|
+-------------------+-----+-----+
only showing top 7 rows



In [9]:
#v. View a summary of the data using .describe.
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean| 0.3743549326714791| null|
| stddev| 0.8972092768502807| null|
|    min|-1.0452821799941256|    x|
|    max|  1.989506080109543|    z|
+-------+-------------------+-----+



In [10]:
#vi. Use .select to create a new dataframe with just the n and abool columns. 
#View the first 5 rows of this dataframe.
df.select('n', 'abool').show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
|  1.345815683152521|false|
|-1.0452821799941256|false|
| 0.9737660452795452|false|
| -0.867856173967037| true|
|-0.5346956316913865| true|
+-------------------+-----+
only showing top 5 rows



In [11]:
#vii. Use .select to create a new dataframe with just the group and abool columns. 
#View the first 5 rows of this dataframe.
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    y|false|
|    x|false|
|    x|false|
|    z| true|
|    x| true|
+-----+-----+
only showing top 5 rows



In [14]:
#viii. Use .select to create a new dataframe with the group column and 
# the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    y|          false|
|    x|          false|
|    x|          false|
+-----+---------------+
only showing top 3 rows



In [15]:
#ix. Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. 
#Show the first 6 rows of this dataframe.
df.select(df.group, df.n.alias("a_numeric_value")).show(6)

+-----+-------------------+
|group|    a_numeric_value|
+-----+-------------------+
|    y|  1.345815683152521|
|    x|-1.0452821799941256|
|    x| 0.9737660452795452|
|    z| -0.867856173967037|
|    x|-0.5346956316913865|
|    z| 0.5058042271556487|
+-----+-------------------+
only showing top 6 rows



## 2. Column Manipulation

In [38]:
#ii. Use .select to add 4 to the n column. Show the results.
df.select((df.n.cast('double')+4).alias('n+4')).show()

+------------------+
|               n+4|
+------------------+
| 5.345815683152521|
|2.9547178200058744|
| 4.973766045279545|
| 3.132143826032963|
|3.4653043683086135|
| 4.505804227155648|
| 4.859527793434165|
|3.2621761628784087|
| 5.190579458205743|
| 4.804028794242071|
| 5.261199900778375|
|4.0168779323326165|
| 5.229310846388251|
|3.7614279258236563|
| 3.589302243524058|
| 5.407331809584625|
|5.9895060801095426|
| 3.615525094232796|
| 4.362773297125474|
|3.7599793448346346|
+------------------+



In [40]:
#iii.Subtract 5 from the n column and view the results.
df.select((df.n-5).alias('n-5')).show()

+-------------------+
|                n-5|
+-------------------+
| -3.654184316847479|
| -6.045282179994126|
| -4.026233954720455|
| -5.867856173967037|
|-5.5346956316913865|
| -4.494195772844352|
| -4.140472206565835|
| -5.737823837121591|
|-3.8094205417942573|
| -4.195971205757929|
| -3.738800099221625|
|-4.9831220676673835|
| -3.770689153611749|
| -5.238572074176344|
| -5.410697756475942|
|-3.5926681904153748|
| -3.010493919890457|
| -5.384474905767204|
| -4.637226702874526|
| -5.240020655165365|
+-------------------+



In [41]:
#iv. Multiply the n column by 2. View the results along with the original numbers.
df.select(df.n, (df.n*2).alias('n*2')).show()

+--------------------+-------------------+
|                   n|                n*2|
+--------------------+-------------------+
|   1.345815683152521|  2.691631366305042|
| -1.0452821799941256|-2.0905643599882513|
|  0.9737660452795452| 1.9475320905590905|
|  -0.867856173967037| -1.735712347934074|
| -0.5346956316913865| -1.069391263382773|
|  0.5058042271556487| 1.0116084543112973|
|  0.8595277934341654| 1.7190555868683308|
| -0.7378238371215912|-1.4756476742431823|
|  1.1905794582057427| 2.3811589164114855|
|  0.8040287942420706| 1.6080575884841413|
|   1.261199900778375|   2.52239980155675|
|0.016877932332616804|0.03375586466523361|
|  1.2293108463882507| 2.4586216927765014|
| -0.2385720741763439|-0.4771441483526878|
|-0.41069775647594214|-0.8213955129518843|
|   1.407331809584625|   2.81466361916925|
|   1.989506080109543|  3.979012160219086|
| -0.3844749057672039|-0.7689498115344078|
| 0.36277329712547385| 0.7255465942509477|
| -0.2400206551653653|-0.4800413103307306|
+----------

In [64]:
#v. Add a new column named n2 that is the n value multiplied by -1. 
#Show the first 4 rows of your dataframe. 
#You should see the original n value as well as n2.
n2 = (df.n*-1).alias('n2')
df = df.select('*', n2)
df.show(4)

+-------------------+-----+-----+-------------------+
|                  n|group|abool|                 n2|
+-------------------+-----+-----+-------------------+
|  1.345815683152521|    y|false| -1.345815683152521|
|-1.0452821799941256|    x|false| 1.0452821799941256|
| 0.9737660452795452|    x|false|-0.9737660452795452|
| -0.867856173967037|    z| true|  0.867856173967037|
+-------------------+-----+-----+-------------------+
only showing top 4 rows



In [66]:
#vi. Add a new column named n3 that is the n value squared. 
#Show the first 5 rows of your dataframe. You should see both n, n2, and n3.
n3 = (df.n**2).alias('n3')
df = df.select('*', n3)
df.show(5)

+-------------------+-----+-----+-------------------+------------------+
|                  n|group|abool|                 n2|                n3|
+-------------------+-----+-----+-------------------+------------------+
|  1.345815683152521|    y|false| -1.345815683152521|1.8112198530192867|
|-1.0452821799941256|    x|false| 1.0452821799941256|1.0926148358132717|
| 0.9737660452795452|    x|false|-0.9737660452795452|0.9482203109393653|
| -0.867856173967037|    z| true|  0.867856173967037|0.7531743386927039|
|-0.5346956316913865|    x| true| 0.5346956316913865|0.2858994185498508|
+-------------------+-----+-----+-------------------+------------------+
only showing top 5 rows



In [67]:
#vii. What happens when you run the code below?
df.group + df.abool

Column<'(group + abool)'>

In [68]:
df.show()

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|   1.345815683152521|    y|false|  -1.345815683152521|  1.8112198530192867|
| -1.0452821799941256|    x|false|  1.0452821799941256|  1.0926148358132717|
|  0.9737660452795452|    x|false| -0.9737660452795452|  0.9482203109393653|
|  -0.867856173967037|    z| true|   0.867856173967037|  0.7531743386927039|
| -0.5346956316913865|    x| true|  0.5346956316913865|  0.2858994185498508|
|  0.5058042271556487|    z|false| -0.5058042271556487| 0.25583791620852303|
|  0.8595277934341654|    x| true| -0.8595277934341654|  0.7387880276858053|
| -0.7378238371215912|    x| true|  0.7378238371215912|  0.5443840146248283|
|  1.1905794582057427|    x| true| -1.1905794582057427|    1.41747944630148|
|  0.8040287942420706|    z|false| -0.8040287942420706|  0.6464623019703579|

In [75]:
#viii. What happens when you run the code below? 
#What is the difference between this and the previous code sample?

#df.select(df.group + df.abool)

In [70]:
#ix. Try adding various other columns together. 
#What are the results of combining the different data types?
df.select(df.n + df.group)

DataFrame[(n + group): double]

In [71]:
df.n + df.abool

Column<'(n + abool)'>

In [72]:
df.n2 + df.n3

Column<'(n3 + n3)'>

In [74]:
df.select(df.n2 + df.n3).show()

+--------------------+
|           (n2 + n3)|
+--------------------+
|  0.4654041698667657|
|  2.1378970158073973|
|-0.02554573434017...|
|  1.6210305126597409|
|  0.8205950502412372|
|-0.24996631094712563|
| -0.1207397657483601|
|  1.2822078517464195|
|  0.2268999880957372|
| -0.1575664922717127|
|  0.3294252889450078|
|-0.01659306773279...|
|  0.2818943106595466|
| 0.29548870875314687|
|  0.5793704036503144|
|  0.5732510126841102|
|   1.968628362683296|
|  0.5322958589319042|
|-0.23116883201818653|
|  0.2976305700713765|
+--------------------+



## 3. Type casting

In [76]:
#i. Use the starter code above to re-create a spark dataframe.
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [77]:
#ii. Use .printSchema to view the datatypes in your dataframe.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [80]:
#iii. Use .dtypes to view the datatypes in your dataframe.
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [81]:
#iv. What is the difference between the two code samples below?
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [93]:
df.select(df.abool.cast('int')).show(4)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    1|
+-----+
only showing top 4 rows



**The first example creates a boolean column but does not fill with values. The second fills the boolean values with their respective numeric values**

In [91]:
#v. Use .select and .cast to convert the abool column to an integer type. View the results.
df.select(df.abool.cast('int')).show(4)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    1|
+-----+
only showing top 4 rows



In [88]:
#vi. Convert the group column to a integer data type and view the results. What happens?
df.select(df.group.cast('int')).show(2)

+-----+
|group|
+-----+
| null|
| null|
+-----+
only showing top 2 rows



In [87]:
#vii. Convert the n column to a integer data type and view the results. What happens?
df.select(df.n.cast('int')).show(2)

+---+
|  n|
+---+
|  1|
| -1|
+---+
only showing top 2 rows



In [96]:
#viii. Convert the abool column to a string data type and view the results. What happens?
df.select(df.n.cast('string')).show(3)

+-------------------+
|                  n|
+-------------------+
|  1.345815683152521|
|-1.0452821799941256|
| 0.9737660452795452|
+-------------------+
only showing top 3 rows



## 4. Built-in Functions

In [107]:
#ii. Import the necessary functions from pyspark.sql.functions
from pyspark.sql.functions import asc, desc, col


In [108]:
#iii. Find the highest n value.
df.sort(col('n').desc()).show(1)

+-----------------+-----+-----+
|                n|group|abool|
+-----------------+-----+-----+
|1.989506080109543|    y|false|
+-----------------+-----+-----+
only showing top 1 row



In [109]:
#iv. Find the lowest n value.
df.sort(col('n').asc()).show(1)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-1.0452821799941256|    x|false|
+-------------------+-----+-----+
only showing top 1 row



In [110]:
#v. Find the average n value.
df.select(mean(col('n'))).show()

+------------------+
|            avg(n)|
+------------------+
|0.3743549326714791|
+------------------+



In [None]:
#vi. Use concat to change the group column to say, e.g. "Group: x" or "Group: y"
df.

In [111]:
#vii. Use concat to combine the n and group columns to produce results that look like this: 
#"x: -1.432" or "z: 2.352"

## 5. When / Otherwise

In [112]:
from pyspark.sql.functions import when

In [127]:
#ii.Use when and .otherwise to create a column that contains the text "It is true" when abool is true and 
#"It is false"" when abool is false.
df.select(
    df.abool, when(df.abool == 'true', 'It is true').otherwise('It is false').alias('bool_desc')).show()


+-----+-----------+
|abool|  bool_desc|
+-----+-----------+
|false|It is false|
|false|It is false|
|false|It is false|
| true| It is true|
| true| It is true|
|false|It is false|
| true| It is true|
| true| It is true|
| true| It is true|
|false|It is false|
| true| It is true|
|false|It is false|
| true| It is true|
|false|It is false|
|false|It is false|
| true| It is true|
|false|It is false|
|false|It is false|
| true| It is true|
| true| It is true|
+-----+-----------+



In [129]:
#iii. Create a column that contains 0 if n is less than 0, otherwise, the original n value.
df.select(
    df.n,
    when(df.n < 0, 0)
    .otherwise(df.n).alias('+n')).show()

+--------------------+--------------------+
|                   n|                  +n|
+--------------------+--------------------+
|   1.345815683152521|   1.345815683152521|
| -1.0452821799941256|                 0.0|
|  0.9737660452795452|  0.9737660452795452|
|  -0.867856173967037|                 0.0|
| -0.5346956316913865|                 0.0|
|  0.5058042271556487|  0.5058042271556487|
|  0.8595277934341654|  0.8595277934341654|
| -0.7378238371215912|                 0.0|
|  1.1905794582057427|  1.1905794582057427|
|  0.8040287942420706|  0.8040287942420706|
|   1.261199900778375|   1.261199900778375|
|0.016877932332616804|0.016877932332616804|
|  1.2293108463882507|  1.2293108463882507|
| -0.2385720741763439|                 0.0|
|-0.41069775647594214|                 0.0|
|   1.407331809584625|   1.407331809584625|
|   1.989506080109543|   1.989506080109543|
| -0.3844749057672039|                 0.0|
| 0.36277329712547385| 0.36277329712547385|
| -0.2400206551653653|          

## 6. Filter / Where

In [130]:
#ii. Use .filter or .where to select just the rows where the group is y and view the results.
df.filter(df.group == 'y').show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|   1.345815683152521|    y|false|
|0.016877932332616804|    y|false|
|  1.2293108463882507|    y| true|
|   1.989506080109543|    y|false|
| -0.3844749057672039|    y|false|
+--------------------+-----+-----+



In [138]:
#iii. Select just the columns where the abool column is false and view the results.
df.filter(df.abool == 'false').show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  1.345815683152521|    y|false|
|-1.0452821799941256|    x|false|
| 0.9737660452795452|    x|false|
| 0.5058042271556487|    z|false|
| 0.8040287942420706|    z|false|
+-------------------+-----+-----+
only showing top 5 rows



In [139]:
#iv. Find the columns where the group column is not y.
df.filter(df.abool != 'y').show(5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  1.345815683152521|    y|false|
|-1.0452821799941256|    x|false|
| 0.9737660452795452|    x|false|
| 0.5058042271556487|    z|false|
| 0.8040287942420706|    z|false|
+-------------------+-----+-----+
only showing top 5 rows



In [140]:
#v. Find the columns where n is positive.
df.filter(df.n > 0).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|   1.345815683152521|    y|false|
|  0.9737660452795452|    x|false|
|  0.5058042271556487|    z|false|
|  0.8595277934341654|    x| true|
|  1.1905794582057427|    x| true|
|  0.8040287942420706|    z|false|
|   1.261199900778375|    z| true|
|0.016877932332616804|    y|false|
|  1.2293108463882507|    y| true|
|   1.407331809584625|    z| true|
|   1.989506080109543|    y|false|
| 0.36277329712547385|    x| true|
+--------------------+-----+-----+



In [141]:
#vi. Find the columns where abool is true and the group column is z.
df.where(df.abool == 'true').where(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|-0.867856173967037|    z| true|
| 1.261199900778375|    z| true|
| 1.407331809584625|    z| true|
+------------------+-----+-----+



In [143]:
#vii. Find the columns where abool is true or the group column is z.
df.where((df.abool == 'true') | (df.group == 'z')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.867856173967037|    z| true|
| -0.5346956316913865|    x| true|
|  0.5058042271556487|    z|false|
|  0.8595277934341654|    x| true|
| -0.7378238371215912|    x| true|
|  1.1905794582057427|    x| true|
|  0.8040287942420706|    z|false|
|   1.261199900778375|    z| true|
|  1.2293108463882507|    y| true|
|-0.41069775647594214|    z|false|
|   1.407331809584625|    z| true|
| 0.36277329712547385|    x| true|
| -0.2400206551653653|    x| true|
+--------------------+-----+-----+



In [144]:
#viii. Find the columns where abool is false and n is less than 1
df.where(df.abool == 'false').where(df.n < 1).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.0452821799941256|    x|false|
|  0.9737660452795452|    x|false|
|  0.5058042271556487|    z|false|
|  0.8040287942420706|    z|false|
|0.016877932332616804|    y|false|
| -0.2385720741763439|    x|false|
|-0.41069775647594214|    z|false|
| -0.3844749057672039|    y|false|
+--------------------+-----+-----+



In [145]:
#ix. Find the columns where abool is false or n is less than 1
df.where((df.abool == 'false') | (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|   1.345815683152521|    y|false|
| -1.0452821799941256|    x|false|
|  0.9737660452795452|    x|false|
|  -0.867856173967037|    z| true|
| -0.5346956316913865|    x| true|
|  0.5058042271556487|    z|false|
|  0.8595277934341654|    x| true|
| -0.7378238371215912|    x| true|
|  0.8040287942420706|    z|false|
|0.016877932332616804|    y|false|
| -0.2385720741763439|    x|false|
|-0.41069775647594214|    z|false|
|   1.989506080109543|    y|false|
| -0.3844749057672039|    y|false|
| 0.36277329712547385|    x| true|
| -0.2400206551653653|    x| true|
+--------------------+-----+-----+



## 7. Sorting

In [146]:
#ii. Sort by the n value.

In [147]:
df.sort(col('n').desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|   1.989506080109543|    y|false|
|   1.407331809584625|    z| true|
|   1.345815683152521|    y|false|
|   1.261199900778375|    z| true|
|  1.2293108463882507|    y| true|
|  1.1905794582057427|    x| true|
|  0.9737660452795452|    x|false|
|  0.8595277934341654|    x| true|
|  0.8040287942420706|    z|false|
|  0.5058042271556487|    z|false|
| 0.36277329712547385|    x| true|
|0.016877932332616804|    y|false|
| -0.2385720741763439|    x|false|
| -0.2400206551653653|    x| true|
| -0.3844749057672039|    y|false|
|-0.41069775647594214|    z|false|
| -0.5346956316913865|    x| true|
| -0.7378238371215912|    x| true|
|  -0.867856173967037|    z| true|
| -1.0452821799941256|    x|false|
+--------------------+-----+-----+



In [148]:
#iii. Sort by the group value, both ascending and descending.
df.sort(col('group').asc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.0452821799941256|    x|false|
|  0.9737660452795452|    x|false|
| -0.5346956316913865|    x| true|
| 0.36277329712547385|    x| true|
| -0.2400206551653653|    x| true|
|  0.8595277934341654|    x| true|
| -0.7378238371215912|    x| true|
| -0.2385720741763439|    x|false|
|  1.1905794582057427|    x| true|
|0.016877932332616804|    y|false|
|  1.2293108463882507|    y| true|
|   1.989506080109543|    y|false|
| -0.3844749057672039|    y|false|
|   1.345815683152521|    y|false|
|-0.41069775647594214|    z|false|
|  -0.867856173967037|    z| true|
|   1.407331809584625|    z| true|
|  0.8040287942420706|    z|false|
|  0.5058042271556487|    z|false|
|   1.261199900778375|    z| true|
+--------------------+-----+-----+



In [149]:
df.sort(col('group').desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.41069775647594214|    z|false|
|  0.8040287942420706|    z|false|
|  0.5058042271556487|    z|false|
|   1.261199900778375|    z| true|
|  -0.867856173967037|    z| true|
|   1.407331809584625|    z| true|
|  1.2293108463882507|    y| true|
|   1.345815683152521|    y|false|
|   1.989506080109543|    y|false|
| -0.3844749057672039|    y|false|
|0.016877932332616804|    y|false|
| -0.5346956316913865|    x| true|
| 0.36277329712547385|    x| true|
|  1.1905794582057427|    x| true|
| -1.0452821799941256|    x|false|
| -0.2400206551653653|    x| true|
|  0.8595277934341654|    x| true|
| -0.7378238371215912|    x| true|
|  0.9737660452795452|    x|false|
| -0.2385720741763439|    x|false|
+--------------------+-----+-----+



In [None]:
iv. 