In [1]:
import pandas as pd
import numpy as np
import pyspark
import pydataset
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when
from vega_datasets import data
from pyspark.sql.functions import month, year, quarter
from pyspark.sql.functions import *

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/23 10:44:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/23 10:44:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


1. Spark Dataframe Basics

    1. Use the starter code above to create a pandas dataframe.
    1. Convert the pandas dataframe to a spark dataframe. From this point
       forward, do all of your work with the spark dataframe, not the pandas
       dataframe.
    1. Show the first 3 rows of the dataframe.
    1. Show the first 7 rows of the dataframe.
    1. What is the difference between `.show` and `.head`?
    1. View a summary of the data using `.describe`.
    1. Use `.select` to create a new dataframe with just the `n` and `abool`
       columns. View the first 5 rows of this dataframe.
    1. Use `.select` to create a new dataframe with just the `group` and `abool`
       columns. View the first 5 rows of this dataframe.
    1. Use `.select` to create a new dataframe with the `group` column and the
       `abool` column renamed to `a_boolean_value`. Show the first 3 rows of
       this dataframe.
    1. Use `.select` to create a new dataframe with the `group` column and the
       `n` column renamed to `a_numeric_value`. Show the first 6 rows of this
       dataframe.

In [3]:
# B. 
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [4]:
# C. 
df.show(3)

                                                                                

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [5]:
# D. 
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [6]:
# E. 
df.head()

Row(n=-0.712390662050588, group='z', abool=False)

In [7]:
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [8]:
# F.
df.describe().show()

[Stage 8:>                                                          (0 + 8) / 8]

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



                                                                                

In [9]:
# G.
df.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [10]:
# H. 
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [11]:
# I. 
df.select('group', df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [12]:
# J. 
df.select('group', df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



2. Column Manipulation

    1. Use the starter code above to re-create a spark dataframe. Store the
       spark dataframe in a varaible named `df`

    1. Use `.select` to add 4 to the `n` column. Show the results.

    1. Subtract 5 from the `n` column and view the results.

    1. Multiply the `n` column by 2. View the results along with the original
       numbers.

    1. Add a new column named `n2` that is the `n` value multiplied by -1. Show
       the first 4 rows of your dataframe. You should see the original `n` value
       as well as `n2`.

    1. Add a new column named `n3` that is the n value squared. Show the first 5
       rows of your dataframe. You should see both `n`, `n2`, and `n3`.

    1. What happens when you run the code below?

        ```python
        df.group + df.abool
        ```

    1. What happens when you run the code below? What is the difference between
       this and the previous code sample?

        ```python
        df.select(df.group + df.abool)
        ```

    1. Try adding various other columns together. What are the results of
       combining the different data types?


In [13]:
# B. 
df.select(df.n + 4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
+------------------+
only showing top 5 rows



In [14]:
# C. 
df.select(df.n - 5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
+-------------------+
only showing top 5 rows



In [15]:
# D. 
df.select(df.n, df.n * 2).show(5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
+--------------------+--------------------+
only showing top 5 rows



In [16]:
# E.
df = df.select('group', 'abool', 'n', (df.n * -1).alias('n2'))
df.show(5)

+-----+-----+--------------------+--------------------+
|group|abool|                   n|                  n2|
+-----+-----+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|
|    x|false|   0.753766378659703|  -0.753766378659703|
|    z|false|-0.04450307833805...|0.044503078338053455|
|    y|false| 0.45181233874578974|-0.45181233874578974|
|    z|false|  1.3451017084510097| -1.3451017084510097|
+-----+-----+--------------------+--------------------+
only showing top 5 rows



In [17]:
# F. 
df = df.select('*', (df.n ** 2).alias('n3'))
df.show(5)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [18]:
# G. 
df.group + df.abool

Column<'(group + abool)'>

In [19]:
# H.
df.select(df.group + df.abool)
# it creates an error due to incompatabilities between dtypes

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias((cast(group#1 as double) + abool#2), Some(org.apache.spark.sql.Column$$Lambda$3323/0x00000008012ae040@610d3a3b))]
+- Project [group#1, abool#2, n#0, n2#244, POWER(n#0, cast(2 as double)) AS n3#266]
   +- Project [group#1, abool#2, n#0, (n#0 * cast(-1 as double)) AS n2#244]
      +- LogicalRDD [n#0, group#1, abool#2], false


In [20]:
# I. 
df.select(df.group + df.n).show(5)

+-----------+
|(group + n)|
+-----------+
|       null|
|       null|
|       null|
|       null|
|       null|
+-----------+
only showing top 5 rows



In [21]:
df.select(df.n3 + df.n).show(5)

+--------------------+
|            (n3 + n)|
+--------------------+
|-0.20489020667371294|
|  1.3219301322574657|
|-0.04252255435649...|
|    0.65594672818873|
|   3.154400314528835|
+--------------------+
only showing top 5 rows



In [22]:
df.select(df.n2 + df.n3).show(5)

+--------------------+
|           (n2 + n3)|
+--------------------+
|   1.219891117427463|
|-0.18560262506194025|
|0.046483602319616374|
|-0.24767794930284948|
|  0.4641968976268154|
+--------------------+
only showing top 5 rows



*Data types need to match.*

3. Type casting

    1. Use the starter code above to re-create a spark dataframe.

    1. Use `.printSchema` to view the datatypes in your dataframe.

    1. Use `.dtypes` to view the datatypes in your dataframe.

    1. What is the difference between the two code samples below?

        ```python
        df.abool.cast('int')
        ```

        ```python
        df.select(df.abool.cast('int')).show()
        ```

    1. Use `.select` and `.cast` to convert the `abool` column to an integer
       type. View the results.
    1. Convert the `group` column to a integer data type and view the results.
       What happens?
    1. Convert the `n` column to a integer data type and view the results. What
       happens?
    1. Convert the `abool` column to a string data type and view the results.
       What happens?

In [23]:
# B. 
df.printSchema()

root
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)
 |-- n: double (nullable = true)
 |-- n2: double (nullable = true)
 |-- n3: double (nullable = true)



In [24]:
# C.
df.dtypes

[('group', 'string'),
 ('abool', 'boolean'),
 ('n', 'double'),
 ('n2', 'double'),
 ('n3', 'double')]

In [25]:
# D. 
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [26]:
# Does the same thing as other line but displays the results
df.select(df.abool.cast('int')).show() 

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



In [27]:
# E. 
df.select(df.abool.cast('int')).show(5)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [28]:
# F. 
df.select(df.group.cast('int')).show(5)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows



In [29]:
# G.
df.select(df.n, df.n.cast('int').alias('n_as_int')).show(5)

+--------------------+--------+
|                   n|n_as_int|
+--------------------+--------+
|  -0.712390662050588|       0|
|   0.753766378659703|       0|
|-0.04450307833805...|       0|
| 0.45181233874578974|       0|
|  1.3451017084510097|       1|
+--------------------+--------+
only showing top 5 rows



In [30]:
# H.
df.select(df.abool, df.abool.cast('string').alias('abool_as_string')).show(5)

+-----+---------------+
|abool|abool_as_string|
+-----+---------------+
|false|          false|
|false|          false|
|false|          false|
|false|          false|
|false|          false|
+-----+---------------+
only showing top 5 rows



4. Built-in Functions

    1. Use the starter code above to re-create a spark dataframe.
    1. Import the necessary functions from `pyspark.sql.functions`
    1. Find the highest `n` value.
    1. Find the lowest `n` value.
    1. Find the average `n` value.
    1. Use `concat` to change the `group` column to say, e.g. "Group: x" or
       "Group: y"
    1. Use `concat` to combine the `n` and `group` columns to produce results
       that look like this: "x: -1.432" or "z: 2.352"

In [33]:
# C. 
df.select(max(df.n)).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



In [34]:
# D. 
df.select(min(df.n)).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



In [35]:
# E. 
df.select(mean(df.n)).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



In [36]:
# F. 
df.select(concat(lit('Group:'), df.group).alias('new_group')).show(5)

+---------+
|new_group|
+---------+
|  Group:z|
|  Group:x|
|  Group:z|
|  Group:y|
|  Group:z|
+---------+
only showing top 5 rows



In [37]:
# G. 
df.select(concat(df.group, lit(':'), df.n).alias('new_group')).show(5)

+--------------------+
|           new_group|
+--------------------+
|z:-0.712390662050588|
| x:0.753766378659703|
|z:-0.044503078338...|
|y:0.4518123387457...|
|z:1.3451017084510097|
+--------------------+
only showing top 5 rows



5. When / Otherwise

    1. Use the starter code above to re-create a spark dataframe.
    1. Use `when` and `.otherwise` to create a column that contains the text "It
       is true" when `abool` is true and "It is false"" when `abool` is false.
    1. Create a column that contains 0 if n is less than 0, otherwise, the
       original n value.

In [38]:
# B. 
df.select(
    'abool',
    (when(df.abool > True, 'It is true')
     .otherwise('It is false')
     .alias('abool_desc'))
).show(5)

+-----+-----------+
|abool| abool_desc|
+-----+-----------+
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
|false|It is false|
+-----+-----------+
only showing top 5 rows



In [40]:
df.select(
    'n',
    (when(df.n < 0, 0)
     .otherwise(df.n)
     .alias('less_than_0'))
).show(5)

+--------------------+-------------------+
|                   n|        less_than_0|
+--------------------+-------------------+
|  -0.712390662050588|                0.0|
|   0.753766378659703|  0.753766378659703|
|-0.04450307833805...|                0.0|
| 0.45181233874578974|0.45181233874578974|
|  1.3451017084510097| 1.3451017084510097|
+--------------------+-------------------+
only showing top 5 rows



6. Filter / Where

    1. Use the starter code above to re-create a spark dataframe.
    1. Use `.filter` or `.where` to select just the rows where the group is `y`
       and view the results.
    1. Select just the columns where the `abool` column is false and view the
       results.
    1. Find the columns where the `group` column is *not* `y`.
    1. Find the columns where `n` is positive.
    1. Find the columns where `abool` is true and the `group` column is `z`.
    1. Find the columns where `abool` is true or the `group` column is `z`.
    1. Find the columns where `abool` is false and `n` is less than 1
    1. Find the columns where `abool` is false or `n` is less than 1

In [41]:
# B. 
df.where(df.group == 'y').show()

+-----+-----+--------------------+--------------------+-------------------+
|group|abool|                   n|                  n2|                 n3|
+-----+-----+--------------------+--------------------+-------------------+
|    y|false| 0.45181233874578974|-0.45181233874578974|0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463| 0.2833836273138969|
|    y| true| -1.0453771305385342|  1.0453771305385342| 1.0928133450529796|
|    y|false|  -1.261605945319069|   1.261605945319069|  1.591649561264422|
|    y| true|  0.5628467852810314| -0.5628467852810314|0.31679650370119145|
|    y| true|-0.24332625188556253| 0.24332625188556253|0.05920766485667622|
|    y|false|  0.9137407048596775| -0.9137407048596775| 0.8349220757174602|
|    y| true|  2.1503829673811126| -2.1503829673811126|  4.624146906402799|
+-----+-----+--------------------+--------------------+-------------------+



In [42]:
# C. 
df.where(df.abool == 'false').show()

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|

In [43]:
# D. 
df.filter(df.group != 'y').show(5)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [44]:
# E. 
df.filter(df.n > 0).show(5)

+-----+-----+-------------------+--------------------+-------------------+
|group|abool|                  n|                  n2|                 n3|
+-----+-----+-------------------+--------------------+-------------------+
|    x|false|  0.753766378659703|  -0.753766378659703| 0.5681637535977627|
|    y|false|0.45181233874578974|-0.45181233874578974|0.20413438944294027|
|    z|false| 1.3451017084510097| -1.3451017084510097| 1.8092986060778251|
|    y|false| 0.5323378882945463| -0.5323378882945463| 0.2833836273138969|
|    z|false| 1.3501878997225267| -1.3501878997225267| 1.8230073645571279|
+-----+-----+-------------------+--------------------+-------------------+
only showing top 5 rows



In [45]:
# F
df.filter((df.abool == True) & (df.group == 'z')).show()

+-----+-----+------------------+-------------------+------------------+
|group|abool|                 n|                 n2|                n3|
+-----+-----+------------------+-------------------+------------------+
|    z| true|1.4786857374358966|-1.4786857374358966|2.1865115100963415|
+-----+-----+------------------+-------------------+------------------+



In [46]:
# G. 
df.filter((df.abool) | (df.group == 'z')).show(5)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
|    z| true|  1.4786857374358966| -1.4786857374358966|  2.1865115100963415|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [47]:
# H. 
df.filter((~ df.abool) & (df.n < 1)).show(5)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [48]:
# I. 
df.filter((~ df.abool) & (df.n < 1)).show(5)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



7. Sorting

    1. Use the starter code above to re-create a spark dataframe.
    1. Sort by the `n` value.
    1. Sort by the `group` value, both ascending and descending.
    1. Sort by the group value first, then, within each group, sort by `n`
       value.
    1. Sort by `abool`, `group`, and `n`. Does it matter in what order you
       specify the columns when sorting?

In [49]:
# B. 
df.sort('n').show(5)

                                                                                

+-----+-----+--------------------+-------------------+-------------------+
|group|abool|                   n|                 n2|                 n3|
+-----+-----+--------------------+-------------------+-------------------+
|    y|false|  -1.261605945319069|  1.261605945319069|  1.591649561264422|
|    y| true| -1.0453771305385342| 1.0453771305385342| 1.0928133450529796|
|    x|false| -0.7889890249515489| 0.7889890249515489| 0.6225036814939958|
|    z|false|  -0.712390662050588|  0.712390662050588|  0.507500455376875|
|    y| true|-0.24332625188556253|0.24332625188556253|0.05920766485667622|
+-----+-----+--------------------+-------------------+-------------------+
only showing top 5 rows



In [50]:
# C.
df.sort('group').show(10) # ascending

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x| true|-0.02677164998644...|0.026771649986440726|7.167212429964917E-4|
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    y| true|  0.5628467852810314| -0.5628467852810314| 0.31679650370119145|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y| true| -1.0453771305385342|  1.0453771305385342|  1.0928133450529796|

In [51]:
df.sort('group', ascending=False).show(10) # descending

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    z|false| 0.12730328020698067|-0.12730328020698067|0.016206125151457036|
|    z| true|  1.4786857374358966| -1.4786857374358966|  2.1865115100963415|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y| true|  2.1503829673811126| -2.1503829673811126|   4.624146906402799|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|

In [52]:
# D. 
df.sort('group', 'n').show()

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x| true|-0.02677164998644...|0.026771649986440726|7.167212429964917E-4|
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y| true| -1.0453771305385342|  1.0453771305385342|  1.0928133450529796|
|    y| true|-0.24332625188556253| 0.24332625188556253| 0.05920766485667622|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|

In [53]:
# E. 
df.sort('abool', 'group', 'n').show()
## Yes the order matters

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    y|false|  0.9137407048596775| -0.9137407048596775|  0.8349220757174602|
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|

8. Aggregating

    1. What is the average `n` value for each group in the `group` column?
    1. What is the maximum `n` value for each group in the `group` column?
    1. What is the minimum `n` value by `abool`?
    1. What is the average `n` value for each unique combination of the `group`
       and `abool` column?

In [54]:
# A. 
df.groupby('group').agg(mean('n')).show()

[Stage 89:>                                                         (0 + 8) / 8]

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    x|0.2871427762539448|
|    z| 0.590730814237962|
|    y| 0.257601419602374|
+-----+------------------+



                                                                                

In [55]:
# B. 
df.groupby('group').agg(max('n')).show()

[Stage 92:>                                                         (0 + 8) / 8]

+-----+------------------+
|group|            max(n)|
+-----+------------------+
|    x|0.8612113741693206|
|    z|1.4786857374358966|
|    y|2.1503829673811126|
+-----+------------------+



                                                                                

In [56]:
# C. 
df.groupby('abool').agg(min('n')).show()

[Stage 95:>                                                         (0 + 8) / 8]

+-----+-------------------+
|abool|             min(n)|
+-----+-------------------+
|false| -1.261605945319069|
| true|-1.0453771305385342|
+-----+-------------------+



                                                                                

In [57]:
# D. 
df.groupby('group', 'abool').agg(mean('n')).show()

[Stage 98:>                                                         (0 + 8) / 8]

+-----+-----+--------------------+
|group|abool|              avg(n)|
+-----+-----+--------------------+
|    z|false| 0.41313982959837514|
|    x|false|  0.3499256615020219|
|    y|false| 0.15907124664523611|
|    y| true| 0.35613159255951177|
|    z| true|  1.4786857374358966|
|    x| true|-0.02677164998644...|
+-----+-----+--------------------+



                                                                                

9. Spark SQL

    1. Use the starter code above to re-create a spark dataframe.
    1. Turn your dataframe into a table that can be queried with spark SQL. Name
       the table `my_df`. Answer the rest of the questions in this section with
       a spark sql query (`spark.sql`) against `my_df`. After each step, view
       the first 7 records from the dataframe.
    1. What happens if you make a SQL syntax error in your query?
    1. Write a query that shows all of the columns from your dataframe.
    1. Write a query that shows just the `n` and `abool` columns from the
       dataframe.
    1. Write a query that shows just the `n` and `group` columns. Rename the
       `group` column to `g`.
    1. Write a query that selects `n`, and creates two new columns: `n2`, the
       original `n` values halved, and `n3`: the original n values minus 1.

In [58]:
# B. 
df.createOrReplaceTempView('my_df')

In [66]:
# C. 
spark.sql('''SELECT n, n2n3, FROM my_df''')

AnalysisException: cannot resolve 'n' given input columns: []; line 1 pos 7;
'Project ['n, 'n2n3, 'FROM AS my_df#834]
+- OneRowRelation


In [67]:
# D. 
spark.sql('''SELECT * FROM my_df''').show(7)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 7 rows



In [68]:
# E. 
spark.sql('''SELECT n, abool FROM my_df''').show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
+--------------------+-----+
only showing top 7 rows



In [69]:
# F.
spark.sql('''SELECT n, group as g FROM my_df''').show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
+--------------------+---+
only showing top 7 rows



In [70]:
# G. 
spark.sql('''SELECT n, 
                    n / 2 as n2,
                    n - 1 as n3
                FROM my_df'''
         ).show(7)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
+--------------------+--------------------+--------------------+
only showing top 7 rows

