In [1]:
# Execute the companion notebook in this kernel. Presumably it defines `users_df`,
# which the cells below use without defining it here — TODO confirm.
%run "./02 Creating Spark Data Frame to Select and Rename Columns.ipynb"



+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

                                                                                

In [2]:
# Project only the identifier and the two name columns, then print the rows.
(
    users_df
    .select('id', 'first_name', 'last_name')
    .show()
)

[Stage 2:>                                                          (0 + 1) / 1]

+---+----------+------------+
| id|first_name|   last_name|
+---+----------+------------+
|  1|    Corrie|Van den Oord|
|  2|  Nikolaus|     Brewitt|
|  3|    Orelie|      Penney|
|  4|     Ashby|    Maddocks|
|  5|      Kurt|        Rome|
+---+----------+------------+



                                                                                

* Concatenate `first_name` and `last_name` (separated by `, `) and alias the derived column as `full_name`.

In [3]:
from pyspark.sql.functions import concat, lit

In [4]:
# Derive `full_name` directly inside select(): join the two name columns with
# a literal ', ' separator and alias the resulting expression.
(
    users_df
    .select(
        'id',
        'first_name',
        'last_name',
        concat('first_name', lit(', '), 'last_name').alias('full_name'),
    )
    .show()
)

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [5]:
# Same result via withColumn(): project the base columns first, then append
# `full_name` as first_name + ', ' + last_name.
(
    users_df
    .select('id', 'first_name', 'last_name')
    .withColumn(
        'full_name',
        concat('first_name', lit(', '), 'last_name'),
    )
    .show()
)

+---+----------+------------+--------------------+
| id|first_name|   last_name|           full_name|
+---+----------+------------+--------------------+
|  1|    Corrie|Van den Oord|Corrie, Van den Oord|
|  2|  Nikolaus|     Brewitt|   Nikolaus, Brewitt|
|  3|    Orelie|      Penney|      Orelie, Penney|
|  4|     Ashby|    Maddocks|     Ashby, Maddocks|
|  5|      Kurt|        Rome|          Kurt, Rome|
+---+----------+------------+--------------------+



In [6]:
# Print the built-in documentation for DataFrame.withColumn.
help(users_df.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName, col) method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to add multiple columns can generate big
    plans which can cause performance issues and even `StackOverflowException`.
    To avoid this, use :func:`select` with the multiple columns at once.
    
    Examples
    

In [7]:
from pyspark.sql.functions import col

In [8]:
# Copy `first_name` into a new column named `fn`.
# The source column is referenced by name through col(), so it is resolved
# against the DataFrame returned by select() rather than through a Column
# object bound to the parent `users_df`.
(
    users_df
    .select('id', 'first_name', 'last_name')
    .withColumn('fn', col('first_name'))
    .show()
)

+---+----------+------------+--------+
| id|first_name|   last_name|      fn|
+---+----------+------------+--------+
|  1|    Corrie|Van den Oord|  Corrie|
|  2|  Nikolaus|     Brewitt|Nikolaus|
|  3|    Orelie|      Penney|  Orelie|
|  4|     Ashby|    Maddocks|   Ashby|
|  5|      Kurt|        Rome|    Kurt|
+---+----------+------------+--------+



* Add another column named `course_count` that contains the number of courses the user is enrolled in.

In [9]:
# Show each user's id next to the array of course ids.
(
    users_df
    .select('id', 'courses')
    .show()
)

+---+-------+
| id|courses|
+---+-------+
|  1| [1, 2]|
|  2|    [3]|
|  3| [2, 4]|
|  4|     []|
|  5|     []|
+---+-------+



In [10]:
# Inspect the (column name, Spark type) pairs of the projected columns.
(
    users_df
    .select('id', 'courses')
    .dtypes
)

[('id', 'bigint'), ('courses', 'array<bigint>')]

In [11]:
from pyspark.sql.functions import size

In [12]:
# Append `course_count`: the number of elements in each user's `courses` array.
(
    users_df
    .select('id', 'courses')
    .withColumn('course_count', size('courses'))
    .show()
)

+---+-------+------------+
| id|courses|course_count|
+---+-------+------------+
|  1| [1, 2]|           2|
|  2|    [3]|           1|
|  3| [2, 4]|           2|
|  4|     []|           0|
|  5|     []|           0|
+---+-------+------------+

