In [1]:
# Execute the earlier notebook in this session. The cells below use `users_df`,
# which is not defined in this notebook -- presumably it is created there (TODO confirm).
%run "./02 Creating Spark Data Frame to Select and Rename Columns.ipynb"

[Stage 1:>                                                          (0 + 1) / 1]

+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
| id|first_name|   last_name|               email|       phone_numbers|courses|is_customer|amount_paid|customer_from|    last_updated_ts|
+---+----------+------------+--------------------+--------------------+-------+-----------+-----------+-------------+-------------------+
|  1|    Corrie|Van den Oord|cvandenoord0@etsy...|{+1 234 567 8901,...| [1, 2]|       true|    1000.55|   2021-01-15|2021-02-10 01:15:00|
|  2|  Nikolaus|     Brewitt|nbrewitt1@dailyma...|{+1 234 567 8923,...|    [3]|       true|      900.0|   2021-02-14|2021-02-18 03:33:00|
|  3|    Orelie|      Penney|openney2@vistapri...|{+1 714 512 9752,...| [2, 4]|       true|     850.55|   2021-01-21|2021-03-15 15:16:55|
|  4|     Ashby|    Maddocks|  amaddocks3@home.pl|        {null, null}|     []|      false|        NaN|         null|2021-04-10 17:45:30|
|  5|      Kurt|        Rome|krome

                                                                                

* Concatenate `first_name` and `last_name`, separated by a comma and a space, to generate `full_name`

In [2]:
from pyspark.sql.functions import col, lit, concat

In [3]:
# Print the docstring for concat before using it in the next cell.
help(concat)

Help on function concat in module pyspark.sql.functions:

concat(*cols)
    Concatenates multiple input columns together into a single column.
    The function works with strings, binary and compatible array columns.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat(df.s, df.d).alias('s')).collect()
    [Row(s='abcd123')]
    
    >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])
    >>> df.select(concat(df.a, df.b, df.c).alias("arr")).collect()
    [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]



In [4]:
# Build "<first_name>, <last_name>" as a single Column expression.
# The separator is wrapped in lit() so it is passed as a constant value
# instead of being interpreted as a column name.
full_name_col = concat(
    col('first_name'),
    lit(', '),
    col('last_name'),
)

In [5]:
# Echo the Column object to see the expression it represents.
full_name_col

Column<'concat(first_name, , , last_name)'>

In [6]:
# Name the concatenated expression `full_name` so the selected column gets that header.
full_name_alias = full_name_col.alias('full_name')

In [7]:
# Confirm that the aliased expression is still a Column object.
type(full_name_alias)

pyspark.sql.column.Column

In [8]:
# Select the id together with the derived full_name column and print the rows.
users_df.select('id', full_name_alias).show()

+---+--------------------+
| id|           full_name|
+---+--------------------+
|  1|Corrie, Van den Oord|
|  2|   Nikolaus, Brewitt|
|  3|      Orelie, Penney|
|  4|     Ashby, Maddocks|
|  5|          Kurt, Rome|
+---+--------------------+



* Convert `customer_from` from a date to an integer in `yyyyMMdd` form (for example, `2021-01-15` becomes `20210115`)

In [9]:
# Look at customer_from as it is stored before converting it.
users_df.select('id', 'customer_from').show()

[Stage 4:>                                                          (0 + 1) / 1]

+---+-------------+
| id|customer_from|
+---+-------------+
|  1|   2021-01-15|
|  2|   2021-02-14|
|  3|   2021-01-21|
|  4|         null|
|  5|         null|
+---+-------------+



                                                                                

In [10]:
from pyspark.sql.functions import date_format

In [11]:
# Step 1: format customer_from with the pattern yyyyMMdd. This only builds a Column expression.
date_format('customer_from', 'yyyyMMdd')

Column<'date_format(customer_from, yyyyMMdd)'>

In [12]:
# Step 2: cast the formatted value to int.
date_format('customer_from', 'yyyyMMdd').cast('int')

Column<'CAST(date_format(customer_from, yyyyMMdd) AS INT)'>

In [13]:
# Step 3: alias the converted expression back to customer_from so the output column keeps that name.
date_format('customer_from', 'yyyyMMdd').cast('int').alias('customer_from')

Column<'CAST(date_format(customer_from, yyyyMMdd) AS INT) AS `customer_from`'>

In [14]:
# Keep the full conversion (format as yyyyMMdd -> cast to int -> alias back to
# customer_from) in one reusable Column expression.
customer_from_alias = (
    date_format('customer_from', 'yyyyMMdd')
    .cast('int')
    .alias('customer_from')
)

In [15]:
# Show id with the converted customer_from; rows whose date is null stay null.
users_df.select('id', customer_from_alias).show()

+---+-------------+
| id|customer_from|
+---+-------------+
|  1|     20210115|
|  2|     20210214|
|  3|     20210121|
|  4|         null|
|  5|         null|
+---+-------------+



In [16]:
# Check the resulting column types: customer_from should now be reported as int.
users_df.select('id', customer_from_alias).dtypes

[('id', 'bigint'), ('customer_from', 'int')]