In [1]:
import findspark
# Run findspark's setup before the first pyspark import in the next cell.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Obtain the notebook's SparkSession: application name "MyApp", running
# against the local[*] master.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/14 15:32:20 WARN Utils: Your hostname, kirans-mac.local, resolves to a loopback address: 127.0.0.1; using 172.18.197.149 instead (on interface en0)
25/06/14 15:32:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/14 15:32:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/14 15:32:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Create DataFrame

In [3]:
import datetime
from pyspark.sql import Row

# Three sample user records. Each record mixes scalar fields with a nested
# Row (phone_numbers), a list (courses), a date and a timestamp; the third
# record has amount_paid set to None.
users = [
    {
        'id': 1,
        'first_name': 'kiran',
        'last_name': 'chinta',
        'email': 'k.c@gmail.com',
        'phone_numbers': Row(mobile=123, home=456),
        'courses': [1, 2],
        'is_customer': True,
        'amount_paid': 100.55,
        'customer_from': datetime.date(2012, 1, 1),
        'last_updated_ts': datetime.datetime(2021, 2, 2, 1, 2, 3),
    },
    {
        'id': 2,
        'first_name': 'goats',
        'last_name': 'manchi',
        'email': 'g.m@gmail.com',
        'phone_numbers': Row(mobile=246, home=680),
        'courses': [11, 2],
        'is_customer': False,
        'amount_paid': 110.55,
        'customer_from': datetime.date(2012, 1, 1),
        'last_updated_ts': datetime.datetime(2021, 2, 2, 1, 2, 3),
    },
    {
        'id': 3,
        'first_name': 'ss',
        'last_name': 'raj',
        'email': 's.s.r@gmail.com',
        'phone_numbers': Row(mobile=135, home=579),
        'courses': [11, 22],
        'is_customer': False,
        'amount_paid': None,
        'customer_from': datetime.date(2012, 2, 2),
        'last_updated_ts': datetime.datetime(2021, 4, 5, 1, 2, 3),
    },
]

In [4]:
# No schema is supplied, so Spark infers the column types from the Python
# values in each dict; the resulting columns are listed in alphabetical
# key order.
u_df = spark.createDataFrame(data=users)
u_df.show()

+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|amount_paid| courses|customer_from|          email|first_name| id|is_customer|last_name|    last_updated_ts|phone_numbers|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|     100.55|  [1, 2]|   2012-01-01|  k.c@gmail.com|     kiran|  1|       true|   chinta|2021-02-02 01:02:03|   {123, 456}|
|     110.55| [11, 2]|   2012-01-01|  g.m@gmail.com|     goats|  2|      false|   manchi|2021-02-02 01:02:03|   {246, 680}|
|       NULL|[11, 22]|   2012-02-02|s.s.r@gmail.com|        ss|  3|      false|      raj|2021-04-05 01:02:03|   {135, 579}|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+



In [5]:
# Print the inferred schema as a tree, including the element type of the
# courses array and the fields of the phone_numbers struct.
u_df.printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: long (nullable = true)
 |    |-- home: long (nullable = true)



# Renaming using withColumnRenamed

In [7]:
# Keep three columns, then rename them one at a time: each
# withColumnRenamed call swaps a single existing name for a new one.
(
    u_df
    .select('id', 'first_name', 'last_name')
    .withColumnRenamed('id', 'user_id')
    .withColumnRenamed('first_name', 'user_first_name')
    .withColumnRenamed('last_name', 'user_last_name')
    .show()
)

+-------+---------------+--------------+
|user_id|user_first_name|user_last_name|
+-------+---------------+--------------+
|      1|          kiran|        chinta|
|      2|          goats|        manchi|
|      3|             ss|           raj|
+-------+---------------+--------------+



# Renaming using alias

In [11]:
from pyspark.sql.functions import col, concat,lit

# Rename while selecting: alias() names each projected column, including
# the derived column that joins first and last name with a '-'.
(
    u_df
    .select(
        col('id').alias('user_id'),
        col('first_name').alias('user_first_name'),
        col('last_name').alias('user_last_name'),
        concat(col('first_name'), lit('-'), col('last_name')).alias('full-name'),
    )
    .show()
)

+-------+---------------+--------------+------------+
|user_id|user_first_name|user_last_name|   full-name|
+-------+---------------+--------------+------------+
|      1|          kiran|        chinta|kiran-chinta|
|      2|          goats|        manchi|goats-manchi|
|      3|             ss|           raj|      ss-raj|
+-------+---------------+--------------+------------+



# Renaming using withColumn (adds renamed copies; the original columns are kept)

In [14]:
# withColumn does not rename in place: each call appends a new column that
# copies an existing one, so id / first_name / last_name remain next to the
# user_* columns in the result.
(
    u_df
    .select('id', 'first_name', 'last_name')
    .withColumn('user_id', col('id'))
    .withColumn('user_first_name', col('first_name'))
    .withColumn('user_last_name', col('last_name'))
    .show()
)

+---+----------+---------+-------+---------------+--------------+
| id|first_name|last_name|user_id|user_first_name|user_last_name|
+---+----------+---------+-------+---------------+--------------+
|  1|     kiran|   chinta|      1|          kiran|        chinta|
|  2|     goats|   manchi|      2|          goats|        manchi|
|  3|        ss|      raj|      3|             ss|           raj|
+---+----------+---------+-------+---------------+--------------+



In [19]:
# withColumn for transformation

from pyspark.sql.functions import size

# Append a derived full_name column: first and last name joined by '-'.
(
    u_df
    .select('id', 'first_name', 'last_name', 'courses')
    .withColumn(
        'full_name',
        concat(col('first_name'), lit('-'), col('last_name')),
    )
    .show()
)

+---+----------+---------+--------+------------+
| id|first_name|last_name| courses|   full_name|
+---+----------+---------+--------+------------+
|  1|     kiran|   chinta|  [1, 2]|kiran-chinta|
|  2|     goats|   manchi| [11, 2]|goats-manchi|
|  3|        ss|      raj|[11, 22]|      ss-raj|
+---+----------+---------+--------+------------+



In [16]:
# withColumn for transformation

from pyspark.sql.functions import size

# Append course_count: the number of elements in each row's courses array.
(
    u_df
    .select('id', 'first_name', 'last_name', 'courses')
    .withColumn('course_count', size(col('courses')))
    .show()
)

+---+----------+---------+--------+------------+
| id|first_name|last_name| courses|course_count|
+---+----------+---------+--------+------------+
|  1|     kiran|   chinta|  [1, 2]|           2|
|  2|     goats|   manchi| [11, 2]|           2|
|  3|        ss|      raj|[11, 22]|           2|
+---+----------+---------+--------+------------+



# Rename all columns

In [21]:
# List the current column names, in DataFrame order.
u_df.columns

['amount_paid',
 'courses',
 'customer_from',
 'email',
 'first_name',
 'id',
 'is_customer',
 'last_name',
 'last_updated_ts',
 'phone_numbers']

In [22]:
# New column names: every existing column gets a 'user_' prefix, in the same
# order as u_df.columns. toDF() assigns names by position, so deriving the
# list from u_df.columns keeps names and columns aligned instead of relying
# on a hand-typed copy that has to be kept in sync with the DataFrame.
updated_cols = [f'user_{column}' for column in u_df.columns]

In [23]:
# Rename every column in one step: toDF takes the complete list of new
# names and applies them by position to the selected columns.
(
    u_df
    .select(u_df.columns)
    .toDF(*updated_cols)
    .show()
)

+----------------+------------+------------------+---------------+---------------+-------+----------------+--------------+--------------------+------------------+
|user_amount_paid|user_courses|user_customer_from|     user_email|user_first_name|user_id|user_is_customer|user_last_name|user_last_updated_ts|user_phone_numbers|
+----------------+------------+------------------+---------------+---------------+-------+----------------+--------------+--------------------+------------------+
|          100.55|      [1, 2]|        2012-01-01|  k.c@gmail.com|          kiran|      1|            true|        chinta| 2021-02-02 01:02:03|        {123, 456}|
|          110.55|     [11, 2]|        2012-01-01|  g.m@gmail.com|          goats|      2|           false|        manchi| 2021-02-02 01:02:03|        {246, 680}|
|            NULL|    [11, 22]|        2012-02-02|s.s.r@gmail.com|             ss|      3|           false|           raj| 2021-04-05 01:02:03|        {135, 579}|
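+----------------+------------+------------------+---------------+---------------+-------+----------------+--------------+--------------------+------------------+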

# Stop the Spark session

In [24]:
# Stop the SparkSession that was created near the top of the notebook.
spark.stop()