In [1]:
# Call findspark.init() before the first pyspark import (in the next cell);
# per the findspark docs, it makes the local Spark installation's pyspark importable.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the notebook's SparkSession.
# "local[*]" is the Spark master URL for running locally rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/14 15:29:34 WARN Utils: Your hostname, kirans-mac.local, resolves to a loopback address: 127.0.0.1; using 172.18.197.149 instead (on interface en0)
25/06/14 15:29:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/14 15:29:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/14 15:29:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Create DataFrame

In [3]:
import datetime
from pyspark.sql import Row

# Sample records, one dict per user. The fields deliberately cover several value
# kinds: int, str, bool, float, date, datetime, a list (courses) and a nested
# Row (phone_numbers). The third user has no amount_paid (None).
users = [
    {
        'id': 1,
        'first_name': 'kiran',
        'last_name': 'chinta',
        'email': 'k.c@gmail.com',
        'phone_numbers': Row(mobile=123, home=456),
        'courses': [1, 2],
        'is_customer': True,
        'amount_paid': 100.55,
        'customer_from': datetime.date(2012, 1, 1),
        'last_updated_ts': datetime.datetime(2021, 2, 2, 1, 2, 3),
    },
    {
        'id': 2,
        'first_name': 'goats',
        'last_name': 'manchi',
        'email': 'g.m@gmail.com',
        'phone_numbers': Row(mobile=246, home=680),
        'courses': [11, 2],
        'is_customer': False,
        'amount_paid': 110.55,
        'customer_from': datetime.date(2012, 1, 1),
        'last_updated_ts': datetime.datetime(2021, 2, 2, 1, 2, 3),
    },
    {
        'id': 3,
        'first_name': 'ss',
        'last_name': 'raj',
        'email': 's.s.r@gmail.com',
        'phone_numbers': Row(mobile=135, home=579),
        'courses': [11, 22],
        'is_customer': False,
        'amount_paid': None,
        'customer_from': datetime.date(2012, 2, 2),
        'last_updated_ts': datetime.datetime(2021, 4, 5, 1, 2, 3),
    },
]

In [4]:
# No schema is passed, so Spark infers one from the dicts in `users`.
# Note in the output below that the columns come out in alphabetical order of
# the dict keys, not in the order the keys were written.
u_df = spark.createDataFrame(users)
u_df.show()

+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|amount_paid| courses|customer_from|          email|first_name| id|is_customer|last_name|    last_updated_ts|phone_numbers|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|     100.55|  [1, 2]|   2012-01-01|  k.c@gmail.com|     kiran|  1|       true|   chinta|2021-02-02 01:02:03|   {123, 456}|
|     110.55| [11, 2]|   2012-01-01|  g.m@gmail.com|     goats|  2|      false|   manchi|2021-02-02 01:02:03|   {246, 680}|
|       NULL|[11, 22]|   2012-02-02|s.s.r@gmail.com|        ss|  3|      false|      raj|2021-04-05 01:02:03|   {135, 579}|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+



In [5]:
# Print the inferred schema: column names, types and nullability, including the
# nested array element type and the struct fields.
u_df.printSchema()

root
 |-- amount_paid: double (nullable = true)
 |-- courses: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_from: date (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- is_customer: boolean (nullable = true)
 |-- last_name: string (nullable = true)
 |-- last_updated_ts: timestamp (nullable = true)
 |-- phone_numbers: struct (nullable = true)
 |    |-- mobile: long (nullable = true)
 |    |-- home: long (nullable = true)



# Selecting Columns

In [6]:
# '*' selects every column of the DataFrame.
(
    u_df
    .select('*')
    .show()
)

+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|amount_paid| courses|customer_from|          email|first_name| id|is_customer|last_name|    last_updated_ts|phone_numbers|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|     100.55|  [1, 2]|   2012-01-01|  k.c@gmail.com|     kiran|  1|       true|   chinta|2021-02-02 01:02:03|   {123, 456}|
|     110.55| [11, 2]|   2012-01-01|  g.m@gmail.com|     goats|  2|      false|   manchi|2021-02-02 01:02:03|   {246, 680}|
|       NULL|[11, 22]|   2012-02-02|s.s.r@gmail.com|        ss|  3|      false|      raj|2021-04-05 01:02:03|   {135, 579}|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+



In [7]:
# Project a subset of columns by passing their names as plain strings.
(
    u_df
    .select('id', 'last_name', 'phone_numbers')
    .show()
)

+---+---------+-------------+
| id|last_name|phone_numbers|
+---+---------+-------------+
|  1|   chinta|   {123, 456}|
|  2|   manchi|   {246, 680}|
|  3|      raj|   {135, 579}|
+---+---------+-------------+



In [8]:
# selecting columns using col(): each column is referenced by name through col()
from pyspark.sql.functions import col

u_df.select(
    col('id'),
    col('last_name'),
    col('phone_numbers'),
).show()

+---+---------+-------------+
| id|last_name|phone_numbers|
+---+---------+-------------+
|  1|   chinta|   {123, 456}|
|  2|   manchi|   {246, 680}|
|  3|      raj|   {135, 579}|
+---+---------+-------------+



In [9]:
# selecting columns using df: index the DataFrame by column name (u_df['name'])

u_df.select(
    u_df['id'],
    u_df['last_name'],
    u_df['phone_numbers'],
).show()

+---+---------+-------------+
| id|last_name|phone_numbers|
+---+---------+-------------+
|  1|   chinta|   {123, 456}|
|  2|   manchi|   {246, 680}|
|  3|      raj|   {135, 579}|
+---+---------+-------------+



In [10]:
# concat, lit, alias: join first_name and last_name with a literal '#' separator
# and expose the result as a new column named full_name

from pyspark.sql.functions import col, concat, lit

u_df.select(
    col('id'),
    col('last_name'),
    col('phone_numbers'),
    concat(col('first_name'), lit('#'), col('last_name')).alias('full_name'),
).show()

+---+---------+-------------+------------+
| id|last_name|phone_numbers|   full_name|
+---+---------+-------------+------------+
|  1|   chinta|   {123, 456}|kiran#chinta|
|  2|   manchi|   {246, 680}|goats#manchi|
|  3|      raj|   {135, 579}|      ss#raj|
+---+---------+-------------+------------+



# Overview of selectExpr

In [11]:
# selectExpr takes its arguments as strings; '*' again means every column.
(
    u_df
    .selectExpr('*')
    .show()
)

+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|amount_paid| courses|customer_from|          email|first_name| id|is_customer|last_name|    last_updated_ts|phone_numbers|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+
|     100.55|  [1, 2]|   2012-01-01|  k.c@gmail.com|     kiran|  1|       true|   chinta|2021-02-02 01:02:03|   {123, 456}|
|     110.55| [11, 2]|   2012-01-01|  g.m@gmail.com|     goats|  2|      false|   manchi|2021-02-02 01:02:03|   {246, 680}|
|       NULL|[11, 22]|   2012-02-02|s.s.r@gmail.com|        ss|  3|      false|      raj|2021-04-05 01:02:03|   {135, 579}|
+-----------+--------+-------------+---------------+----------+---+-----------+---------+-------------------+-------------+



In [12]:
# Plain column names work as selectExpr arguments too.
(
    u_df
    .selectExpr('id', 'first_name', 'phone_numbers')
    .show()
)

+---+----------+-------------+
| id|first_name|phone_numbers|
+---+----------+-------------+
|  1|     kiran|   {123, 456}|
|  2|     goats|   {246, 680}|
|  3|        ss|   {135, 579}|
+---+----------+-------------+



In [13]:
# The concat(...) call is written inside the string passed to selectExpr.
# It carries no alias, so the output column is named after the expression itself.
u_df.selectExpr(
    'id',
    "concat(first_name,'-',last_name)",
    'phone_numbers',
).show()

+---+--------------------------------+-------------+
| id|concat(first_name, -, last_name)|phone_numbers|
+---+--------------------------------+-------------+
|  1|                    kiran-chinta|   {123, 456}|
|  2|                    goats-manchi|   {246, 680}|
|  3|                          ss-raj|   {135, 579}|
+---+--------------------------------+-------------+



# Stop Spark

In [14]:
# Stop the SparkSession created at the top of the notebook; cells that use
# `spark` or `u_df` must run before this one.
spark.stop()