In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the SparkSession used by every cell below (an already-running
# session is reused rather than replaced).
spark = (
    SparkSession.builder
    .appName('Struct Example')
    .getOrCreate()
)

In [3]:
# Programmatic schema, built from (column name, data type) pairs.
# Every column is declared with nullable=False.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ("Id", IntegerType()),
        ("First", StringType()),
        ("Last", StringType()),
        ("Url", StringType()),
        ("Published", StringType()),
        ("Hits", IntegerType()),
        ("Campaigns", ArrayType(StringType())),
    ]
])

# The same columns expressed as a DDL string. The names are lower-case here
# and no NOT NULL constraint is given for any column.
data_schema = "id INT, first STRING, last STRING, url STRING, published STRING, hits INT, campaigns Array<STRING>"

In [4]:
# Sample rows. Each inner list follows the schema's column order:
# id, first name, last name, url, published date (string), hits, campaigns.
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

In [5]:
# DataFrame built from the sample rows using the programmatic StructType schema.
df_schema = spark.createDataFrame(data=data, schema=schema)

In [6]:
# DataFrame built from the same sample rows using the DDL-string schema.
df_ddl = spark.createDataFrame(data=data, schema=data_schema)

In [7]:
# Print the column tree of the DataFrame created from the StructType schema.
df_schema.printSchema()

root
 |-- Id: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: integer (nullable = false)
 |-- Campaigns: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [8]:
# Print the column tree of the DataFrame created from the DDL-string schema.
df_ddl.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- url: string (nullable = true)
 |-- published: string (nullable = true)
 |-- hits: integer (nullable = true)
 |-- campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [9]:
# Register df_schema as the temporary view 'blogs' so it can be referenced
# by name in spark.sql() queries.
df_schema.createOrReplaceTempView('blogs')

In [10]:
# Query the 'blogs' view with SQL; show up to 10 rows without truncating cells.
spark.sql("SELECT * FROM blogs").show(n=10, truncate=False)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+



In [12]:
# Double the Hits column, written as a SQL expression string.
df_schema.selectExpr("Hits*2").show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [13]:
# Add the Hits and Id columns together using Column arithmetic.
df_schema.select(col("Hits") + col("Id")).show()

+-----------+
|(Hits + Id)|
+-----------+
|       4536|
|       8910|
|       7662|
|      10572|
|      40583|
|      25574|
+-----------+



In [15]:
# Append a boolean "Big Hitters" column that is true when Hits exceeds 10000.
(
    df_schema
    .withColumn("Big Hitters", col("Hits") > 10000)
    .show()
)

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [25]:
# Join First and Last with a single space into "Author Name",
# then drop the two source columns before displaying.
(
    df_schema
    .withColumn("Author Name", concat(col("First"), lit(" "), col("Last")))
    .drop("First", "Last")
    .show()
)

+---+-----------------+---------+-----+--------------------+-------------+
| Id|              Url|Published| Hits|           Campaigns|  Author Name|
+---+-----------------+---------+-----+--------------------+-------------+
|  1|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|  Jules Damji|
|  2|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]| Brooke Wenig|
|  3|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|    Denny Lee|
|  4|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|Tathagata Das|
|  5|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|Matei Zaharia|
|  6|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|  Reynold Xin|
+---+-----------------+---------+-----+--------------------+-------------+

