In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
#Define a schema using DDL(Data definition language)
schema = "`ID` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT,`Campaigns` ARRAY<STRING>"

In [3]:
#Define a schema using Dataframe API
schema_api = StructType([StructField("ID", IntegerType(), False),
                             StructField("First", StringType(), False),
                             StructField('Last', StringType(), False),
                             StructField('Url', StringType(),False),
                             StructField('Published', StringType(),False),
                             StructField('Hits', IntegerType(),False),
                             StructField('Campaigns',ArrayType(StringType(),True),False)])

In [4]:
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter","LinkedIn"]],
[2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter",
"LinkedIn"]],
[3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web",
"twitter", "FB", "LinkedIn"]],
[4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,
["twitter", "FB"]],
[5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web",
"twitter", "FB", "LinkedIn"]],
[6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,
["twitter", "LinkedIn"]]
]
                                                                      

In [5]:
spark = (SparkSession.builder.appName('Example-3-6').getOrCreate())

In [6]:
blog_df = spark.createDataFrame(data, schema_api)
blog_df.show()
print(blog_df.printSchema())

+---+---------+-------+-----------------+---------+-----+--------------------+
| ID|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

root
 |-- ID: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: inte

In [8]:
from pyspark.sql.functions import *

In [9]:
blog_df.columns

['ID', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']

In [10]:
blog_df.select(expr("Hits*2")).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [11]:
#This a new column, BigHitter, base on conditional expression
blog_df.withColumn("Big Hitters", (expr("Hits > 10000"))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| ID|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [12]:
#Concatenate, show the new created concateneted column
new_df =(blog_df.withColumn('AuthorsId', (concat(expr('First'),expr('Last'), expr('ID')))))

In [37]:
#Sort by ID
blog_df.sort(col('ID').desc()).show(truncate=False)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|ID |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+



In [28]:
blog_df['Id']

Column<'Id'>

In [17]:
blog_df.write.format("com.databricks.spark.csv").option("header", "true").save("mydata.csv")

AnalysisException: CSV data source does not support array<string> data type.