# Hello PySpark Columns, Rows, and Expressions

In [1]:
# Prerequisites
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Obtain a SparkSession bound to a "local" master, reusing one if it already exists.
session_builder = SparkSession.builder.master("local")
spark = session_builder.getOrCreate()

# Report which Spark release the session is running on.
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


In [3]:
# Schema for the blog records: (column name, Spark data type) pairs in column order.
blog_fields = [
    ("Id", IntegerType()),
    ("First", StringType()),
    ("Last", StringType()),
    ("Url", StringType()),
    ("Published", StringType()),
    ("Hits", IntegerType()),
    ("Campaigns", ArrayType(StringType())),
]

# Every column is declared non-nullable (third StructField argument is False).
schema = StructType(
    [StructField(field_name, field_type, False) for field_name, field_type in blog_fields]
)

# Sample blog records, one list per row, in the order:
# Id, First, Last, Url, Published, Hits, Campaigns
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

In [4]:
# Materialise the sample rows as a DataFrame that follows the explicit schema,
# then print the resulting column types and nullability.
df_blogs = spark.createDataFrame(data=data, schema=schema)
df_blogs.printSchema()

root
 |-- Id: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: integer (nullable = false)
 |-- Campaigns: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [5]:
# Display the rows with truncate=False so long values (Url, Campaigns) are shown in full.
df_blogs.show(truncate=False)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+



### Using expressions on columns

In [7]:
# Form 1: expr() turns the column name into a Column; the doubling is done with Python's `*`.
hits_doubled_via_expr = expr("Hits") * 2
df_blogs.select("Last", hits_doubled_via_expr).show()

+-------+----------+
|   Last|(Hits * 2)|
+-------+----------+
|  Damji|      9070|
|  Wenig|     17816|
|    Lee|     15318|
|    Das|     21136|
|Zaharia|     81156|
|    Xin|     51136|
+-------+----------+



In [8]:
# Form 2: col() references the column; the doubling is again done with Python's `*`.
hits_doubled_via_col = col("Hits") * 2
df_blogs.select("Last", hits_doubled_via_col).show()

+-------+----------+
|   Last|(Hits * 2)|
+-------+----------+
|  Damji|      9070|
|  Wenig|     17816|
|    Lee|     15318|
|    Das|     21136|
|Zaharia|     81156|
|    Xin|     51136|
+-------+----------+



In [9]:
# Form 3: the whole arithmetic expression is written inside the expr() string.
hits_doubled_via_sql = expr("Hits * 2")
df_blogs.select("Last", hits_doubled_via_sql).show()

+-------+----------+
|   Last|(Hits * 2)|
+-------+----------+
|  Damji|      9070|
|  Wenig|     17816|
|    Lee|     15318|
|    Das|     21136|
|Zaharia|     81156|
|    Xin|     51136|
+-------+----------+



In [10]:
# Add a boolean "Big Hitters" column that is true when a post has more than 10000 hits.
is_big_hitter = expr("Hits > 10000")
df_blogs.withColumn("Big Hitters", is_big_hitter).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



### Concatenate Columns

In [11]:
# Build an author identifier by joining First, Last and Id with no separator.
authors_id = concat(df_blogs["First"], df_blogs["Last"], df_blogs["Id"])

# Attach it as "AuthorsId" and show it next to the original Id.
df_with_authors_id = df_blogs.withColumn("AuthorsId", authors_id)
df_with_authors_id.select("Id", "AuthorsId").show()

+---+-------------+
| Id|    AuthorsId|
+---+-------------+
|  1|  JulesDamji1|
|  2| BrookeWenig2|
|  3|    DennyLee3|
|  4|TathagataDas4|
|  5|MateiZaharia5|
|  6|  ReynoldXin6|
+---+-------------+



### Row Objects

In [15]:
# A Row is an ordered collection of fields; each field can be read by its position.
# NOTE(review): this Row is a standalone example and is not taken from df_blogs —
# its fifth and sixth fields (255568, "3/2/2015") differ in value and order from
# record 6 in `data` (Published "3/2/2015", Hits 25568). Confirm whether that is intended.
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", ["twitter", "LinkedIn"])
print("Row: ", blog_row)

# Walk the fields in order, printing each one next to its index.
for idx, value in enumerate(blog_row):
    print(f"Index [{idx}] = {value}")


Row:  <Row(6, 'Reynold', 'Xin', 'https://tinyurl.6', 255568, '3/2/2015', ['twitter', 'LinkedIn'])>
Index [0] = 6
Index [1] = Reynold
Index [2] = Xin
Index [3] = https://tinyurl.6
Index [4] = 255568
Index [5] = 3/2/2015
Index [6] = ['twitter', 'LinkedIn']


In [16]:
# Build a DataFrame straight from Row objects, supplying only the column names.
author_columns = ["Author", "State"]
rows = [
    Row("Bill Smith", "CA"),
    Row("John Scott", "WA"),
]
df_authors = spark.createDataFrame(rows, schema=author_columns)
df_authors.show()


+----------+-----+
|    Author|State|
+----------+-----+
|Bill Smith|   CA|
|John Scott|   WA|
+----------+-----+

