In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

In [14]:
"""
There are two ways to define a schema in Spark:
    1. Programatically
    2. DDL
"""

# 1. Programatically
schema1 = StructType([\
    StructField("author", StringType(), False),
    StructField("title", StringType(), False),
    StructField("pages", IntegerType(), False)\
    ])

# 2. DDL
schema2 = "author STRING, title STRING, pages INT"

In [16]:
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
        [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
        [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
        [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
        [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
        [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]]

data_schema = "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
data_df = spark.createDataFrame(data, schema = data_schema)

In [27]:
data_schema2 = StructType([\
    StructField("Id", IntegerType(), False),
    StructField("First_name", StringType(), False),
    StructField("Last_name", StringType(), False),
    StructField("web", StringType(), False),
    StructField("web", StringType(), False),
    StructField("Hits", IntegerType(), False),
    StructField("Campaigns", ArrayType(StringType(), False), False),
    ])

data_df = spark.createDataFrame(data, schema = data_schema2)
data_df.schema

StructType(List(StructField(Id,IntegerType,false),StructField(First_name,StringType,false),StructField(Last_name,StringType,false),StructField(web,StringType,false),StructField(web,StringType,false),StructField(Hits,IntegerType,false),StructField(Campaigns,ArrayType(StringType,false),false)))

In [28]:
# show
data_df.show()

+---+----------+---------+-----------------+---------+-----+--------------------+
| Id|First_name|Last_name|              web|      web| Hits|           Campaigns|
+---+----------+---------+-----------------+---------+-----+--------------------+
|  1|     Jules|    Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|    Brooke|    Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|     Denny|      Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4| Tathagata|      Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|     Matei|  Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|   Reynold|      Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+----------+---------+-----------------+---------+-----+--------------------+



In [29]:
# show schema
data_df.printSchema()

root
 |-- Id: integer (nullable = false)
 |-- First_name: string (nullable = false)
 |-- Last_name: string (nullable = false)
 |-- web: string (nullable = false)
 |-- web: string (nullable = false)
 |-- Hits: integer (nullable = false)
 |-- Campaigns: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [38]:
from pyspark.sql import *
from pyspark.sql.functions import *

In [71]:
data_schema2 = StructType([\
    StructField("Id", IntegerType(), False),
    StructField("First_name", StringType(), False),
    StructField("Last_name", StringType(), False),
    StructField("web", StringType(), False),
    StructField("web", StringType(), False),
    StructField("Hits", IntegerType(), False),
    StructField("Campaigns", ArrayType(StringType(), False), False),
    ])

data_df = spark.createDataFrame(data, schema = data_schema2)

In [55]:
# Operations over columns

# Use an expression to compute big hitters for blogs
# This adds a new column, Big Hitters, based on the conditional expression

#Concatenate three columns, create a new column, and show the
# newly created concatenated column

+---+----------+---------+-----------------+---------+-----+--------------------+---------+
| Id|First_name|Last_name|              web|      web| Hits|           Campaigns|double_Id|
+---+----------+---------+-----------------+---------+-----+--------------------+---------+
|  1|     Jules|    Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|    false|
|  2|    Brooke|    Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|    false|
|  3|     Denny|      Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|     true|
|  4| Tathagata|      Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|     true|
|  5|     Matei|  Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|     true|
|  6|   Reynold|      Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|     true|
+---+----------+---------+-----------------+---------+-----+--------------------+---------+



In [86]:
blog_row = [Row(6, "Reynold")]
data_schema = "`Id` INT, `First` STRING"
d = spark.createDataFrame(blog_row, schema=data_schema)

In [87]:
d.show()

+---+-------+
| Id|  First|
+---+-------+
|  6|Reynold|
+---+-------+

